cbeta 1.1.16 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta/html_to_pdf.rb +92 -0
- data/lib/cbeta/p5a_to_html.rb +5 -1
- data/lib/cbeta/p5a_to_html_for_pdf.rb +585 -0
- data/lib/cbeta.rb +4 -0
- data/lib/data/html-for-pdf.css +69 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 40faf97b4f5c3b0070145cefbdf86fe7eae39b72
|
4
|
+
data.tar.gz: ebe45fbfc2bf8eb3e025754234ec7e23f6107c49
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2d8bf5130fa58d65fe550c3855ba25b46714871e7e1abfe14d6b8546d3a2ca26e70ef1b23871cfc628b81f357fc83a84177a93a160ead4f6d75dd041c11ccd17
|
7
|
+
data.tar.gz: cc189891bd08585916c2fe6fb0c645d280558a1f214443a0116b5d9595707a6586323c37d5d886a25bbb3b83357e51fba71ccb8505ce053e543c1d7659225135
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'fileutils'
require 'wicked_pdf'
|
2
|
+
|
3
|
+
class CBETA::HTMLToPDF
  # @param input [String] folder of source HTML; the HTML can be produced by CBETA::P5aToHTMLForPDF.
  #   Expected layout: input/<canon>/<volume>/<work>/main.htm
  # @param output [String] output folder; PDFs are written to output/<canon>/<volume>/<work>.pdf
  def initialize(input, output)
    @input = input
    @output = output
  end

  # Convert CBETA HTML to PDF
  #
  # @example for convert Taisho (大正藏) Volume 1:
  #
  #   c = CBETA::HTMLToPDF.new('/PATH/TO/CBETA/HTML', '/OUTPUT/FOLDER')
  #   c.convert('T01')
  #
  # @example for convert all in Taisho (大正藏):
  #
  #   c = CBETA::HTMLToPDF.new('/PATH/TO/CBETA/HTML', '/OUTPUT/FOLDER')
  #   c.convert('T')
  #
  # @example for convert Taisho Vol. 5~7:
  #
  #   c = CBETA::HTMLToPDF.new('/PATH/TO/CBETA/HTML', '/OUTPUT/FOLDER')
  #   c.convert('T05..T07')
  #
  # T is the ID of the Taisho canon; for CBETA's canon ID scheme see:
  # http://www.cbeta.org/format/id.php
  #
  # @param target [String, nil] nil converts everything under the input folder;
  #   a single letter converts one canon; 'A..B' converts a volume range;
  #   anything else is treated as a single volume ID.
  # @raise [ArgumentError] when target contains '..' but is not of the form 'V1..V2'
  def convert(target=nil)
    return convert_all if target.nil?

    arg = target.upcase
    if arg.size == 1
      convert_collection(arg)
    elsif arg.include? '..'
      m = arg.match(/^([^\.]+?)\.\.([^\.]+)$/)
      # A malformed range used to be ignored silently; report it instead.
      raise ArgumentError, "invalid volume range: #{target}" if m.nil?
      convert_vols(m[1], m[2])
    else
      convert_vol(arg)
    end
  end

  # Convert every canon folder (single upper-case letter) found under the input folder.
  def convert_all
    Dir.entries(@input).sort.each do |c|
      next unless c.match(/^[A-Z]$/)
      convert_collection(c)
    end
  end

  # Convert every volume of one canon.
  #
  # @param c [String] canon ID, e.g. 'T'
  def convert_collection(c)
    @series = c
    puts 'handle_collection ' + c
    folder = File.join(@input, @series)
    Dir.entries(folder).sort.each do |vol|
      # skips '.', '..' and hidden files such as .DS_Store
      next if vol.start_with? '.'
      convert_vol(vol)
    end
  end

  # Former name of {#convert_collection}, kept so existing callers keep working.
  alias handle_collection convert_collection

  # Render one HTML file to one PDF file.
  #
  # @param html_fn [String] path of the source HTML file
  # @param pdf_fn [String] path of the PDF file to write
  def convert_file(html_fn, pdf_fn)
    puts "convert file: #{html_fn} to #{pdf_fn}"
    pdf = WickedPdf.new.pdf_from_html_file(html_fn)
    File.binwrite(pdf_fn, pdf)
  end

  # Convert every work folder of one volume.
  #
  # @param arg [String] volume ID, e.g. 'T01' (case-insensitive)
  def convert_vol(arg)
    vol = arg.upcase
    canon = vol[0]
    vol_folder = File.join(@input, canon, vol)

    output_folder = File.join(@output, canon, vol)
    FileUtils.mkdir_p(output_folder)

    Dir.entries(vol_folder).sort.each do |f|
      next if f.start_with? '.'
      src = File.join(vol_folder, f, 'main.htm')
      # A stray file or an incomplete work folder should not abort the whole volume.
      unless File.file?(src)
        warn "skip #{f}: #{src} not found"
        next
      end
      dest = File.join(output_folder, "#{f}.pdf")
      convert_file(src, dest)
    end
  end

  # Convert an inclusive range of volumes within one canon.
  # Volume IDs are compared as strings.
  #
  # @param v1 [String] first volume ID, e.g. 'T05'
  # @param v2 [String] last volume ID, e.g. 'T07'
  def convert_vols(v1, v2)
    puts "convert volumns: #{v1}..#{v2}"
    @series = v1[0]
    folder = File.join(@input, @series)
    Dir.entries(folder).sort.each do |vol|
      next if vol.start_with? '.'
      next if vol < v1
      next if vol > v2
      convert_vol(vol)
    end
  end

end
|
data/lib/cbeta/p5a_to_html.rb
CHANGED
@@ -498,7 +498,11 @@ class CBETA::P5aToHTML
|
|
498
498
|
end
|
499
499
|
|
500
500
|
def handle_p(e)
|
501
|
-
|
501
|
+
if e.key? 'type'
|
502
|
+
r = "<p class='%s'>" % e['type']
|
503
|
+
else
|
504
|
+
r = '<p>'
|
505
|
+
end
|
502
506
|
r += "<span class='lineInfo' line='#{@lb}'></span>"
|
503
507
|
r += traverse(e)
|
504
508
|
r + '</p>'
|
@@ -0,0 +1,585 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'date'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'json'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
# Convert CBETA XML P5a to HTML for PDF
|
9
|
+
#
|
10
|
+
# You can get CBETA XML P5a from: https://github.com/cbeta-git/xml-p5a
|
11
|
+
class CBETA::P5aToHTMLForPDF
  # Elements whose content is never output
  PASS=['back', 'teiHeader']

  # Placeholder used when an edition lacks a character
  MISSING = '-'

  private_constant :PASS, :MISSING

  # @param xml_root [String] path of the source CBETA XML P5a
  # @param out_root [String] path of the HTML output
  # @option opts [String] :graphic_base folder of graphics
  #   * graphic_base/figures: location of figure image files
  #   * graphic_base/sd-gif: images for Siddham (悉曇字)
  #   * graphic_base/rj-gif: images for Ranjana (蘭札體)
  def initialize(xml_root, out_root, opts={})
    @config = {}
    @config.merge!(opts)

    @xml_root = xml_root
    @out_root = out_root
    @cbeta = CBETA.new
    @gaijis = CBETA::Gaiji.new

    # handle_g checks a gaiji's 'unicode' value against this list before
    # emitting the Unicode character directly; without it @unicode1 is nil.
    # NOTE(review): assumes unicode-1.1.json is a JSON collection keyed by the
    # same code point notation as the gaiji 'unicode' field — confirm.
    fn = File.join(CBETA::DATA, 'unicode-1.1.json')
    @unicode1 = JSON.parse(File.read(fn))
  end

  # Convert CBETA XML P5a to HTML that is suitable for conversion to PDF
  #
  # @example for convert Taisho (大正藏) volume 1:
  #
  #   c = CBETA::P5aToHTMLForPDF.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
  #   c.convert('T01')
  #
  # @example for convert all of Taisho (大正藏):
  #
  #   c = CBETA::P5aToHTMLForPDF.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
  #   c.convert('T')
  #
  # @example for convert Taisho volumes 5 to 7:
  #
  #   c = CBETA::P5aToHTMLForPDF.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
  #   c.convert('T05..T07')
  #
  # T is the ID of the Taisho canon; for CBETA's canon ID scheme see:
  # http://www.cbeta.org/format/id.php
  #
  # @param target [String, nil] nil converts everything under xml_root;
  #   a single letter converts one canon; 'A..B' converts a volume range;
  #   anything else is treated as a single volume ID.
  # @raise [ArgumentError] when target contains '..' but is not of the form 'V1..V2'
  def convert(target=nil)
    return convert_all if target.nil?

    arg = target.upcase
    if arg.size == 1
      handle_collection(arg)
    elsif arg.include? '..'
      m = arg.match(/^([^\.]+?)\.\.([^\.]+)$/)
      # A malformed range used to be ignored silently; report it instead.
      raise ArgumentError, "invalid volume range: #{target}" if m.nil?
      handle_vols(m[1], m[2])
    else
      handle_vol(arg)
    end
  end

  private

  # Convert every canon folder (single upper-case letter) under xml_root.
  def convert_all
    Dir.entries(@xml_root).sort.each do |c|
      next unless c.match(/^[A-Z]$/)
      handle_collection(c)
    end
  end

  # Copy a gaiji GIF from graphic_base/<subdir>/<gid[3..4]>/ into the
  # current sutra's output folder and return the <img> tag referring to it.
  def copy_gaiji_image(subdir, gid)
    fn = "#{gid}.gif"
    src = File.join(@config[:graphic_base], subdir, gid[3..4], fn)
    dest = File.join(@output_folder_sutra, fn)
    FileUtils.copy(src, dest)
    "<img src='#{fn}'/>"
  end

  # <anchor>: only type="circle" produces output (a ◎ mark).
  def handle_anchor(e)
    return '◎' if e['type'] == 'circle'
    ''
  end

  # <app>: output the children (rdg is dropped and lem kept by handle_node).
  def handle_app(e)
    traverse(e)
  end

  def handle_byline(e)
    s = traverse(e)
    "<p class='byline'>#{s}</p>"
  end

  # <cell> becomes <td>; rows/cols attributes map to rowspan/colspan.
  def handle_cell(e)
    doc = Nokogiri::XML::Document.new
    cell = doc.create_element('td')
    cell['rowspan'] = e['rows'] if e.key? 'rows'
    cell['colspan'] = e['cols'] if e.key? 'cols'
    cell.inner_html = traverse(e)
    to_html(cell)
  end

  # Convert every volume folder of one canon.
  def handle_collection(c)
    @series = c
    puts 'handle_collection ' + c
    folder = File.join(@xml_root, @series)
    Dir.entries(folder).sort.each do |vol|
      # skips '.', '..' and hidden files such as .DS_Store
      next if vol.start_with? '.'
      handle_vol(vol)
    end
  end

  def handle_corr(e)
    traverse(e)
  end

  # A typed <div> is wrapped in an HTML <div> and tracked in @open_divs so
  # that handle_head can derive the heading level from the nesting depth.
  def handle_div(e)
    return traverse(e) unless e.has_attribute? 'type'

    @open_divs << e
    r = traverse(e)
    @open_divs.pop
    "<div>#{r}</div>"
  end

  def handle_figure(e)
    "<div class='figure'>%s</div>" % traverse(e)
  end

  # <g> (gaiji, a character outside the standard set).
  #
  # In 'txt' mode: the romanization for Siddham characters, otherwise the
  # composition expression (zzs); aborts when neither is available.
  #
  # In 'html' mode, in order of preference:
  # * Siddham (SD) / Ranjana (RJ): an <img> of the glyph, copied next to the HTML
  #   (SD-E35A / SD-E35B are rendered as parentheses)
  # * the Unicode character, when its code point is listed in @unicode1
  # * the normalized Unicode character
  # * the normalized form
  # * the composition expression
  def handle_g(e, mode)
    gid = e['ref'][1..-1]
    g = @gaijis[gid]
    abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
    zzs = g['zzs']

    if mode == 'txt'
      return g['roman'] if gid.start_with?('SD')
      abort "缺組字式:#{g}" if zzs.nil?
      return zzs
    end

    @char_count += 1

    if gid.start_with?('SD')
      case gid
      when 'SD-E35A'
        return '('
      when 'SD-E35B'
        return ')'
      else
        return copy_gaiji_image('sd-gif', gid)
      end
    end

    # The Ranjana image must be copied too, otherwise the <img> points at nothing.
    return copy_gaiji_image('rj-gif', gid) if gid.start_with?('RJ')

    if g.has_key?('unicode')
      if @unicode1.include?(g['unicode'])
        return g['unicode-char'] # use the Unicode character directly
      end
    end

    return g['normal_unicode'] if g.has_key?('normal_unicode')
    return g['normal'] if g.has_key?('normal')

    zzs
  end

  # <graphic>: copy the figure next to the HTML and reference it by base name.
  def handle_graphic(e)
    url = e['url']
    # keep only the part of the URL starting at 'figures/'
    url.sub!(/^.*(figures\/.*)$/, '\1')

    src = File.join(@config[:graphic_base], url)
    fn = File.basename(src)
    dest = File.join(@output_folder_sutra, fn)
    FileUtils.copy(src, dest)
    "<img src='#{fn}'/>"
  end

  # <head>: type="added" heads are dropped, list heads are output inline,
  # other heads become <p class='hN'> where N is the number of open typed divs.
  def handle_head(e)
    if e['type'] == 'added'
      ''
    elsif e.parent.name == 'list'
      traverse(e)
    else
      i = @open_divs.size
      "<p class='h#{i}'>%s</p>" % traverse(e)
    end
  end

  def handle_item(e)
    "<li>%s</li>\n" % traverse(e)
  end

  def handle_juan(e)
    "<p class='juan'>%s</p>" % traverse(e)
  end

  # <l> (verse line) becomes a div.lg-cell; the first cell written after a
  # line break also opens a div.lg-row (closed again by handle_lb / handle_lg).
  def handle_l(e)
    return traverse(e) if @lg_type == 'abnormal'

    @in_l = true

    doc = Nokogiri::XML::Document.new
    cell = doc.create_element('div')
    cell['class'] = 'lg-cell'
    cell.inner_html = traverse(e)

    if @first_l
      # the text-indent of the enclosing <lg> is applied to its first cell only
      parent = e.parent()
      if parent.has_attribute?('rend')
        indent = parent['rend'].scan(/text-indent:[^:]*/)
        cell['style'] = indent[0] unless indent.empty?
      end
      @first_l = false
    end
    r = to_html(cell)

    unless @lg_row_open
      r = "\n<div class='lg-row'>" + r
      @lg_row_open = true
    end
    @in_l = false
    r
  end

  # <lb> (line break): closes an open lg-row and flushes buffered <t> pairs.
  def handle_lb(e)
    # The Xuzangjing (卍續藏) carries both X and R lb elements; only the lb
    # whose edition matches the current series is processed.
    return '' if e['ed'] != @series

    r = ''
    if @lg_row_open && !@in_l
      # Each verse line is put in its own lg-row.
      # T46n1937, p. 914a01: an l contains an interlinear note spanning lines.
      # T20n1092, 337c16: an lb in the middle of an l does not end the lg-row.
      r += "</div><!-- end of lg-row -->"
      @lg_row_open = false
    end
    r += print_tt unless @t_buf1.empty? && @t_buf2.empty?
    r
  end

  def handle_lem(e)
    traverse(e)
  end

  # <lg> (verse group): type="abnormal" becomes a plain paragraph, anything
  # else a div.lg whose rows and cells are produced by handle_l / handle_lb.
  def handle_lg(e)
    @lg_type = e['type']
    if @lg_type == 'abnormal'
      return "<p class='lg-abnormal'>" + traverse(e) + "</p>"
    end

    @first_l = true
    doc = Nokogiri::XML::Document.new
    node = doc.create_element('div')
    node['class'] = 'lg'
    if e.has_attribute?('rend')
      # text-indent is moved to the first cell by handle_l
      rend = e['rend'].gsub(/text-indent:[^:]*/, '')
      node['style'] = rend
    end
    @lg_row_open = false
    node.inner_html = traverse(e)
    if @lg_row_open
      # the last lg-row is still open because no lb followed it
      node.inner_html += '</div><!-- end of lg -->'
      @lg_row_open = false
    end
    "\n" + to_html(node)
  end

  def handle_list(e)
    "<ul>%s</ul>" % traverse(e)
  end

  def handle_milestone(e)
    ''
  end

  def handle_mulu(e)
    ''
  end

  # Dispatch one XML node to its handler and return the resulting string.
  #
  # @param e [Nokogiri::XML::Node]
  # @param mode [String] 'html' or 'txt'; only text and <g> handling use it
  def handle_node(e, mode)
    return '' if e.comment?
    return handle_text(e, mode) if e.text?
    return '' if PASS.include?(e.name)

    case e.name
    when 'anchor'    then handle_anchor(e)
    when 'app'       then handle_app(e)
    when 'byline'    then handle_byline(e)
    when 'cell'      then handle_cell(e)
    when 'corr'      then handle_corr(e)
    when 'div'       then handle_div(e)
    when 'figure'    then handle_figure(e)
    when 'foreign'   then ''
    when 'g'         then handle_g(e, mode)
    when 'graphic'   then handle_graphic(e)
    when 'head'      then handle_head(e)
    when 'item'      then handle_item(e)
    when 'juan'      then handle_juan(e)
    when 'l'         then handle_l(e)
    when 'lb'        then handle_lb(e)
    when 'lem'       then handle_lem(e)
    when 'lg'        then handle_lg(e)
    when 'list'      then handle_list(e)
    when 'mulu'      then handle_mulu(e)
    when 'note'      then handle_note(e)
    when 'milestone' then handle_milestone(e)
    when 'p'         then handle_p(e)
    when 'rdg'       then ''
    when 'reg'       then ''
    when 'row'       then handle_row(e)
    when 'sic'       then ''
    when 'sg'        then handle_sg(e)
    when 't'         then handle_t(e)
    when 'tt'        then handle_tt(e)
    when 'table'     then handle_table(e)
    else traverse(e)
    end
  end

  # <note>: collation-type notes (and any type starting with 'cf') and notes
  # whose resp starts with 'CBETA' are dropped; inline and interlinear notes
  # are wrapped in parentheses.
  def handle_note(e)
    if e.has_attribute?('type')
      t = e['type']
      return '' if %w(equivalent orig orig_biao orig_ke mod rest).include? t
      return '' if t.start_with?('cf')
    end

    if e.has_attribute?('resp')
      return '' if e['resp'].start_with? 'CBETA'
    end

    r = traverse(e)
    r = "(#{r})" if %w(inline interlinear).include?(e['place'])
    r
  end

  def handle_p(e)
    "<p>%s</p>\n" % traverse(e)
  end

  def handle_row(e)
    "<tr>" + traverse(e) + "</tr>\n"
  end

  def handle_sg(e)
    '(' + traverse(e) + ')'
  end

  # Convert one XML file into <out_folder>/<work id>/main.htm, together with
  # the stylesheet and any images the text refers to.
  #
  # @param xml_fn [String] path of the source XML file
  def handle_sutra(xml_fn)
    puts "convert sutra #{xml_fn}"
    # per-sutra state
    @back = { 0 => '' }
    @char_count = 1
    @dila_note = 0
    @div_count = 0
    @in_l = false
    @juan = 0
    @lg_row_open = false
    @t_buf1 = []
    @t_buf2 = []
    @open_divs = []
    @sutra_no = File.basename(xml_fn, ".xml")

    @output_folder_sutra = File.join(@out_folder, @sutra_no)
    FileUtils.mkdir_p(@output_folder_sutra) unless Dir.exist? @output_folder_sutra

    src = File.join(CBETA::DATA, 'html-for-pdf.css')
    dest = File.join(@output_folder_sutra, 'html-for-pdf.css')
    FileUtils.copy(src, dest)

    text = parse_xml(xml_fn)
    text = "
<html>
<head>
<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />
<link rel=stylesheet type='text/css' href='html-for-pdf.css'>
</head>
<body>#{text}</body>
</html>"

    fn = File.join(@output_folder_sutra, 'main.htm')
    File.write(fn, text)
  end

  # <t>: inside a <tt> that is not type="app", the first and second <t> are
  # buffered so print_tt can lay them out as two parallel table rows.
  def handle_t(e)
    if e.has_attribute? 'place'
      return '' if e['place'].include? 'foot'
    end
    r = traverse(e)

    # <tt type="app"> is not a Siddham/Chinese two-line parallel text
    return r if @tt_type == 'app'

    # two-line parallel text
    i = e.xpath('../t').index(e)
    case i
    when 0
      @t_buf1 << r
    when 1
      @t_buf2 << r
    else
      return r
    end
    ''
  end

  def handle_tt(e)
    @tt_type = e['type']
    traverse(e)
  end

  def handle_table(e)
    "<table>" + traverse(e) + "</table>\n"
  end

  # Text node: dropped directly under <app>; otherwise newlines are removed
  # and the text is HTML-escaped.
  def handle_text(e, mode)
    s = e.content().chomp
    return '' if s.empty?
    return '' if e.parent.name == 'app'

    # CBETA XML has superfluous newlines between pieces of text
    r = s.gsub(/[\n\r]/, '')

    # escape HTML special characters, e.g. & becomes &amp;
    CGI.escapeHTML(r)
  end

  # Convert every XML file of one volume. The volume's output folder is
  # removed and recreated first.
  #
  # @param vol [String] volume ID, e.g. 'T01'
  def handle_vol(vol)
    puts "convert volumn: #{vol}"

    @orig = @cbeta.get_canon_abbr(vol[0])
    abort "未處理底本" if @orig.nil?

    @vol = vol
    @series = vol[0]
    @out_folder = File.join(@out_root, @series, vol)
    # second argument is force: ignore errors such as a missing folder
    FileUtils.remove_dir(@out_folder, true)
    FileUtils::mkdir_p @out_folder

    source = File.join(@xml_root, @series, vol)
    Dir.entries(source).sort.each { |f|
      next if f.start_with? '.'
      path = File.join(source, f)
      handle_sutra(path)
    }
  end

  # Convert an inclusive range of volumes within one canon.
  # Volume IDs are compared as strings.
  def handle_vols(v1, v2)
    puts "convert volumns: #{v1}..#{v2}"
    @series = v1[0]
    folder = File.join(@xml_root, @series)
    Dir.entries(folder).sort.each do |vol|
      next if vol.start_with? '.'
      next if vol < v1
      next if vol > v2
      handle_vol(vol)
    end
  end

  # Read and parse one XML file; namespaces are removed so that the XPath
  # expressions used in this class can stay unprefixed.
  def open_xml(fn)
    s = File.read(fn)

    if fn.include? 'T16n0657'
      # Here an interlinear note spans two verse lines.
      # Move the lb in front of the end of the note, so that the lg-row is
      # closed first and the note afterwards.
      s.sub!(/(<\/note>)(\n<lb n="0206b29" ed="T"\/>)/, '\2\1')
    end

    doc = Nokogiri::XML(s)
    doc.remove_namespaces!()
    doc
  end

  # Parse one XML file, record title, edition date and contributors in
  # instance variables, and return the HTML of the text body.
  def parse_xml(xml_fn)
    @pass = [false]

    doc = open_xml(xml_fn)

    e = doc.xpath("//titleStmt/title")[0]
    @title = traverse(e, 'txt')
    # keep only the last whitespace-separated token of the title
    @title = @title.split()[-1]

    e = doc.at_xpath("//editionStmt/edition/date")
    abort "找不到版本日期" if e.nil?
    @edition_date = e.text.sub(/\$Date: (.*?) \$$/, '\1')

    e = doc.at_xpath("//projectDesc/p[@lang='zh']")
    abort "找不到貢獻者" if e.nil?
    @contributors = e.text

    root = doc.root()
    body = root.xpath("text/body")[0]
    @pass = [true]

    traverse(body)
  end

  # Flush the buffered <t> pairs as a two-row table (first <t> of each pair
  # in the first row, second <t> in the second row) and reset the buffers.
  def print_tt
    r = "<table class='tt'>\n"

    [@t_buf1, @t_buf2].each do |buf|
      r += "<tr>\n"
      buf.each { |s| r += "<td>#{s}</td>" }
      r += "</tr>\n"
    end

    @t_buf1 = []
    @t_buf2 = []

    # close the table (an opening tag here would leave the table unclosed)
    r + "</table>\n"
  end

  # Serialize a Nokogiri node as UTF-8 XML.
  def to_html(e)
    e.to_xml(encoding: 'UTF-8', :save_with => Nokogiri::XML::Node::SaveOptions::AS_XML)
  end

  # Concatenate the output of handle_node for every child of e.
  #
  # @param e [Nokogiri::XML::Node]
  # @param mode [String] 'html' (default) or 'txt'
  def traverse(e, mode='html')
    r = ''
    e.children.each { |c|
      s = handle_node(c, mode)
      r += s
    }
    r
  end

end
|
data/lib/cbeta.rb
CHANGED
@@ -6,6 +6,8 @@
|
|
6
6
|
require 'csv'
|
7
7
|
|
8
8
|
class CBETA
|
9
|
+
DATA = File.join(File.dirname(__FILE__), 'data')
|
10
|
+
|
9
11
|
# 將行首資訊轉為引用格式
|
10
12
|
#
|
11
13
|
# @param linehead [String] 行首資訊, 例如:T85n2838_p1291a03
|
@@ -102,9 +104,11 @@ end
|
|
102
104
|
|
103
105
|
require 'cbeta/gaiji'
|
104
106
|
require 'cbeta/bm_to_text'
|
107
|
+
require 'cbeta/html_to_pdf'
|
105
108
|
require 'cbeta/p5a_to_epub'
|
106
109
|
require 'cbeta/p5a_to_html'
|
107
110
|
require 'cbeta/p5a_to_html_for_every_edition'
|
111
|
+
require 'cbeta/p5a_to_html_for_pdf'
|
108
112
|
require 'cbeta/p5a_to_simple_html'
|
109
113
|
require 'cbeta/p5a_to_text'
|
110
114
|
require 'cbeta/p5a_validator'
|
@@ -0,0 +1,69 @@
|
|
1
|
+
div.lg {
|
2
|
+
display: table;
|
3
|
+
}
|
4
|
+
div.lg-cell {
|
5
|
+
display: table-cell;
|
6
|
+
}
|
7
|
+
div.lg-row {
|
8
|
+
display: table-row;
|
9
|
+
}
|
10
|
+
div.p {
|
11
|
+
margin-bottom: 20px;
|
12
|
+
line-height: 1.4;
|
13
|
+
text-indent: 2em;
|
14
|
+
}
|
15
|
+
p.byline {
|
16
|
+
text-align: right;
|
17
|
+
}
|
18
|
+
p.h1 {
|
19
|
+
text-indent: 2em;
|
20
|
+
font-weight: bold;
|
21
|
+
}
|
22
|
+
p.h2 {
|
23
|
+
text-indent: 3em;
|
24
|
+
font-weight: bold;
|
25
|
+
}
|
26
|
+
p.h3 {
|
27
|
+
text-indent: 4em;
|
28
|
+
font-weight: bold;
|
29
|
+
}
|
30
|
+
p.h4 {
|
31
|
+
text-indent: 2em;
|
32
|
+
font-weight: bold;
|
33
|
+
}
|
34
|
+
p.h5 {
|
35
|
+
text-indent: 3em;
|
36
|
+
font-weight: bold;
|
37
|
+
}
|
38
|
+
p.h6 {
|
39
|
+
text-indent: 4em;
|
40
|
+
font-weight: bold;
|
41
|
+
}
|
42
|
+
p.h7 {
|
43
|
+
text-indent: 2em;
|
44
|
+
font-weight: bold;
|
45
|
+
}
|
46
|
+
p.h8 {
|
47
|
+
text-indent: 2em;
|
48
|
+
font-weight: bold;
|
49
|
+
}
|
50
|
+
span.corr {
|
51
|
+
color: red;
|
52
|
+
}
|
53
|
+
table {
|
54
|
+
border-collapse: collapse;
|
55
|
+
}
|
56
|
+
table.tt, table.tt tbody, table.tt tbody tr, table.tt tbody tr td {
|
57
|
+
border: none;
|
58
|
+
}
|
59
|
+
th, td {
|
60
|
+
border: solid;
|
61
|
+
border-width: 1px;
|
62
|
+
padding: 5px;
|
63
|
+
word-wrap: break-word;
|
64
|
+
word-break: break-all;
|
65
|
+
text-indent: 0;
|
66
|
+
}
|
67
|
+
ul.simple {
|
68
|
+
list-style-type: none;
|
69
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|
@@ -19,10 +19,12 @@ files:
|
|
19
19
|
- lib/cbeta.rb
|
20
20
|
- lib/cbeta/bm_to_text.rb
|
21
21
|
- lib/cbeta/gaiji.rb
|
22
|
+
- lib/cbeta/html_to_pdf.rb
|
22
23
|
- lib/cbeta/html_to_text.rb
|
23
24
|
- lib/cbeta/p5a_to_epub.rb
|
24
25
|
- lib/cbeta/p5a_to_html.rb
|
25
26
|
- lib/cbeta/p5a_to_html_for_every_edition.rb
|
27
|
+
- lib/cbeta/p5a_to_html_for_pdf.rb
|
26
28
|
- lib/cbeta/p5a_to_simple_html.rb
|
27
29
|
- lib/cbeta/p5a_to_text.rb
|
28
30
|
- lib/cbeta/p5a_validator.rb
|
@@ -31,6 +33,7 @@ files:
|
|
31
33
|
- lib/data/epub-nav.xhtml
|
32
34
|
- lib/data/epub.css
|
33
35
|
- lib/data/gaiji.json
|
36
|
+
- lib/data/html-for-pdf.css
|
34
37
|
- lib/data/unicode-1.1.json
|
35
38
|
homepage: https://github.com/RayCHOU/ruby-cbeta
|
36
39
|
licenses:
|