cbeta 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,39 @@
1
+ require 'json'
2
+
3
+ # 存取 CBETA 缺字資料庫
4
+ class CBETA::Gaiji
5
+ # 載入 CBETA 缺字資料庫
6
# Loads the bundled CBETA gaiji (missing-character) database.
#
# Reads +gaiji.json+ from the directory that contains this source file and
# stores the parsed JSON in +@gaijis+. The file is read explicitly as UTF-8
# so that loading does not depend on the process's default external encoding.
def initialize
  fn = File.join(File.dirname(__FILE__), 'gaiji.json')
  @gaijis = JSON.parse(File.read(fn, encoding: 'UTF-8'))
end
10
+
11
+ # 傳入缺字 CB 碼,傳回 hash 缺字資訊
12
+ #
13
+ # 例如:
14
+ #
15
+ # g = CBETA::Gaiji.new
16
+ # g["CB01002"]
17
+ #
18
+ # 回傳:
19
+ # {
20
+ # "zzs": "[得-彳]",
21
+ # "unicode": "3775",
22
+ # "unicode-char": "㝵",
23
+ # "zhuyin": [ "ㄉㄜˊ", "ㄞˋ" ]
24
+ # }
25
# Looks up one gaiji entry by its CB code.
#
# cb:: key into the loaded gaiji table, e.g. "CB01002"
#
# Returns the value stored under +cb+, or nil when the key is absent.
def [](cb)
  @gaijis.fetch(cb, nil)
end
28
+
29
+ # 傳入缺字 CB 碼,傳回注音 array
30
+ #
31
+ # 例如:
32
+ #
33
+ # g = CBETA::Gaiji.new
34
+ # g.zhuyin("CB00023") # return [ "ㄍㄢˇ", "ㄍㄢ", "ㄧㄤˊ", "ㄇㄧˇ", "ㄇㄧㄝ", "ㄒㄧㄤˊ" ]
35
# Returns the 'zhuyin' value of the gaiji entry stored under +cb+.
#
# cb:: key into the loaded gaiji table, e.g. "CB00023"
#
# Returns nil when the table has no such key.
def zhuyin(cb)
  if @gaijis.key?(cb)
    @gaijis[cb]['zhuyin']
  end
end
39
+ end
@@ -0,0 +1,151 @@
1
+ require 'fileutils'
2
+ require 'nokogiri'
3
+
4
+ # 將 CBETA HTML 轉為 純文字(含行首資訊)
5
+ #
6
+ # example:
7
+ #
8
+ # h2t = CBETA::HTMLToText.new('/temp/cbeta-html', '/temp/cbeta-text')
9
+ # h2t.convert("T01") # 轉換大正藏第一冊
10
+ class CBETA::HTMLToText
11
+ # html_root:: 來源 HTML 路徑
12
+ # out_root:: 輸出路徑
13
# Remembers where the source HTML lives and where text output should go.
#
# html_root:: root folder of the source HTML
# out_root::  root folder for the generated text files
def initialize(html_root, out_root)
  @html_root, @out_root = html_root, out_root
end
17
+
18
+ # 例如執行大正藏第一冊:
19
+ #
20
+ # convert("T01")
21
# Converts one volume, e.g. convert("T01").
#
# arg:: volume id; it is upper-cased before use
#
# Stores the volume in +@vol+, its first character in +@corpus+, clears the
# +@dirty+ flag and then delegates to +handle_vol+, whose result is returned.
def convert(arg)
  @vol = arg.upcase
  @corpus = @vol[0]
  @dirty = false
  handle_vol
end
27
+
28
+ private
29
+
30
# Concatenates the text produced by +handle_node+ for every child of +e+,
# then removes all ASCII spaces from the result.
def traverse(e)
  joined = e.children.inject('') { |acc, child| acc + handle_node(child) }
  joined.gsub(' ', '')
end
37
+
38
# Returns the content of text node +e+ with the trailing line break chomped
# and every newline and listed punctuation mark removed.
# Returns '' when the chomped content is empty.
def handle_text(e)
  text = e.content.chomp
  text.empty? ? '' : text.gsub(/[\n,、—!。:「]/, '')
end
43
+
44
# Converts a <span> element to text according to its class attribute.
#
# * 'doube-line-note' - traversed content, wrapped in parentheses unless it
#   already starts with '('
# * 'lb'              - line head: the span id followed by '║'; every line
#   head except the first one since +@dirty+ was cleared is preceded by a
#   newline. Ids of the form T0?n0220<letter>... lose that letter.
# * 'lineInfo', 'star' - nothing
# * 'ranja', 'siddam'  - the placeholder '【◇】'
# * anything else      - the traversed content
def handle_span(e)
  case e['class']
  when 'doube-line-note'
    inner = traverse(e)
    inner.start_with?('(') ? inner : "(#{inner})"
  when 'lb'
    prefix = @dirty ? "\n" : ''
    @dirty = true
    line_id = e['id'].sub(/^(T0\dn0220)[a-z](.*)$/, '\1\2')
    prefix + line_id + '║'
  when 'lineInfo', 'star'
    ''
  when 'ranja', 'siddam'
    '【◇】'
  else
    traverse(e)
  end
end
72
+
73
# Converts a single HTML node to text.
#
# * comments produce nothing; text nodes go through +handle_text+
# * <a class="gaijiAnchor"> is replaced by the entry recorded in +@gaiji+
#   under the anchor's href (minus its first character)
# * <div id="back"> and <head> are dropped
# * <p class="figure"> becomes '【圖】'
# * <span> is delegated to +handle_span+
# * everything else is traversed
#
# Always returns a String. When a gaiji anchor has no href or no matching
# +@gaiji+ entry, the anchor's own content is used instead of returning nil,
# which would make the caller's string concatenation raise TypeError.
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  case e.name
  when 'a'
    if e['class'] == 'gaijiAnchor'
      id = e['href'].to_s[1..-1]
      @gaiji[id] || traverse(e)
    else
      traverse(e)
    end
  when 'div'
    e['id'] == 'back' ? '' : traverse(e)
  when 'head'
    ''
  when 'p'
    e['class'] == 'figure' ? '【圖】' : traverse(e)
  when 'span'
    handle_span(e)
  else
    traverse(e)
  end
end
103
+
104
# Recreates the output folder <out_root>/<corpus>/<vol> from scratch:
# any existing folder is removed (errors ignored) and a fresh one is created.
#
# Returns the folder path.
def prepare_folder
  File.join(@out_root, @corpus, @vol).tap do |dir|
    FileUtils.remove_dir(dir, true)
    FileUtils.mkdir_p(dir)
  end
end
110
+
111
# Converts one HTML file and appends the resulting text to the output file
# of the sutra it belongs to.
#
# The sutra id is the file's base name up to its last '_'; any T??n0220
# suffix (a, b, c ...) is dropped so all parts share one output file.
# When the sutra differs from the previous call, a new <sutra>.txt is opened
# in +@folder_out+. The previously opened output file is closed first so
# that handles do not pile up and buffered text is flushed.
#
# path:: path of the source HTML file
def handle_file(path)
  sutra = File.basename(path, '.*')
  sutra.sub!(/^(.*)_.*$/, '\1')
  sutra.sub!(/(T\d\dn0220).*$/, '\1') # T0220 is not split into a, b, c...

  if sutra != @last_sutra
    @fo.close if @fo && !@fo.closed?
    txt_path = File.join(@folder_out, sutra + '.txt')
    puts "h2t #{txt_path}"
    @fo = File.open(txt_path, 'w')
    @last_sutra = sutra
    @dirty = false
  end

  # Block form guarantees the input handle is released even if parsing fails.
  doc = File.open(path) { |f| Nokogiri::HTML(f) }

  # Collect the composed-glyph expression of every gaiji described in the file.
  @gaiji = {}
  doc.css('span.gaijiInfo').each { |e| @gaiji[e['id']] = e['zzs'] }

  text = traverse(doc.root)

  # Collapse runs of Siddham placeholders into a single one.
  text.gsub!(/(\((【◇】)+\)|(【◇】)|【◇】)+/, '【◇】')

  @fo.write(text)
end
141
+
142
# Converts every file of the current volume (<html_root>/<corpus>/<vol>).
#
# Files are processed in sorted order: +handle_file+ appends consecutive
# files of one sutra to the same output, so the order must be deterministic
# (Dir[] ordering is not guaranteed on every Ruby version).
# The last output file opened by +handle_file+ is closed when the volume is
# done, even if a file fails to convert.
def handle_vol
  folder_in = File.join(@html_root, @corpus, @vol)
  @folder_out = prepare_folder
  @last_sutra = ''
  begin
    Dir["#{folder_in}/*"].sort.each { |f| handle_file(f) }
  ensure
    @fo.close if @fo && !@fo.closed?
  end
end
150
+
151
+ end
@@ -0,0 +1,664 @@
1
+ require 'cgi'
2
+ require 'date'
3
+ require 'fileutils'
4
+ require 'json'
5
+ require 'nokogiri'
6
+ require 'set'
7
+
8
# Names of the elements whose content is never written to the output.
PASS = ['back', 'teiHeader'].freeze

# Placeholder used when an edition has no text at the position in question.
MISSING = '-'.freeze
13
+
14
+ # 處理 CBETA XML P5a
15
+ #
16
+ # CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
17
+ #
18
+ # 轉檔規則請參考: http://wiki.ddbc.edu.tw/pages/CBETA_XML_P5a_轉_HTML
19
+ class CBETA::P5aToHTML
20
+
21
+ # xml_root:: 來源 CBETA XML P5a 路徑
22
+ # out_root:: 輸出 HTML 路徑
23
# Sets up a converter.
#
# xml_root:: root folder of the source CBETA XML P5a files
# out_root:: root folder for the generated HTML
#
# Also creates a CBETA::Gaiji instance and loads the 'unicode-1.1.json'
# table that sits next to this source file into +@unicode1+.
def initialize(xml_root, out_root)
  @xml_root, @out_root = xml_root, out_root
  @gaijis = CBETA::Gaiji.new

  table_path = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(table_path))
end
33
+
34
+ # 將 CBETA XML P5a 轉為 HTML
35
+ #
36
+ # 例如 轉出大正藏第一冊
37
+ #
38
+ # x2h = CBETA::P5aToHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
39
+ # x2h.convert('T01')
40
+ #
41
+ # 例如 轉出大正藏全部
42
+ #
43
+ # x2h = CBETA::P5aToHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
44
+ # x2h.convert('T')
45
+ #
46
+ # T 是大正藏的 ID, CBETA 的藏經 ID 系統請參考: http://www.cbeta.org/format/id.php
47
# Converts CBETA XML P5a to HTML.
#
# arg:: what to convert (case-insensitive):
#       * nil            - every collection found under the XML root
#       * one letter     - a whole collection, e.g. 'T'
#       * 'T01..T03'     - a range of volumes
#       * anything else  - a single volume, e.g. 'T01'
#
# The argument is upper-cased into a copy, so the caller's string is left
# untouched and frozen strings are accepted. A range that does not have the
# form A..B is ignored.
def convert(arg = nil)
  return convert_all if arg.nil?

  target = arg.upcase
  if target.size == 1
    handle_collection(target)
  elsif target.include?('..')
    target.match(/^([^\.]+?)\.\.([^\.]+)$/) { |m| handle_vols(m[1], m[2]) }
  else
    handle_vol(target)
  end
end
63
+
64
+ private
65
+
66
# Converts every collection under the XML root, i.e. every directory entry
# whose name is a single upper-case ASCII letter.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
72
+
73
# Converts an <anchor> element.
#
# * id starting with 'nkr_note_orig': the note registered under that id in
#   +@notes+ is rendered into the back matter of the current juan and a
#   note anchor pointing at it is returned
# * id starting with 'fx': a '[*]' star marker
# * type 'circle': the character '◎'
# * anything else: ''
#
# NOTE(review): +@notes+ is not assigned anywhere in the code visible here.
# When it is unset, or has no entry for the id, the anchor is skipped
# instead of raising NoMethodError.
def handle_anchor(e)
  if e.has_attribute?('id')
    id = e['id']
    if id.start_with?('nkr_note_orig')
      note = @notes && @notes[id]
      unless note.nil?
        n = id[/^nkr_note_orig_(.*)$/, 1]
        # Render first: the traversal may itself append to the back matter.
        note_text = traverse(note)
        @back[@juan] += "<span class='footnote' id='n#{n}'>#{note_text}</span>\n"
        return "<a class='noteAnchor' href='#n#{n}'></a>"
      end
    elsif id.start_with?('fx')
      return "<span class='star'>[*]</span>"
    end
  end

  return '◎' if e.has_attribute?('type') && e['type'] == 'circle'

  ''
end
95
+
96
# Converts an <app> element: its traversed content, preceded by a star note
# anchor when type is 'star'. The anchor target is the corresp attribute
# without its first character.
def handle_app(e)
  anchor =
    if e['type'] == 'star'
      target = e['corresp'][1..-1]
      "<a class='noteAnchor star' href='#n#{target}'></a>"
    else
      ''
    end
  anchor + traverse(e)
end
104
+
105
# Converts a <byline> into <p class="byline">. The paragraph starts with a
# lineInfo span holding the line number that was current before the
# content was traversed.
def handle_byline(e)
  line_info = "<span class='lineInfo'>#{@lb}</span>"
  '<p class="byline">' + line_info + traverse(e) + '</p>'
end
111
+
112
# Converts every volume of collection +c+, i.e. every entry of
# <xml_root>/<c> except '.', '..' and '.DS_Store'.
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c
  skipped = ['.', '..', '.DS_Store']
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless skipped.include?(vol)
  end
end
121
+
122
# Converts a <corr> element into <span class='cbeta'>.
#
# When the <corr> sits in a <choice> that also has a <sic>, a DILA note
# anchor is emitted in front of the span and a footnote holding the base
# edition siglum plus the <sic> text (or MISSING when that is empty) is
# appended to the back matter of the current juan.
#
# The note number is captured right after it is allocated, so the anchor and
# its footnote keep the same id even if rendering the <sic> content
# allocates further notes.
def handle_corr(e)
  anchor = ''
  sic = e.parent.name == 'choice' ? e.parent.at_xpath('sic') : nil
  unless sic.nil?
    @dila_note += 1
    note_no = @dila_note
    anchor = "<a class='noteAnchor dila' href='#dila_note#{note_no}'></a>"

    sic_text = traverse(sic, 'back')
    note = @orig + (sic_text.empty? ? MISSING : sic_text)
    @back[@juan] += "<span class='footnote_dila' id='dila_note#{note_no}'>#{note}</span>\n"
  end
  anchor + "<span class='cbeta'>%s</span>" % traverse(e)
end
142
+
143
# Converts a <div>. Every div gets a running number. A div with a type
# attribute becomes <div class='div-TYPE'> framed by begin/end comments and
# is tracked in +@open_divs+ while its content is traversed; a div without
# a type contributes only its traversed content.
def handle_div(e)
  @div_count += 1
  n = @div_count
  return traverse(e) unless e.has_attribute?('type')

  @open_divs.push(e)
  inner = traverse(e)
  @open_divs.pop
  "<!-- begin div#{n}--><div class='div-#{e['type']}'>#{inner}</div><!-- end of div#{n} -->"
end
155
+
156
# Wraps the traversed content of a <figure> in <p class='figure'>.
def handle_figure(e)
  "<p class='figure'>#{traverse(e)}</p>"
end
159
+
160
# Converts a <g> (gaiji reference) element.
#
# e::    the element; its ref attribute minus the first character is the
#        gaiji code
# mode:: 'txt' yields plain text, any other mode yields HTML
#
# Plain-text mode returns the romanisation for SD codes and the composed-
# glyph expression (zzs) otherwise.
#
# HTML mode, in order of preference:
# * SD codes: '(' / ')' for SD-E35A / SD-E35B, else a 'siddam' span
# * RJ codes: a 'ranja' span
# * a unicode mapping listed in +@unicode1+: the character itself
# * otherwise a gaijiAnchor link whose visible text is the first available
#   of: unicode character, normal_unicode, normal form, zzs. The matching
#   gaijiInfo span is added to the back matter of the current juan once per
#   glyph image URL.
#
# Aborts the process when the code is unknown, or when plain text is
# requested for a non-SD glyph that has no zzs.
def handle_g(e, mode)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
  zzs = g['zzs']

  if mode == 'txt'
    return g['roman'] if gid.start_with?('SD')
    abort "缺組字式:#{g}" if zzs.nil?
    return zzs
  end

  @char_count += 1

  if gid.start_with?('SD')
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return "<span class='siddam' roman='#{g['roman']}' code='#{gid}' char='#{g['sd-char']}'/>"
  end

  if gid.start_with?('RJ')
    return "<span class='ranja' roman='#{g['roman']}' code='#{gid}' char='#{g['rj-char']}'/>"
  end

  shown = ''
  if g.key?('unicode')
    # Characters already in Unicode 1.1 are emitted as they are.
    return g['unicode-char'] if @unicode1.include?(g['unicode'])
    shown = g['unicode-char']
  end

  normals = ''
  if g.key?('normal_unicode')
    normals = g['normal_unicode']
    shown = normals if shown.empty?
  end

  if g.key?('normal')
    normals += ', ' unless normals == ''
    normals += g['normal']
    shown = g['normal'] if shown.empty?
  end

  shown = zzs if shown.empty?

  href = 'http://dict.cbeta.org/dict_word/gaiji-cb/%s/%s.gif' % [gid[2, 2], gid]
  unless @back[@juan].include?(href)
    @back[@juan] += "<span id='#{gid}' class='gaijiInfo' figure_url='#{href}' zzs='#{zzs}' nor='#{normals}'>#{shown}</span>\n"
  end
  "<a class='gaijiAnchor' href='##{gid}'>#{shown}</a>"
end
232
+
233
# Converts a <graphic> into an empty 'graphic' span that carries only the
# file name (no directory part) of the element's url attribute.
def handle_graphic(e)
  "<span imgsrc='%s' class='graphic'></span>" % File.basename(e['url'])
end
237
+
238
# Converts a <head> into <p class='head'>; heads of type 'added' are dropped.
def handle_head(e)
  return '' if e['type'] == 'added'

  "<p class='head'>#{traverse(e)}</p>"
end
245
+
246
# Converts an <item> into an <li> followed by a newline.
def handle_item(e)
  "<li>#{traverse(e)}</li>\n"
end
249
+
250
# Wraps the traversed content of a <juan> element in <p class='juan'>.
def handle_juan(e)
  "<p class='juan'>#{traverse(e)}</p>"
end
253
+
254
# Converts an <l> (verse line) element.
#
# Inside an 'abnormal' <lg> only the traversed content is returned.
# Otherwise the content becomes a <div class="lg-cell">. The first cell of
# an <lg> takes over the text-indent declaration of the parent's rend
# attribute, and a <div class='lg-row'> is opened in front of the cell when
# no row is currently open.
#
# The text-indent declaration is cut at the next ';', so a property that
# follows it in the rend value no longer leaks into the cell's style.
def handle_l(e)
  return traverse(e) if @lg_type == 'abnormal'

  # Tells handle_lb that a line break seen now lies inside a cell.
  @in_l = true

  doc = Nokogiri::XML::Document.new
  cell = doc.create_element('div')
  cell['class'] = 'lg-cell'
  cell.inner_html = traverse(e)

  if @first_l
    parent = e.parent
    if parent.has_attribute?('rend')
      indent = parent['rend'][/text-indent:[^;]*/]
      cell['style'] = indent if indent
    end
    @first_l = false
  end
  r = cell.to_s

  unless @lg_row_open
    r = "\n<div class='lg-row'>" + r
    @lg_row_open = true
  end
  @in_l = false
  r
end
285
+
286
# Converts an <lb> (line break) into a span whose id and text are the line
# head "<sutra>_p<n>".
#
# Resets the per-line character counter and records the line number in
# +@lb+. Each verse line lives in its own lg-row, so an open row is closed
# first, unless the line break sits inside an <l>, in which case the row
# stays open.
def handle_lb(e)
  @char_count = 1
  @lb = e['n']
  line_head = @sutra_no + '_p' + @lb

  closing = ''
  if @lg_row_open && !@in_l
    closing = "</div><!-- end of lg-row -->"
    @lg_row_open = false
  end
  closing + "<span class='lb' \nid='#{line_head}'>#{line_head}</span>"
end
301
+
302
# Converts a <lem> element.
#
# When the witness list names CBETA but not the base edition, the reading is
# a CBETA emendation: it is wrapped in <span class='cbeta'>, preceded by a
# DILA note anchor, and a footnote built by +lem_note_cf+ and +lem_note_rdg+
# is appended to the back matter of the current juan. Otherwise only the
# traversed content is returned.
#
# The note number is captured right after it is allocated, so the anchor and
# its footnote keep the same id even if rendering the content allocates
# further notes. A <lem> without a wit attribute is treated as an ordinary
# reading.
def handle_lem(e)
  w = e['wit'].to_s
  return traverse(e) unless w.include?('CBETA') && !w.include?(@orig)

  @dila_note += 1
  note_no = @dila_note
  r = "<a class='noteAnchor dila' href='#dila_note#{note_no}'></a>"
  r += "<span class='cbeta'>%s</span>" % traverse(e)

  note = lem_note_cf(e) + lem_note_rdg(e)
  @back[@juan] += "<span class='footnote_dila' id='dila_note#{note_no}'>#{note}</span>\n"
  r
end
318
+
319
# Converts an <lg> (verse group) element.
#
# An 'abnormal' group becomes <p class='lg-abnormal'>. Any other group
# becomes <div class="lg">; its style is the rend attribute without the
# text-indent declaration (that one is applied to the first cell by
# handle_l). A row left open by the last line is closed at the end.
#
# Only the text-indent declaration itself (up to and including its ';') is
# removed, so properties that follow it in the rend value are preserved.
def handle_lg(e)
  @lg_type = e['type']
  return "<p class='lg-abnormal'>" + traverse(e) + "</p>" if @lg_type == 'abnormal'

  @first_l = true
  doc = Nokogiri::XML::Document.new
  node = doc.create_element('div')
  node['class'] = 'lg'
  node['style'] = e['rend'].gsub(/text-indent:[^;]*;?/, '') if e.has_attribute?('rend')

  @lg_row_open = false
  node.inner_html = traverse(e)
  if @lg_row_open
    node.inner_html += '</div><!-- end of lg -->'
    @lg_row_open = false
  end
  "\n" + node.to_s
end
343
+
344
# Wraps the traversed content of a <list> in <ul>.
def handle_list(e)
  "<ul>#{traverse(e)}</ul>"
end
347
+
348
# Converts a <milestone>. Only unit="juan" produces output: a "<juan N>"
# marker that handle_sutra later uses to split the text into one file per
# juan.
#
# Divs that are still open span the juan boundary (e.g. T55n2154,
# p. 680a29, spans juan 19 and 20), so they are closed in front of the
# marker and reopened after it. The reopened divs carry the same
# 'div-TYPE' class that handle_div gives them.
#
# Also makes N the current juan and seeds its back matter from +@back[0]+.
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  r = '</div>' * @open_divs.size
  @juan = e['n'].to_i
  @back[@juan] = @back[0]
  r += "<juan #{@juan}>"
  @open_divs.each { |d| r += "<div class='div-#{d['type']}'>" }
  r
end
362
+
363
# Converts a <mulu> (table-of-contents entry). Only entries of type '品'
# produce output: an empty <mulu class='pin'> element whose s attribute is
# the entry's plain text. Other entries yield ''.
def handle_mulu(e)
  return '' unless e['type'] == '品'

  @pass.push(false)
  title = traverse(e, 'txt')
  @pass.pop
  "<mulu class='pin' s='%s'/>" % title
end
372
+
373
# Dispatches one XML node to the handler for its kind.
#
# e::    the node
# mode:: output mode, forwarded to the text and gaiji handlers
#
# Comments and elements listed in PASS produce nothing, as do <foreign>,
# <rdg>, <reg> and <sic>. Text nodes go to handle_text. Elements without a
# dedicated handler are traversed.
def handle_node(e, mode)
  return '' if e.comment?
  return handle_text(e, mode) if e.text?

  tag = e.name
  return '' if PASS.include?(tag)

  case tag
  when 'foreign', 'rdg', 'reg', 'sic' then ''
  when 'anchor'    then handle_anchor(e)
  when 'app'       then handle_app(e)
  when 'byline'    then handle_byline(e)
  when 'corr'      then handle_corr(e)
  when 'div'       then handle_div(e)
  when 'figure'    then handle_figure(e)
  when 'g'         then handle_g(e, mode)
  when 'graphic'   then handle_graphic(e)
  when 'head'      then handle_head(e)
  when 'item'      then handle_item(e)
  when 'juan'      then handle_juan(e)
  when 'l'         then handle_l(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'lg'        then handle_lg(e)
  when 'list'      then handle_list(e)
  when 'mulu'      then handle_mulu(e)
  when 'note'      then handle_note(e)
  when 'milestone' then handle_milestone(e)
  when 'p'         then handle_p(e)
  when 'sg'        then handle_sg(e)
  when 't'         then handle_t(e)
  else traverse(e)
  end
end
408
+
409
+
410
# Converts a <note> element.
#
# By type:
# * 'equivalent', 'rest', and types starting with 'cf': nothing
# * 'orig': the note text goes to the back matter as 'footnote_orig'; an
#   anchor is returned unless a 'mod' note with the same number exists
# * 'mod': the note text goes to the back matter as 'footnote_cb' and an
#   anchor is returned
#
# Notes of any other type, or without a type, yield nothing when their resp
# attribute starts with 'CBETA'. Otherwise they yield their traversed
# content, wrapped in a 'doube-line-note' span when place is 'inline'.
def handle_note(e)
  n = e['n']
  if e.has_attribute?('type')
    t = e['type']
    case t
    when 'equivalent', 'rest'
      return ''
    when 'orig'
      # Note text is rendered with the "body text" flag switched off.
      @pass << false
      s = traverse(e)
      @pass.pop
      @back[@juan] += "<span class='footnote_orig' id='n#{n}'>#{s}</span>\n"
      return @mod_notes.include?(n) ? '' : "<a class='noteAnchor' href='#n#{n}'></a>"
    when 'mod'
      @pass << false
      s = traverse(e)
      @pass.pop
      @back[@juan] += "<span class='footnote_cb' id='n#{n}'>#{s}</span>\n"
      return "<a class='noteAnchor' href='#n#{n}'></a>"
    else
      return '' if t.start_with?('cf')
    end
  end

  return '' if e.has_attribute?('resp') && e['resp'].start_with?('CBETA')

  inline = e.has_attribute?('place') && e['place'] == 'inline'
  inner = traverse(e)
  inline ? "<span class='doube-line-note'>#{inner}</span>" : inner
end
452
+
453
# Converts a <p> into an HTML paragraph. The paragraph starts with a
# lineInfo span holding the line number that was current before the
# content was traversed.
def handle_p(e)
  line_info = "<span class='lineInfo'>#{@lb}</span>"
  '<p>' + line_info + traverse(e) + '</p>'
end
459
+
460
# Puts the traversed content of an <sg> element in parentheses.
def handle_sg(e)
  "(#{traverse(e)})"
end
463
+
464
# Converts one XML file and writes one HTML file per juan into
# +@out_folder+, named "<sutra>_NNN.htm".
#
# All per-sutra state is reset, the XML is rendered by +parse_xml+, and the
# resulting text is split at the "<juan N>" markers produced by
# handle_milestone. Text in front of the first marker goes into the first
# juan's file. Every file gets the HTML header, the juan's body, and the
# juan's back matter.
#
# Each file is written inside a File.open block, so it is closed even if
# writing fails, and a juan marker with nothing after it still yields a
# complete file instead of an unterminated one.
#
# xml_fn:: path of the source XML file
def handle_sutra(xml_fn)
  puts "handle sutra #{xml_fn}"
  @back = { 0 => '' }
  @char_count = 1
  @dila_note = 0
  @div_count = 0
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Move note anchors into the lg-cell that follows them; otherwise
  # rendering lg as a table causes problems.
  text.gsub!(/(<a class='noteAnchor'[^>]*><\/a>)(<div class="lg-cell"[^>]*>)/, '\2\1')

  # parts = [text before first marker, marker, body, marker, body, ...]
  parts = text.split(/(<juan \d+>)/)
  leading = parts.shift.to_s

  # One file per juan.
  parts.each_slice(2) do |marker, body|
    juan_no = marker[/<juan (\d+)>/, 1].to_i
    fn = "#{@sutra_no}_%03d.htm" % juan_no
    File.open(File.join(@out_folder, fn), 'w') do |fo|
      fo.write(juan_file_header(fn))
      fo.write(leading)
      fo.write(body.to_s + "\n</div><!-- end of div[@id='body'] -->\n")
      fo.write("<div id='back'>\n" + @back[juan_no].to_s + "</div>\n")
      fo.write('</body></html>')
    end
    leading = ''
  end
end

# Builds the HTML that opens a juan file, up to and including the opening
# tag of the body div.
def juan_file_header(fn)
  <<eos
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="filename" content="#{fn}" />
<title>#{@title}</title>
</head>
<body>
<!--
來源 XML CBETA P5a: https://github.com/cbeta-org/xml-p5a.git
轉檔程式: Dropbox/DILA-DA/cbeta-html/bin/x2h.rb Version #{Date.today}
說明文件: http://wiki.ddbc.edu.tw/pages/CBETA_XML_P5a_%E8%BD%89_HTML
-->
<div id='body'>
eos
end
524
+
525
# Converts a <t> element: its traversed content, or '' when its place
# attribute contains 'foot'.
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  traverse(e)
end
531
+
532
# Converts a text node.
#
# e::    the text node
# mode:: output mode; only 'html' wraps body text in a span
#
# Text that is empty after chomping, or that is a direct child of <app>,
# yields ''. Line breaks inside the text (CBETA XML contains superfluous
# ones) are removed and the result is HTML-escaped.
#
# In the body (+@pass.last+ true) and in 'html' mode the text is wrapped in
# <span class='t'> carrying the current line number and the character
# offset within the line. The offset counter then advances by the number
# of characters of the text itself, not by the length of the generated
# markup.
def handle_text(e, mode)
  s = e.content.chomp
  return '' if s.empty?
  return '' if e.parent.name == 'app'

  plain = s.gsub(/[\n\r]/, '')
  r = CGI.escapeHTML(plain) # e.g. & becomes &amp;

  if @pass.last && mode == 'html'
    r = "<span class='t' l='#{@lb}' w='#{@char_count}'>#{r}</span>"
    @char_count += plain.size
  end
  r
end
550
+
551
# Converts one volume, e.g. 'T01'.
#
# Only volumes whose id starts with 'T' are supported; for anything else the
# process is aborted. The output folder <out_root>/<series>/<vol> is
# recreated from scratch, then every file of <xml_root>/<series>/<vol> is
# handed to handle_sutra.
def handle_vol(vol)
  puts 'x2h ' + vol
  abort "未處理底本" unless vol.start_with?('T')

  @orig = "【大】"
  @vol = vol
  @series = vol[0]
  @out_folder = File.join(@out_root, @series, vol)
  FileUtils.remove_dir(@out_folder, true)
  FileUtils.mkdir_p(@out_folder)

  Dir[File.join(@xml_root, @series, vol) + "/*"].each { |f| handle_sutra(f) }
end
569
+
570
# Converts every volume from +v1+ to +v2+ inclusive, e.g. 'T01' .. 'T03'.
#
# v1:: first volume id
# v2:: last volume id
#
# The volumes are the entries of <xml_root>/<series> whose names fall into
# that range by string comparison; they are processed in sorted order.
# The folder is looked up under +@xml_root+ (the previous code referred to
# a constant IN that this class never defines).
def handle_vols(v1, v2)
  @series = v1[0]
  folder = File.join(@xml_root, @series)
  Dir.entries(folder).sort.each do |vol|
    next if vol < v1 || vol > v2

    handle_vol(vol)
  end
end
579
+
580
# Builds the "revision based on" sentence of a <lem> footnote from the
# <note> children of +e+ whose type starts with 'cf'.
#
# ex: T32n1670A.xml, p. 703a16
#   <note type="cf1">K30n1002_p0257a01-a23</note>
#
# Returns '' when there is no such note.
def lem_note_cf(e)
  refs = e.xpath('./note')
          .select { |n| n.key?('type') && n['type'].start_with?('cf') }
          .map { |n| n.content }
  refs.empty? ? '' : '修訂依據:' + refs.join(';') + '。'
end
595
+
596
# Builds the part of a <lem> footnote that quotes the base edition: for
# every <rdg> sibling whose witness list contains the base edition siglum,
# the siglum followed by the reading's text (MISSING when it is empty).
#
# Returns '' when no reading qualifies; otherwise the quotes followed by '。'.
def lem_note_rdg(lem)
  app = lem.parent
  @pass << false # readings are rendered as plain, non-body text
  note = app.xpath('rdg').inject('') do |acc, rdg|
    next acc unless rdg['wit'].include?(@orig)

    s = traverse(rdg, 'back')
    acc + @orig + (s.empty? ? MISSING : s)
  end
  @pass.pop
  note.empty? ? note : note + '。'
end
611
+
612
# Reads an XML file, patches the raw text where needed, and returns the
# parsed document with its namespaces removed.
#
# Patches applied before parsing:
# * T16n0657 only: an inline note spans two verse lines there, so the
#   <lb n="0206b29"> is moved in front of the note's end tag. That way the
#   lg-row ends before the inline note does.
# * An <lb> (optionally preceded by a <pb>) directly in front of a
#   <milestone unit="juan"/> is moved behind it, because that line belongs
#   to the juan that is about to start.
def open_xml(fn)
  xml = File.read(fn)

  if fn.include?('T16n0657')
    xml.sub!(/(<\/note>)(\n<lb n="0206b29" ed="T"\/>)/, '\2\1')
  end

  xml.gsub!(/((?:<pb [^>]+>\n?)?<lb [^>]+>\n?)(<milestone [^>]*unit="juan"\/>)/, '\2\1')

  Nokogiri::XML(xml).tap { |doc| doc.remove_namespaces! }
end
629
+
630
# Records the n attribute of every <note type="mod"> of the document in
# +@mod_notes+.
def read_mod_notes(doc)
  doc.xpath("//note[@type='mod']").each do |note|
    @mod_notes.add(note['n'])
  end
end
635
+
636
# Parses one XML file and returns the rendered HTML of its <body>.
#
# Along the way it sets +@title+ to the last whitespace-separated word of
# the document title (rendered as plain text) and collects the numbers of
# all 'mod' notes. The title is rendered with the "body text" flag off, the
# body with the flag on.
def parse_xml(xml_fn)
  @pass = [false]
  doc = open_xml(xml_fn)

  title_node = doc.xpath("//titleStmt/title")[0]
  @title = traverse(title_node, 'txt').split[-1]

  read_mod_notes(doc)

  body = doc.root.xpath("text/body")[0]
  @pass = [true]
  traverse(body)
end
654
+
655
# Renders every child of +e+ with +handle_node+ and returns the
# concatenated result.
#
# e::    the parent node
# mode:: output mode handed to every child (default 'html')
def traverse(e, mode='html')
  e.children.inject('') { |acc, child| acc + handle_node(child, mode) }
end
663
+
664
+ end