cbeta 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ require 'json'
2
+
3
# Read-only access to the CBETA gaiji (rare character) database.
class CBETA::Gaiji
  # Loads the gaiji database from the gaiji.json file that sits next to
  # this source file.
  def initialize
    fn = File.join(File.dirname(__FILE__), 'gaiji.json')
    # Read explicitly as UTF-8 so that parsing does not depend on the
    # locale of the running process.
    @gaijis = JSON.parse(File.read(fn, encoding: 'UTF-8'))
  end

  # Looks up a gaiji by its CB code and returns its info hash
  # (nil when the code is not in the database).
  #
  # Example:
  #
  #   g = CBETA::Gaiji.new
  #   g["CB01002"]
  #
  # Returns:
  #   {
  #     "zzs": "[得-彳]",
  #     "unicode": "3775",
  #     "unicode-char": "㝵",
  #     "zhuyin": [ "ㄉㄜˊ", "ㄞˋ" ]
  #   }
  def [](cb)
    @gaijis[cb]
  end

  # Returns the zhuyin readings (an array) recorded for the given CB code,
  # or nil when the code is not in the database.
  #
  # Example:
  #
  #   g = CBETA::Gaiji.new
  #   g.zhuyin("CB00023") # return [ "ㄍㄢˇ", "ㄍㄢ", "ㄧㄤˊ", "ㄇㄧˇ", "ㄇㄧㄝ", "ㄒㄧㄤˊ" ]
  def zhuyin(cb)
    return nil unless @gaijis.key?(cb)

    @gaijis[cb]['zhuyin']
  end
end
@@ -0,0 +1,151 @@
1
+ require 'fileutils'
2
+ require 'nokogiri'
3
+
4
+ # 將 CBETA HTML 轉為 純文字(含行首資訊)
5
+ #
6
+ # example:
7
+ #
8
+ # h2t = CBETA::HTMLToText.new('/temp/cbeta-html', '/temp/cbeta-text')
9
+ # h2t.convert("T01") # 轉換大正藏第一冊
10
+ class CBETA::HTMLToText
11
# html_root:: path of the source HTML tree
# out_root:: path the text files are written under
def initialize(html_root, out_root)
  @html_root, @out_root = html_root, out_root
end
17
+
18
# Converts one volume. For example, to convert volume 1 of the Taisho canon:
#
#   convert("T01")
#
# The volume id is upper-cased; its first character is taken as the corpus id.
def convert(arg)
  @vol = arg.upcase
  @corpus = @vol.slice(0)
  @dirty = false
  handle_vol
end
27
+
28
+ private
29
+
30
# Concatenates the text produced by handle_node for every child of e,
# then strips all space characters from the result.
def traverse(e)
  joined = e.children.inject('') { |acc, child| acc + handle_node(child) }
  joined.gsub(' ', '')
end
37
+
38
# Returns the content of a text node with newlines and the listed
# punctuation marks removed ('' for an empty node).
def handle_text(e)
  text = e.content.chomp
  text.empty? ? '' : text.gsub(/[\n,、—!。:「]/, '')
end
43
+
44
# Converts a <span> according to its class attribute:
# * doube-line-note: traversed text, wrapped in parentheses unless it
#   already starts with one
# * lb: line-head id followed by '║'; every lb except the first one seen
#   since @dirty was reset is preceded by a newline
# * ranja / siddam: the placeholder '【◇】'
# * lineInfo / star: nothing
# * anything else: the traversed content
def handle_span(e)
  case e['class']
  when 'doube-line-note'
    note = traverse(e)
    note.start_with?('(') ? note : "(#{note})"
  when 'lb'
    prefix = @dirty ? "\n" : ''
    @dirty = true
    # Line-head info: T05n0220a becomes T05n0220
    prefix + e['id'].sub(/^(T0\dn0220)[a-z](.*)$/, '\1\2') + '║'
  when 'lineInfo', 'star'
    ''
  when 'ranja', 'siddam'
    '【◇】'
  else
    traverse(e)
  end
end
72
+
73
# Converts one HTML node to text, dispatching on the node kind and name.
# Comments and <head> produce nothing, gaiji anchors are replaced by the
# value collected in @gaiji for their target id, figure paragraphs become
# '【圖】', and the div with id 'back' is dropped.
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  case e.name
  when 'a'
    # href is '#<id>'; strip the leading '#'
    e['class'] == 'gaijiAnchor' ? @gaiji[e['href'][1..-1]] : traverse(e)
  when 'div'
    e['id'] != 'back' ? traverse(e) : ''
  when 'head'
    ''
  when 'p'
    e['class'] == 'figure' ? '【圖】' : traverse(e)
  when 'span'
    handle_span(e)
  else
    traverse(e)
  end
end
103
+
104
# Recreates the output folder <out_root>/<corpus>/<vol> from scratch and
# returns its path. Any previous content of that folder is removed.
def prepare_folder
  folder = File.join(@out_root, @corpus, @vol)
  # Second positional argument is "force": do not fail when the folder
  # does not exist yet. (The original wrote `force=true`, which silently
  # created an unused local variable.)
  FileUtils.remove_dir(folder, true)
  FileUtils.mkdir_p(folder)
  folder
end
110
+
111
# Converts one HTML file and appends its text to the output file of the
# sutra it belongs to.
#
# The sutra id is the file's base name up to the last underscore; for
# T..n0220 everything after the sutra number is dropped as well. A new
# output file "<sutra>.txt" is opened in @folder_out whenever the sutra id
# differs from the previously handled one.
def handle_file(path)
  sutra = File.basename(path, ".*")
  sutra.sub!(/^(.*)_.*$/, '\1')
  sutra.sub!(/(T\d\dn0220).*$/, '\1') # T0220 BM is not split into a, b, c...

  if sutra != @last_sutra
    # Close the previous sutra's output before switching, otherwise one
    # file handle per sutra stays open for the whole run.
    @fo.close if @fo && !@fo.closed?

    txt_path = File.join(@folder_out, sutra + '.txt')
    puts "h2t #{txt_path}"
    @fo = File.open(txt_path, 'w')
    @last_sutra = sutra
    @dirty = false
  end

  # Block form closes the input even when parsing raises.
  doc = File.open(path) { |f| Nokogiri::HTML(f) }

  # Map gaiji id => zzs attribute, used when gaiji anchors are converted.
  @gaiji = {}
  doc.css("span.gaijiInfo").each { |e| @gaiji[e['id']] = e['zzs'] }

  text = traverse(doc.root)

  # Siddham characters: collapse runs of placeholders into a single one
  text.gsub!(/(\((【◇】)+\)|(【◇】)|【◇】)+/, '【◇】')

  @fo.write(text)
end
141
+
142
# Converts every file found in <html_root>/<corpus>/<vol> into the freshly
# prepared output folder.
def handle_vol
  folder_in = File.join(@html_root, @corpus, @vol)
  @folder_out = prepare_folder
  @last_sutra = ''
  @fo = nil
  begin
    # handle_file starts a new output file whenever the sutra id changes,
    # so files of one sutra must be adjacent and in order. Dir[] does not
    # guarantee any order on older Rubies, hence the explicit sort.
    Dir["#{folder_in}/*"].sort.each { |f| handle_file(f) }
  ensure
    # Flush and close the last output file.
    @fo.close if @fo && !@fo.closed?
  end
end
150
+
151
+ end
@@ -0,0 +1,664 @@
1
+ require 'cgi'
2
+ require 'date'
3
+ require 'fileutils'
4
+ require 'json'
5
+ require 'nokogiri'
6
+ require 'set'
7
+
8
# Names of elements whose content is never written to the output
PASS = %w[back teiHeader].freeze

# Placeholder used in footnotes when a version has no text at that point
MISSING = '-'.freeze
13
+
14
+ # 處理 CBETA XML P5a
15
+ #
16
+ # CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
17
+ #
18
+ # 轉檔規則請參考: http://wiki.ddbc.edu.tw/pages/CBETA_XML_P5a_轉_HTML
19
+ class CBETA::P5aToHTML
20
+
21
# xml_root:: path of the source CBETA XML P5a tree
# out_root:: path the HTML files are written under
def initialize(xml_root, out_root)
  @xml_root, @out_root = xml_root, out_root
  @gaijis = CBETA::Gaiji.new

  # Load the list of Unicode 1.1 code points that ships next to this file
  list_path = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(list_path))
end
33
+
34
# Converts CBETA XML P5a to HTML.
#
# Example: convert volume 1 of the Taisho canon
#
#   x2h = CBETA::P5aToHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T01')
#
# Example: convert the whole Taisho canon
#
#   x2h = CBETA::P5aToHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T')
#
# T is the ID of the Taisho canon; for the CBETA canon ID scheme see
# http://www.cbeta.org/format/id.php
#
# arg:: nil (convert everything), a one-letter collection id, a volume id,
#       or a volume range written as "T01..T03". Case-insensitive; the
#       caller's string is not modified.
def convert(arg=nil)
  return convert_all if arg.nil?

  # Non-destructive upcase: `upcase!` altered the caller's string and
  # raised on frozen string literals.
  target = arg.upcase
  if target.size == 1
    handle_collection(target)
  elsif target.include?('..')
    target.match(/^([^\.]+?)\.\.([^\.]+)$/) { |m| handle_vols(m[1], m[2]) }
  else
    handle_vol(target)
  end
end
63
+
64
+ private
65
+
66
# Converts every collection under @xml_root; only entries whose name is a
# single upper-case letter are treated as collections.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
72
+
73
# Converts an <anchor> element.
#
# * id starting with 'nkr_note_orig': emits a note anchor and appends the
#   note text to the back matter of the current juan. The note is looked
#   up in @notes; when it is unavailable nothing is emitted.
# * id starting with 'fx': emits a star marker.
# * type 'circle': emits '◎'.
# * anything else: emits nothing.
def handle_anchor(e)
  if e.has_attribute?('id')
    id = e['id']
    if id.start_with?('nkr_note_orig')
      # NOTE(review): nothing in this class assigns @notes, so the
      # unguarded lookup raised NoMethodError on nil. Skip the footnote
      # instead of aborting the whole conversion.
      note = @notes && @notes[id]
      return '' if note.nil?

      note_text = traverse(note)
      n = id[/^nkr_note_orig_(.*)$/, 1]
      # Keep `+=` (not `<<`): juan back-matter strings start out as the
      # same object as @back[0].
      @back[@juan] += "<span class='footnote' id='n#{n}'>#{note_text}</span>\n"
      return "<a class='noteAnchor' href='#n#{n}'></a>"
    elsif id.start_with?('fx')
      return "<span class='star'>[*]</span>"
    end
  end

  return '◎' if e.has_attribute?('type') && e['type'] == 'circle'

  ''
end
95
+
96
# Converts an <app> element: a star-type app is prefixed with a note
# anchor pointing at its corresp target (leading '#' stripped), followed
# by the traversed content.
def handle_app(e)
  anchor = ''
  anchor = "<a class='noteAnchor star' href='#n#{e['corresp'][1..-1]}'></a>" if e['type'] == 'star'
  anchor + traverse(e)
end
104
+
105
# Wraps a byline in <p class="byline">, preceded by a lineInfo span that
# carries the current line number (@lb).
def handle_byline(e)
  opening = "<p class=\"byline\"><span class='lineInfo'>#{@lb}</span>"
  opening + traverse(e) + '</p>'
end
111
+
112
# Converts every volume folder of collection c found under @xml_root.
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c
  skipped = ['.', '..', '.DS_Store']
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless skipped.include?(vol)
  end
end
121
+
122
# Converts a <corr> element. When it sits inside a <choice> that also has
# a <sic>, a numbered note anchor is emitted and a footnote holding the
# base-edition reading (@orig followed by the sic text, or MISSING when
# the sic is empty) is appended to the back matter of the current juan.
# The corrected text itself is always wrapped in <span class='cbeta'>.
def handle_corr(e)
  anchor = ''
  sic = e.parent.name == 'choice' ? e.parent.at_xpath('sic') : nil
  unless sic.nil?
    @dila_note += 1
    # Pin the number now: traversing the sic may itself allocate further
    # notes, and the footnote id has to match this anchor's href.
    note_no = @dila_note
    anchor = "<a class='noteAnchor dila' href='#dila_note#{note_no}'></a>"

    sic_text = traverse(sic, 'back')
    note = @orig + (sic_text.empty? ? MISSING : sic_text)
    # Keep `+=` (not `<<`): juan back-matter strings start out as the
    # same object as @back[0].
    @back[@juan] += "<span class='footnote_dila' id='dila_note#{note_no}'>#{note}</span>\n"
  end
  anchor + "<span class='cbeta'>%s</span>" % traverse(e)
end
142
+
143
# Converts a <div>. A div with a type attribute becomes an HTML div with
# class 'div-<type>' bracketed by numbered comments, and is tracked in
# @open_divs while its content is traversed; a div without a type is
# replaced by its content.
def handle_div(e)
  @div_count += 1
  n = @div_count # captured before nested divs bump the counter
  return traverse(e) unless e.has_attribute?('type')

  @open_divs << e
  inner = traverse(e)
  @open_divs.pop
  "<!-- begin div#{n}--><div class='div-#{e['type']}'>#{inner}</div><!-- end of div#{n} -->"
end
155
+
156
# Wraps the content of a <figure> in <p class='figure'>.
def handle_figure(e)
  "<p class='figure'>#{traverse(e)}</p>"
end
159
+
160
# Converts a <g> (gaiji reference) element.
#
# In 'txt' mode the result is plain text: the romanization for Siddham
# (SD) characters, otherwise the composition formula (zzs).
#
# In any other mode the character shown is chosen in this order:
#   1. the Unicode character, used bare when its code point is in the
#      Unicode 1.1 list;
#   2. otherwise the Unicode character, the normal_unicode value, the
#      normalized form or the composition formula (first one available),
#      wrapped in a gaiji anchor whose details are appended once per juan
#      to the back matter so they can be shown in a popup.
# Siddham and Ranjana characters are emitted as dedicated spans.
#
# Aborts when the gaiji is unknown, or when 'txt' mode needs a composition
# formula that is missing.
def handle_g(e, mode)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
  zzs = g['zzs']
  siddham = gid.start_with?('SD')

  if mode == 'txt'
    return g['roman'] if siddham
    abort "缺組字式:#{g}" if zzs.nil?
    return zzs
  end

  @char_count += 1

  if siddham
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return "<span class='siddam' roman='#{g['roman']}' code='#{gid}' char='#{g['sd-char']}'/>"
  end

  if gid.start_with?('RJ')
    return "<span class='ranja' roman='#{g['roman']}' code='#{gid}' char='#{g['rj-char']}'/>"
  end

  shown = ''
  if g.has_key?('unicode')
    return g['unicode-char'] if @unicode1.include?(g['unicode']) # Unicode 1.1: use as is
    shown = g['unicode-char']
  end

  nor = ''
  if g.has_key?('normal_unicode')
    nor = g['normal_unicode']
    shown = nor if shown.empty?
  end

  if g.has_key?('normal')
    nor = (nor == '' ? '' : nor + ', ') + g['normal']
    shown = g['normal'] if shown.empty?
  end

  shown = zzs if shown.empty?

  href = 'http://dict.cbeta.org/dict_word/gaiji-cb/%s/%s.gif' % [gid[2, 2], gid]
  unless @back[@juan].include?(href)
    # Keep `+=` (not `<<`): juan back-matter strings start out as the
    # same object as @back[0].
    @back[@juan] += "<span id='#{gid}' class='gaijiInfo' figure_url='#{href}' zzs='#{zzs}' nor='#{nor}'>#{shown}</span>\n"
  end
  "<a class='gaijiAnchor' href='##{gid}'>#{shown}</a>"
end
232
+
233
# Emits a placeholder span for a <graphic>; only the base name of the
# url attribute is kept.
def handle_graphic(e)
  image = File.basename(e['url'])
  "<span imgsrc='#{image}' class='graphic'></span>"
end
237
+
238
# Wraps a <head> in <p class='head'>; heads of type 'added' are dropped.
def handle_head(e)
  e['type'] == 'added' ? '' : "<p class='head'>#{traverse(e)}</p>"
end
245
+
246
# Wraps the content of an <item> in <li>, followed by a newline.
def handle_item(e)
  "<li>#{traverse(e)}</li>\n"
end
249
+
250
# Wraps the content of a <juan> in <p class='juan'>.
def handle_juan(e)
  "<p class='juan'>#{traverse(e)}</p>"
end
253
+
254
# Converts an <l> (verse line). Inside an 'abnormal' lg the content is
# passed through unchanged. Otherwise the content becomes a
# <div class="lg-cell">; the first cell of an lg inherits the text-indent
# declaration of the lg's rend attribute, and a <div class='lg-row'> is
# opened in front of the cell when no row is currently open.
def handle_l(e)
  return traverse(e) if @lg_type == 'abnormal'

  @in_l = true

  doc = Nokogiri::XML::Document.new
  cell = doc.create_element('div')
  cell['class'] = 'lg-cell'
  cell.inner_html = traverse(e)

  if @first_l
    lg = e.parent
    if lg.has_attribute?('rend')
      indent = lg['rend'].scan(/text-indent:[^:]*/).first
      cell['style'] = indent unless indent.nil?
    end
    @first_l = false
  end

  html = cell.to_s
  unless @lg_row_open
    html = "\n<div class='lg-row'>" + html
    @lg_row_open = true
  end
  @in_l = false
  html
end
285
+
286
# Converts an <lb> (line break): resets the per-line character counter,
# records the line number in @lb and emits a span carrying the line head
# "<sutra>_p<n>". An open lg-row is closed first, unless the lb occurs
# inside an <l>.
def handle_lb(e)
  @char_count = 1
  @lb = e['n']
  line_head = @sutra_no + '_p' + e['n']
  closing = ''
  if @lg_row_open && !@in_l
    # Each verse line lives in its own lg-row.
    # T46n1937, p. 914a01: an l holds a double-line note spanning lines.
    # T20n1092, 337c16: an lb in the middle of an l does not end the row.
    closing = "</div><!-- end of lg-row -->"
    @lg_row_open = false
  end
  closing + "<span class='lb' \nid='#{line_head}'>#{line_head}</span>"
end
301
+
302
# Converts a <lem>. When the reading is attributed to CBETA but not to the
# base edition (@orig), a numbered note anchor is emitted, the text is
# wrapped in <span class='cbeta'>, and a footnote built from lem_note_cf
# and lem_note_rdg is appended to the back matter of the current juan.
# Any other lem is replaced by its content.
def handle_lem(e)
  w = e['wit']
  return traverse(e) unless w.include?('CBETA') && !w.include?(@orig)

  @dila_note += 1
  # Pin the number now: the traversal below may allocate further notes
  # (nested corrections), and the footnote id has to match this anchor.
  note_no = @dila_note
  r = "<a class='noteAnchor dila' href='#dila_note#{note_no}'></a>"
  r += "<span class='cbeta'>%s</span>" % traverse(e)

  note = lem_note_cf(e)
  note += lem_note_rdg(e)
  # Keep `+=` (not `<<`): juan back-matter strings start out as the
  # same object as @back[0].
  @back[@juan] += "<span class='footnote_dila' id='dila_note#{note_no}'>#{note}</span>\n"
  r
end
318
+
319
# Converts an <lg> (verse group). An 'abnormal' lg becomes a single
# <p class='lg-abnormal'>. Any other lg becomes a <div class="lg"> whose
# style is the rend attribute minus its text-indent declaration; a row
# still open after the content has been traversed is closed.
def handle_lg(e)
  @lg_type = e['type']
  return "<p class='lg-abnormal'>" + traverse(e) + "</p>" if @lg_type == 'abnormal'

  @first_l = true
  doc = Nokogiri::XML::Document.new
  node = doc.create_element('div')
  node['class'] = 'lg'
  node['style'] = e['rend'].gsub(/text-indent:[^:]*/, '') if e.has_attribute?('rend')

  @lg_row_open = false
  node.inner_html = traverse(e)
  if @lg_row_open
    node.inner_html += '</div><!-- end of lg -->'
    @lg_row_open = false
  end
  "\n" + node.to_s
end
343
+
344
# Wraps the content of a <list> in <ul>.
def handle_list(e)
  "<ul>#{traverse(e)}</ul>"
end
347
+
348
# Converts a <milestone>. Only unit="juan" produces output: every div
# still open is closed, a "<juan N>" marker is emitted (used later to
# split the text into one file per juan), the back matter of the new juan
# is seeded from @back[0], and the divs are reopened.
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  # Divs that span a juan boundary must be closed first,
  # e.g. T55n2154, p. 680a29 spans juan 19 and 20.
  r = "</div>" * @open_divs.size
  @juan = e['n'].to_i
  @back[@juan] = @back[0]
  r += "<juan #{@juan}>"
  # Reopen with the same class handle_div emits ('div-<type>'); the
  # original omitted the 'div-' prefix, so the continued half of a div
  # ended up with a different class than its first half.
  @open_divs.each { |d| r += "<div class='div-#{d['type']}'>" }
  r
end
362
+
363
# Converts a <mulu> (table-of-contents entry). Only entries of type '品'
# produce output: an empty <mulu class='pin'> tag whose s attribute holds
# the entry's plain text.
def handle_mulu(e)
  return '' unless e['type'] == '品'

  @pass << false
  tag = "<mulu class='pin' s='%s'/>" % traverse(e, 'txt')
  @pass.pop
  tag
end
372
+
373
# Converts one XML node, dispatching on its kind and element name.
# Comments and elements listed in PASS produce nothing; text nodes go to
# handle_text; <g> is the only element handler that receives the mode;
# foreign/rdg/reg/sic are dropped; elements without a dedicated handler
# are replaced by their traversed content.
def handle_node(e, mode)
  return '' if e.comment?
  return handle_text(e, mode) if e.text?
  return '' if PASS.include?(e.name)

  name = e.name
  handled = %w[anchor app byline corr div figure graphic head item juan l lb
               lem lg list mulu note milestone p sg t]
  if name == 'g'
    handle_g(e, mode)
  elsif %w[foreign rdg reg sic].include?(name)
    ''
  elsif handled.include?(name)
    # Every one of these has a one-argument handler named handle_<name>.
    send("handle_#{name}", e)
  else
    traverse(e)
  end
end
408
+
409
+
410
# Converts a <note>.
#
# * type 'equivalent' / 'rest', or a type starting with 'cf': dropped.
# * type 'orig': the text is added to the back matter as footnote_orig;
#   an anchor is emitted unless a 'mod' note with the same n exists.
# * type 'mod': the text is added to the back matter as footnote_cb and
#   an anchor is emitted.
# * resp starting with 'CBETA': dropped.
# * place 'inline': wrapped in <span class='doube-line-note'>.
# * anything else: replaced by its content.
def handle_note(e)
  n = e['n']
  if e.has_attribute?('type')
    t = e['type']
    return '' if %w[equivalent rest].include?(t)

    if t == 'orig' || t == 'mod'
      @pass << false
      s = traverse(e)
      @pass.pop
      css = t == 'orig' ? 'footnote_orig' : 'footnote_cb'
      # Keep `+=` (not `<<`): juan back-matter strings start out as the
      # same object as @back[0].
      @back[@juan] += "<span class='#{css}' id='n#{n}'>#{s}</span>\n"
      return '' if t == 'orig' && @mod_notes.include?(n)
      return "<a class='noteAnchor' href='#n#{n}'></a>"
    end

    return '' if t.start_with?('cf')
  end

  return '' if e.has_attribute?('resp') && e['resp'].start_with?('CBETA')

  body = traverse(e)
  if e.has_attribute?('place') && e['place'] == 'inline'
    "<span class='doube-line-note'>#{body}</span>"
  else
    body
  end
end
452
+
453
# Wraps a paragraph in <p>, preceded by a lineInfo span that carries the
# current line number (@lb).
def handle_p(e)
  opening = "<p><span class='lineInfo'>#{@lb}</span>"
  opening + traverse(e) + '</p>'
end
459
+
460
# Wraps the content of an <sg> in parentheses.
def handle_sg(e)
  inner = traverse(e)
  '(' + inner + ')'
end
463
+
464
# Converts one sutra XML file into one HTML file per juan, written to
# @out_folder as "<sutra>_NNN.htm".
#
# The per-sutra state is reset, the XML is converted to a single HTML
# string by parse_xml, and that string is split at the "<juan N>" markers.
# Text found before the first marker is placed at the start of the first
# juan's body. Each file ends with the juan's back matter.
def handle_sutra(xml_fn)
  puts "handle sutra #{xml_fn}"
  @back = { 0 => '' }
  @char_count = 1
  @dila_note = 0
  @div_count = 0
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Move note anchors inside the lg-cell; otherwise rendering the lg as a
  # table breaks.
  text.gsub!(/(<a class='noteAnchor'[^>]*><\/a>)(<div class="lg-cell"[^>]*>)/, '\2\1')

  # Splitting on a capturing pattern yields
  # [preamble, marker, body, marker, body, ...].
  pieces = text.split(/(<juan \d+>)/)
  buf = pieces.shift.to_s

  # One file per juan
  pieces.each_slice(2) do |marker, body|
    juan_no = marker[/<juan (\d+)>$/, 1].to_i
    fn = "#{@sutra_no}_%03d.htm" % juan_no
    # NOTE(review): the leading whitespace of the header lines could not
    # be recovered from the flattened source view; confirm against the
    # repository before applying.
    s = <<eos
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="filename" content="#{fn}" />
<title>#{@title}</title>
</head>
<body>
<!--
來源 XML CBETA P5a: https://github.com/cbeta-org/xml-p5a.git
轉檔程式: Dropbox/DILA-DA/cbeta-html/bin/x2h.rb Version #{Date.today}
說明文件: http://wiki.ddbc.edu.tw/pages/CBETA_XML_P5a_%E8%BD%89_HTML
-->
<div id='body'>
eos
    # Block form guarantees the file is closed, also when writing raises.
    File.open(File.join(@out_folder, fn), 'w') do |fo|
      fo.write(s)
      fo.write(buf)
      # String#split drops a trailing empty piece, so a marker at the very
      # end of the text has no body; write a complete (empty) juan instead
      # of leaving a truncated, unclosed file behind.
      fo.write(body.to_s + "\n</div><!-- end of div[@id='body'] -->\n")
      fo.write("<div id='back'>\n" + @back[juan_no] + "</div>\n")
      fo.write('</body></html>')
    end
    buf = ''
  end
end
524
+
525
# Converts a <t>: dropped when its place attribute mentions 'foot',
# otherwise replaced by its content.
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  traverse(e)
end
531
+
532
# Converts a text node: line breaks are removed and the text is
# HTML-escaped. Text directly inside <app> is dropped.
#
# In 'html' mode, while the top of the @pass stack is truthy, the text is
# wrapped in <span class='t'> carrying the current line number (l) and the
# position of its first character within the line (w); @char_count is then
# advanced by the number of characters of the text.
def handle_text(e, mode)
  s = e.content.chomp
  return '' if s.empty?
  return '' if e.parent.name == 'app'

  # CBETA XML has superfluous line breaks between characters
  text = s.gsub(/[\n\r]/, '')

  # Turn & into &amp; (and escape the other HTML special characters)
  r = CGI.escapeHTML(text)

  # Text in the body area gets wrapped in a span
  if @pass.last && mode == 'html'
    r = "<span class='t' l='#{@lb}' w='#{@char_count}'>#{r}</span>"
    # Advance by the length of the text itself. The original added the
    # length of the wrapped markup, which inflated every later w value on
    # the line by the size of the surrounding tags.
    @char_count += text.size
  end
  r
end
550
+
551
# Converts every sutra file of one volume. The output folder
# <out_root>/<series>/<vol> is recreated from scratch first.
#
# Aborts for volumes whose id does not start with 'T': no base-edition
# siglum is defined for them.
def handle_vol(vol)
  puts 'x2h ' + vol
  abort "未處理底本" unless vol.start_with?('T')

  @orig = "【大】"
  @vol = vol
  @series = vol[0]
  @out_folder = File.join(@out_root, @series, vol)
  # Second positional argument is "force": do not fail when the folder
  # does not exist yet. (The original wrote `force=true`, which silently
  # created an unused local variable.)
  FileUtils.remove_dir(@out_folder, true)
  FileUtils.mkdir_p(@out_folder)

  source = File.join(@xml_root, @series, vol)
  # Sorted so the processing order is the same on every platform.
  Dir[source + "/*"].sort.each { |f| handle_sutra(f) }
end
569
+
570
# Converts every volume of one collection whose id lies between v1 and v2
# (inclusive, compared as strings), e.g. handle_vols('T01', 'T03').
def handle_vols(v1, v2)
  @series = v1[0]
  # The original referenced an undefined constant IN here and therefore
  # always raised NameError; the collection folder lives under @xml_root,
  # as in handle_collection and handle_vol.
  folder = File.join(@xml_root, @series)
  Dir.entries(folder).sort.each do |vol|
    next if vol < v1
    next if vol > v2
    handle_vol(vol)
  end
end
579
+
580
# Builds the "revision based on" part of a correction footnote from the
# <note> children of e whose type starts with 'cf'. Returns '' when there
# are none.
#
# ex: T32n1670A.xml, p. 703a16
#   <note type="cf1">K30n1002_p0257a01-a23</note>
def lem_note_cf(e)
  cf_notes = e.xpath('./note').select { |n| n.key?('type') && n['type'].start_with?('cf') }
  return '' if cf_notes.empty?

  '修訂依據:' + cf_notes.map { |n| n.content }.join(';') + '。'
end
595
+
596
# Builds the base-edition part of a correction footnote: for every <rdg>
# sibling of lem attributed to the base edition (@orig), the siglum
# followed by the reading's text, or MISSING when the reading is empty.
# A closing '。' is added when anything was collected.
def lem_note_rdg(lem)
  app = lem.parent
  @pass << false
  readings = app.xpath('rdg').inject('') do |acc, rdg|
    next acc unless rdg['wit'].include?(@orig)

    s = traverse(rdg, 'back')
    acc + @orig + (s.empty? ? MISSING : s)
  end
  @pass.pop
  readings.empty? ? readings : readings + '。'
end
611
+
612
# Reads an XML file, patches the raw text where the markup order gets in
# the way of the conversion, and returns the parsed document with all
# namespaces removed.
def open_xml(fn)
  xml = File.read(fn)

  if fn.include?('T16n0657')
    # Here a double-line note spans two verse lines: move the lb in front
    # of the end of the note so the lg-row is closed before the note is.
    xml.sub!(/(<\/note>)(\n<lb n="0206b29" ed="T"\/>)/, '\2\1')
  end

  # The lb (and pb) right before <milestone unit="juan"> belongs to the
  # new juan, so move the milestone in front of it.
  xml.gsub!(/((?:<pb [^>]+>\n?)?<lb [^>]+>\n?)(<milestone [^>]*unit="juan"\/>)/, '\2\1')

  Nokogiri::XML(xml).tap { |doc| doc.remove_namespaces! }
end
629
+
630
# Records the n attribute of every <note type="mod"> of the document in
# @mod_notes.
def read_mod_notes(doc)
  doc.xpath("//note[@type='mod']").each do |note|
    @mod_notes.add(note['n'])
  end
end
635
+
636
# Parses one sutra file and returns the HTML generated for its <body>.
# As a side effect @title is set to the last whitespace-separated word of
# the document title and @mod_notes is filled. Text is not wrapped in
# spans while the title is read (@pass is [false]); wrapping is switched
# on for the body (@pass is [true]).
def parse_xml(xml_fn)
  @pass = [false]
  doc = open_xml(xml_fn)

  title_node = doc.xpath("//titleStmt/title")[0]
  @title = traverse(title_node, 'txt').split.last

  read_mod_notes(doc)

  body = doc.root.xpath("text/body")[0]
  @pass = [true]
  traverse(body)
end
654
+
655
# Concatenates the output of handle_node for every child of e, in
# document order.
def traverse(e, mode='html')
  e.children.inject('') { |html, child| html + handle_node(child, mode) }
end
663
+
664
+ end