cbeta 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/cbeta.rb +11 -0
- data/lib/cbeta/bm_to_text.rb +94 -0
- data/lib/cbeta/gaiji.json +126188 -0
- data/lib/cbeta/gaiji.rb +39 -0
- data/lib/cbeta/html_to_text.rb +151 -0
- data/lib/cbeta/p5a_to_html.rb +664 -0
- data/lib/cbeta/unicode-1.1.json +1 -0
- metadata +50 -0
data/lib/cbeta/gaiji.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
# Read-only access to the CBETA gaiji (missing character) database.
class CBETA::Gaiji
  # Loads the gaiji database from the gaiji.json file that sits next to
  # this source file.
  def initialize
    json_path = File.join(File.dirname(__FILE__), 'gaiji.json')
    @gaijis = JSON.parse(File.read(json_path))
  end

  # Looks up a gaiji by its CB code and returns the stored info hash
  # (nil when the code is not in the database).
  #
  # Example:
  #
  #   g = CBETA::Gaiji.new
  #   g["CB01002"]
  #
  # Returns:
  #   {
  #     "zzs": "[得-彳]",
  #     "unicode": "3775",
  #     "unicode-char": "㝵",
  #     "zhuyin": [ "ㄉㄜˊ", "ㄞˋ" ]
  #   }
  def [](cb)
    @gaijis[cb]
  end

  # Looks up a gaiji by its CB code and returns its zhuyin array,
  # or nil when the code is not in the database.
  #
  # Example:
  #
  #   g = CBETA::Gaiji.new
  #   g.zhuyin("CB00023") # return [ "ㄍㄢˇ", "ㄍㄢ", "ㄧㄤˊ", "ㄇㄧˇ", "ㄇㄧㄝ", "ㄒㄧㄤˊ" ]
  def zhuyin(cb)
    @gaijis.key?(cb) ? @gaijis[cb]['zhuyin'] : nil
  end
end
|
@@ -0,0 +1,151 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
# 將 CBETA HTML 轉為 純文字(含行首資訊)
|
5
|
+
#
|
6
|
+
# example:
|
7
|
+
#
|
8
|
+
# h2t = CBETA::HTMLToText.new('/temp/cbeta-html', '/temp/cbeta-text')
|
9
|
+
# h2t.convert("T01") # 轉換大正藏第一冊
|
10
|
+
class CBETA::HTMLToText
|
11
|
+
# html_root:: path of the source HTML tree
# out_root:: path that receives the generated text files
def initialize(html_root, out_root)
  @html_root, @out_root = html_root, out_root
end
|
17
|
+
|
18
|
+
# Converts one volume. For example, to convert Taisho volume 1:
#
#   convert("T01")
#
# The volume id is upper-cased; its first character is used as the corpus id.
def convert(arg)
  volume = arg.upcase
  @dirty = false
  @vol = volume
  @corpus = volume[0]
  handle_vol
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# Concatenates the text produced by handle_node for every child of +e+
# and strips all space characters from the result.
def traverse(e)
  joined = e.children.inject('') { |acc, child| acc + handle_node(child) }
  joined.gsub(' ', '')
end
|
37
|
+
|
38
|
+
# Returns the content of text node +e+ with newlines and the listed
# punctuation marks removed; an empty (after chomp) node yields ''.
def handle_text(e)
  text = e.content.chomp
  if text.empty?
    ''
  else
    text.gsub(/[\n,、—!。:「]/, '')
  end
end
|
43
|
+
|
44
|
+
# Converts a <span> element to text according to its class attribute.
def handle_span(e)
  case e['class']
  when 'doube-line-note'
    # inline note: make sure it is wrapped in parentheses
    note = traverse(e)
    note.start_with?('(') ? note : "(#{note})"
  when 'lb'
    # every line head except the first one of a file starts a new line
    prefix = @dirty ? "\n" : ''
    @dirty = true
    # line head info: T05n0220a becomes T05n0220
    prefix + e['id'].sub(/^(T0\dn0220)[a-z](.*)$/, '\1\2') + '║'
  when 'lineInfo', 'star'
    ''
  when 'ranja', 'siddam'
    '【◇】'
  else
    traverse(e)
  end
end
|
72
|
+
|
73
|
+
# Dispatches one HTML node to the matching handler and returns its text.
# Comments and <head> produce nothing; the div with id "back" is skipped.
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  case e.name
  when 'a'
    # a gaiji anchor is replaced by the entry collected in @gaiji
    e['class'] == 'gaijiAnchor' ? @gaiji[e['href'][1..-1]] : traverse(e)
  when 'div'
    e['id'] != 'back' ? traverse(e) : ''
  when 'head'
    ''
  when 'p'
    e['class'] == 'figure' ? '【圖】' : traverse(e)
  when 'span'
    handle_span(e)
  else
    traverse(e)
  end
end
|
103
|
+
|
104
|
+
# Recreates the (empty) output folder <out_root>/<corpus>/<vol> and
# returns its path. Any existing content of that folder is removed.
def prepare_folder
  File.join(@out_root, @corpus, @vol).tap do |dir|
    FileUtils.remove_dir(dir, true) # second argument is the force flag
    FileUtils.mkdir_p(dir)
  end
end
|
110
|
+
|
111
|
+
# Converts one HTML file (one juan) to text and appends the result to the
# text file of the sutra it belongs to.
#
# path:: path of the source HTML file
#
# Consecutive files that map to the same sutra id share one output file
# (@fo). When the sutra id changes, the previous output file is closed
# before the next one is opened, so no handle is leaked and buffered text
# is flushed.
def handle_file(path)
  # sutra id = file name without extension and without the "_<juan>" suffix
  sutra = File.basename(path, ".*")
  sutra.sub!(/^(.*)_.*$/, '\1')
  sutra.sub!(/(T\d\dn0220).*$/, '\1') # T0220: BM does not split into a, b, c...

  if sutra != @last_sutra
    @fo.close if @fo && !@fo.closed?
    txt_path = File.join(@folder_out, sutra + '.txt')
    puts "h2t #{txt_path}"
    @fo = File.open(txt_path, 'w')
    @last_sutra = sutra
    @dirty = false
  end

  # block form closes the input file even when parsing raises
  doc = File.open(path) { |f| Nokogiri::HTML(f) }

  # collect the zzs attribute of every gaiji referenced by this file
  @gaiji = {}
  doc.css("span.gaijiInfo").each { |e| @gaiji[e['id']] = e['zzs'] }

  text = traverse(doc.root)

  # Siddham characters: collapse runs of placeholders into a single one
  text.gsub!(/(\((【◇】)+\)|(【◇】)|【◇】)+/, '【◇】')

  @fo.write(text)
end
|
141
|
+
|
142
|
+
# Converts every HTML file of the current volume (@vol).
#
# Files are processed in sorted order: handle_file groups consecutive
# files of the same sutra into one text file, and the order of Dir[] is
# not guaranteed on older Ruby versions. The last output file opened by
# handle_file is closed when the volume is finished (or on error).
#
# Returns the (sorted) list of processed paths.
def handle_vol
  folder_in = File.join(@html_root, @corpus, @vol)
  @folder_out = prepare_folder
  @last_sutra = ''
  begin
    Dir["#{folder_in}/*"].sort.each { |f| handle_file(f) }
  ensure
    @fo.close if @fo && !@fo.closed?
  end
end
|
150
|
+
|
151
|
+
end
|
@@ -0,0 +1,664 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'date'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'json'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
# Elements whose content is never output.
# Frozen so the shared constant cannot be modified by accident.
PASS = ['back', 'teiHeader'].freeze

# Symbol shown when an edition has no character at a given position.
MISSING = '-'.freeze
|
13
|
+
|
14
|
+
# 處理 CBETA XML P5a
|
15
|
+
#
|
16
|
+
# CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
|
17
|
+
#
|
18
|
+
# 轉檔規則請參考: http://wiki.ddbc.edu.tw/pages/CBETA_XML_P5a_轉_HTML
|
19
|
+
class CBETA::P5aToHTML
|
20
|
+
|
21
|
+
# xml_root:: path of the source CBETA XML P5a tree
# out_root:: path that receives the generated HTML
def initialize(xml_root, out_root)
  @xml_root, @out_root = xml_root, out_root
  @gaijis = CBETA::Gaiji.new

  # load the Unicode 1.1 character list stored next to this source file
  list_path = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(list_path))
end
|
33
|
+
|
34
|
+
# Converts CBETA XML P5a to HTML.
#
# Example: convert Taisho volume 1
#
#   x2h = CBETA::P5aToHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T01')
#
# Example: convert the whole Taisho canon
#
#   x2h = CBETA::P5aToHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T')
#
# T is the ID of the Taisho canon; for the CBETA canon ID system see:
# http://www.cbeta.org/format/id.php
#
# arg:: nil converts every collection; a single letter converts one
#       collection; "V1..V2" converts a range of volumes; anything else is
#       treated as one volume id. Matching is case-insensitive.
#
# The argument is upper-cased into a new string, so the caller's string is
# left untouched and frozen strings are accepted.
def convert(arg=nil)
  return convert_all if arg.nil?

  arg = arg.upcase
  if arg.size == 1
    handle_collection(arg)
  elsif arg.include? '..'
    # an argument containing ".." that does not match the pattern is ignored
    arg.match(/^([^\.]+?)\.\.([^\.]+)$/) { |m| handle_vols(m[1], m[2]) }
  else
    handle_vol(arg)
  end
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
# Converts every collection found under @xml_root. Only entries whose
# name is a single upper-case letter are treated as collections.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
|
72
|
+
|
73
|
+
# Converts an <anchor> element.
#
# * id starting with "nkr_note_orig": appends a footnote span to the back
#   matter of the current juan and returns a note anchor link.
# * id starting with "fx": returns a star marker.
# * type "circle": returns '◎'.
# * anything else: returns ''.
#
# NOTE(review): @notes is not assigned anywhere in the visible source, so
# the "nkr_note_orig" branch presumably relies on state set up elsewhere --
# confirm.
def handle_anchor(e)
  if e.has_attribute?('id')
    anchor_id = e['id']
    if anchor_id.start_with?('nkr_note_orig')
      note_text = traverse(@notes[anchor_id])
      n = anchor_id[/^nkr_note_orig_(.*)$/, 1]
      @back[@juan] += "<span class='footnote' id='n#{n}'>#{note_text}</span>\n"
      return "<a class='noteAnchor' href='#n#{n}'></a>"
    end
    return "<span class='star'>[*]</span>" if anchor_id.start_with?('fx')
  end

  return '◎' if e.has_attribute?('type') && e['type'] == 'circle'

  ''
end
|
95
|
+
|
96
|
+
# Converts an <app> element: a "star" app gets a note anchor that points
# at the note named by its corresp attribute, followed by the converted
# children.
def handle_app(e)
  star = if e['type'] == 'star'
           "<a class='noteAnchor star' href='#n#{e['corresp'][1..-1]}'></a>"
         else
           ''
         end
  star + traverse(e)
end
|
104
|
+
|
105
|
+
# Wraps a <byline> in <p class="byline">, prefixed with the line info of
# the line that was current when the byline started.
def handle_byline(e)
  [
    '<p class="byline">',
    "<span class='lineInfo'>#{@lb}</span>", # evaluated before the children can move @lb
    traverse(e),
    '</p>'
  ].join
end
|
111
|
+
|
112
|
+
# Converts every volume folder of collection +c+ (e.g. "T").
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c
  ignored = ['.', '..', '.DS_Store']
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless ignored.include?(vol)
  end
end
|
121
|
+
|
122
|
+
# Converts a <corr> element. When it sits in a <choice> that also has a
# <sic>, a DILA note anchor is emitted and a footnote with the base-text
# reading (or MISSING when that reading is empty) is appended to the back
# matter. The corrected text itself is wrapped in <span class='cbeta'>.
def handle_corr(e)
  anchor = ''
  sic = e.parent.name == 'choice' ? e.parent.at_xpath('sic') : nil
  unless sic.nil?
    @dila_note += 1
    anchor = "<a class='noteAnchor dila' href='#dila_note#{@dila_note}'></a>"

    sic_text = traverse(sic, 'back')
    note = @orig + (sic_text.empty? ? MISSING : sic_text)
    @back[@juan] += "<span class='footnote_dila' id='dila_note#{@dila_note}'>#{note}</span>\n"
  end
  anchor + "<span class='cbeta'>%s</span>" % traverse(e)
end
|
142
|
+
|
143
|
+
# Converts a <div>. Divs with a type attribute become
# <div class='div-TYPE'> framed by begin/end comments and are tracked in
# @open_divs while their children are converted; untyped divs contribute
# only their children.
def handle_div(e)
  @div_count += 1
  n = @div_count # remember our own number; nested divs advance the counter
  return traverse(e) unless e.has_attribute? 'type'

  @open_divs << e
  inner = traverse(e)
  @open_divs.pop
  "<!-- begin div#{n}--><div class='div-#{e['type']}'>#{inner}</div><!-- end of div#{n} -->"
end
|
155
|
+
|
156
|
+
# Wraps the converted children of a <figure> in <p class='figure'>.
def handle_figure(e)
  inner = traverse(e)
  "<p class='figure'>#{inner}</p>"
end
|
159
|
+
|
160
|
+
# Converts a <g> (gaiji reference) element.
#
# Rules for the displayed form in HTML mode:
#   if the gaiji has a unicode mapping
#     if it is within Unicode 1.1: use the character directly
#     else: show the unicode character, but keep the gaiji info for a popup
#   else if it has a normal_unicode mapping: show that, keep the gaiji info
#   else if it has a normalized form: show that, keep the gaiji info
#   else: show the composition expression (zzs), keep the gaiji info
#
# In 'txt' mode the romanization (Siddham, "SD" codes) or the composition
# expression is returned instead. Aborts the program when the gaiji is
# unknown, or when 'txt' mode needs a composition expression that is absent.
def handle_g(e, mode)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
  zzs = g['zzs']

  if mode == 'txt'
    return g['roman'] if gid.start_with?('SD')
    abort "缺組字式:#{g}" if zzs.nil?
    return zzs
  end

  @char_count += 1

  if gid.start_with?('SD')
    # two Siddham codes stand for plain parentheses
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return "<span class='siddam' roman='#{g['roman']}' code='#{gid}' char='#{g['sd-char']}'/>"
  end

  if gid.start_with?('RJ')
    return "<span class='ranja' roman='#{g['roman']}' code='#{gid}' char='#{g['rj-char']}'/>"
  end

  shown = ''
  if g.has_key?('unicode')
    return g['unicode-char'] if @unicode1.include?(g['unicode']) # Unicode 1.1: use directly
    shown = g['unicode-char']
  end

  nor = ''
  if g.has_key?('normal_unicode')
    nor = g['normal_unicode']
    shown = nor if shown.empty?
  end

  if g.has_key?('normal')
    nor += ', ' unless nor == ''
    nor += g['normal']
    shown = g['normal'] if shown.empty?
  end

  shown = zzs if shown.empty?

  # register the gaiji info once per juan, keyed by its figure URL
  href = 'http://dict.cbeta.org/dict_word/gaiji-cb/%s/%s.gif' % [gid[2, 2], gid]
  unless @back[@juan].include?(href)
    @back[@juan] += "<span id='#{gid}' class='gaijiInfo' figure_url='#{href}' zzs='#{zzs}' nor='#{nor}'>#{shown}</span>\n"
  end
  "<a class='gaijiAnchor' href='##{gid}'>#{shown}</a>"
end
|
232
|
+
|
233
|
+
# Converts a <graphic> to an empty span that carries the base name of
# the image URL in its imgsrc attribute.
def handle_graphic(e)
  "<span imgsrc='#{File.basename(e['url'])}' class='graphic'></span>"
end
|
237
|
+
|
238
|
+
# Converts a <head> to <p class='head'>; heads of type "added" are
# dropped (their children are not converted at all).
def handle_head(e)
  return '' if e['type'] == 'added'

  "<p class='head'>#{traverse(e)}</p>"
end
|
245
|
+
|
246
|
+
# Converts a list <item> to an <li> followed by a newline.
def handle_item(e)
  "<li>#{traverse(e)}</li>\n"
end
|
249
|
+
|
250
|
+
# Wraps the converted children of a <juan> element in <p class='juan'>.
def handle_juan(e)
  "<p class='juan'>#{traverse(e)}</p>"
end
|
253
|
+
|
254
|
+
# Converts one verse line <l>.
#
# Inside an "abnormal" lg only the children are output. Otherwise the
# line becomes a <div class='lg-cell'>; the first cell of an lg takes over
# the text-indent declaration of the lg's rend attribute, and a new
# <div class='lg-row'> is opened in front of the cell when no row is open.
def handle_l(e)
  return traverse(e) if @lg_type == 'abnormal'

  @in_l = true # tells handle_lb not to close the row while inside this line

  doc = Nokogiri::XML::Document.new
  cell = doc.create_element('div')
  cell['class'] = 'lg-cell'
  cell.inner_html = traverse(e)

  if @first_l
    lg = e.parent()
    if lg.has_attribute?('rend')
      indents = lg['rend'].scan(/text-indent:[^:]*/)
      cell['style'] = indents[0] unless indents.empty?
    end
    @first_l = false
  end

  html = cell.to_s
  unless @lg_row_open
    html = "\n<div class='lg-row'>" + html
    @lg_row_open = true
  end

  @in_l = false
  html
end
|
285
|
+
|
286
|
+
# Converts a line break <lb>: resets the per-line character counter,
# records the line number in @lb and emits the line-head span.
#
# Each verse line lives in its own lg-row, so an open row is closed here --
# unless the lb is inside an <l> (e.g. T46n1937 p. 914a01, where an <l>
# contains an inline note spanning lines; T20n1092 337c16, where the lb
# is in the middle of an <l>).
def handle_lb(e)
  @char_count = 1
  @lb = e['n']
  line_head = @sutra_no + '_p' + e['n']

  row_end = ''
  if @lg_row_open && !@in_l
    row_end = "</div><!-- end of lg-row -->"
    @lg_row_open = false
  end
  row_end + "<span class='lb' \nid='#{line_head}'>#{line_head}</span>"
end
|
301
|
+
|
302
|
+
# Converts a <lem>. When the lemma is witnessed by CBETA but not by the
# base text (@orig), a DILA note anchor is emitted, the text is wrapped in
# <span class='cbeta'> and a footnote built from lem_note_cf and
# lem_note_rdg is appended to the back matter of the current juan.
# Otherwise only the converted children are returned.
def handle_lem(e)
  wit = e['wit']
  return traverse(e) unless wit.include?('CBETA') && !wit.include?(@orig)

  @dila_note += 1
  html = "<a class='noteAnchor dila' href='#dila_note#{@dila_note}'></a>"
  html += "<span class='cbeta'>%s</span>" % traverse(e)

  note = lem_note_cf(e)
  note += lem_note_rdg(e)
  @back[@juan] += "<span class='footnote_dila' id='dila_note#{@dila_note}'>#{note}</span>\n"
  html
end
|
318
|
+
|
319
|
+
# Converts a verse group <lg>.
#
# An "abnormal" lg becomes a plain <p class='lg-abnormal'>. Any other lg
# becomes a <div class='lg'> whose style is the lg's rend attribute minus
# its text-indent declaration; a row still open after the children were
# converted gets a closing tag appended.
def handle_lg(e)
  @lg_type = e['type']
  if @lg_type == 'abnormal'
    return "<p class='lg-abnormal'>" + traverse(e) + "</p>"
  end

  @first_l = true
  doc = Nokogiri::XML::Document.new
  node = doc.create_element('div')
  node['class'] = 'lg'
  node['style'] = e['rend'].gsub(/text-indent:[^:]*/, '') if e.has_attribute?('rend')

  @lg_row_open = false
  node.inner_html = traverse(e)
  if @lg_row_open
    node.inner_html += '</div><!-- end of lg -->'
    @lg_row_open = false
  end
  "\n" + node.to_s
end
|
343
|
+
|
344
|
+
# Wraps the converted children of a <list> in <ul>.
def handle_list(e)
  "<ul>#{traverse(e)}</ul>"
end
|
347
|
+
|
348
|
+
# Converts a <milestone>. Only unit="juan" milestones produce output: a
# "<juan N>" marker that handle_sutra later uses to split the text into
# one file per juan.
#
# Divs that span the juan boundary (ex: T55n2154, p. 680a29, spanning
# juans 19 and 20) are closed before the marker and reopened after it.
# The reopened divs use the same "div-TYPE" class that handle_div emits,
# so both halves of a spanning div carry the same class.
#
# The back matter of the new juan starts from what was collected under
# juan 0.
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  r = "</div>" * @open_divs.size
  @juan = e['n'].to_i
  @back[@juan] = @back[0]
  r += "<juan #{@juan}>"
  @open_divs.each { |d| r += "<div class='div-#{d['type']}'>" }
  r
end
|
362
|
+
|
363
|
+
# Converts a <mulu> (table of contents entry). Only entries of type "品"
# produce output: an empty <mulu class='pin'> element whose s attribute is
# the entry text rendered in 'txt' mode. Other types yield ''.
def handle_mulu(e)
  return '' unless e['type'] == '品'

  @pass << false
  title = traverse(e, 'txt')
  @pass.pop
  "<mulu class='pin' s='#{title}'/>"
end
|
372
|
+
|
373
|
+
# Dispatches one XML node to its handler and returns the produced string.
#
# Comments and the elements listed in PASS produce nothing; text nodes go
# to handle_text; foreign/rdg/reg/sic are suppressed; <g> additionally
# receives the mode; the other known elements go to their handle_<name>
# method; anything else contributes its converted children.
def handle_node(e, mode)
  return '' if e.comment?
  return handle_text(e, mode) if e.text?
  return '' if PASS.include?(e.name)

  case e.name
  when 'foreign', 'rdg', 'reg', 'sic'
    ''
  when 'g'
    handle_g(e, mode)
  when 'anchor', 'app', 'byline', 'corr', 'div', 'figure', 'graphic',
       'head', 'item', 'juan', 'l', 'lb', 'lem', 'lg', 'list', 'mulu',
       'note', 'milestone', 'p', 'sg', 't'
    send("handle_#{e.name}", e)
  else
    traverse(e)
  end
end
|
408
|
+
|
409
|
+
|
410
|
+
# Converts a <note>.
#
# * type "equivalent", "rest" or starting with "cf": no output.
# * type "orig" / "mod": the note text is appended to the back matter of
#   the current juan (class footnote_orig / footnote_cb) and a note anchor
#   is returned -- except that an "orig" note whose number also has a
#   "mod" note returns no anchor.
# * notes whose resp starts with "CBETA": no output.
# * notes with place="inline" are wrapped in a double-line-note span;
#   all remaining notes contribute just their converted children.
def handle_note(e)
  n = e['n']
  if e.has_attribute?('type')
    t = e['type']
    return '' if t == 'equivalent' || t == 'rest'

    if t == 'orig' || t == 'mod'
      @pass << false
      s = traverse(e)
      @pass.pop
      css = t == 'orig' ? 'footnote_orig' : 'footnote_cb'
      @back[@juan] += "<span class='#{css}' id='n#{n}'>#{s}</span>\n"
      return '' if t == 'orig' && @mod_notes.include?(n)
      return "<a class='noteAnchor' href='#n#{n}'></a>"
    end

    return '' if t.start_with?('cf')
  end

  return '' if e.has_attribute?('resp') && e['resp'].start_with?('CBETA')

  inline = e.has_attribute?('place') && e['place'] == 'inline'
  inner = traverse(e)
  inline ? "<span class='doube-line-note'>#{inner}</span>" : inner
end
|
452
|
+
|
453
|
+
# Converts a <p>: the paragraph starts with a lineInfo span holding the
# line number that was current when the paragraph began.
def handle_p(e)
  [
    '<p>',
    "<span class='lineInfo'>#{@lb}</span>", # evaluated before the children can move @lb
    traverse(e),
    '</p>'
  ].join
end
|
459
|
+
|
460
|
+
# Wraps the converted children of an <sg> element in parentheses.
def handle_sg(e)
  "(#{traverse(e)})"
end
|
463
|
+
|
464
|
+
# Converts one sutra XML file and writes one HTML file per juan into
# @out_folder (named <sutra_no>_<juan, 3 digits>.htm).
#
# All per-sutra state is reset first. The converted text is split at the
# "<juan N>" markers; text in front of the first marker is buffered and
# written at the start of the first juan file. Each file ends with the
# back matter collected for its juan.
def handle_sutra(xml_fn)
  puts "handle sutra #{xml_fn}"
  @back = { 0 => '' }
  @char_count = 1
  @dila_note = 0
  @div_count = 0
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # move note anchors inside the lg-cell; otherwise rendering the lg as a
  # table breaks
  text.gsub!(/(<a class='noteAnchor'[^>]*><\/a>)(<div class="lg-cell"[^>]*>)/, '\2\1')

  out = nil
  juan_no = nil
  started = false
  pending = ''
  # one file per juan
  text.split(/(<juan \d+>)/).each do |piece|
    if piece =~ /<juan (\d+)>$/
      juan_no = $1.to_i
      fn = "#{@sutra_no}_%03d.htm" % juan_no
      out = File.open(File.join(@out_folder, fn), 'w')
      started = true
      header = <<eos
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="filename" content="#{fn}" />
<title>#{@title}</title>
</head>
<body>
<!--
來源 XML CBETA P5a: https://github.com/cbeta-org/xml-p5a.git
轉檔程式: Dropbox/DILA-DA/cbeta-html/bin/x2h.rb Version #{Date.today}
說明文件: http://wiki.ddbc.edu.tw/pages/CBETA_XML_P5a_%E8%BD%89_HTML
-->
<div id='body'>
eos
      out.write(header)
      out.write(pending)
      pending = ''
    elsif started
      out.write(piece + "\n</div><!-- end of div[@id='body'] -->\n")
      out.write("<div id='back'>\n" + @back[juan_no] + "</div>\n")
      out.write('</body></html>')
      out.close
    else
      pending = piece
    end
  end
end
|
524
|
+
|
525
|
+
# Converts a <t> element; elements whose place attribute mentions "foot"
# are dropped.
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  traverse(e)
end
|
531
|
+
|
532
|
+
# Converts a text node.
#
# e::    the text node
# mode:: 'html' wraps body text in a span; other modes return the bare
#        (HTML-escaped) text
#
# Returns '' for empty text and for text that is a direct child of <app>.
#
# In 'html' mode, text in the body area (@pass.last true) is wrapped in
# <span class='t'> carrying the current line number (l) and the character
# position within the line (w). @char_count then advances by the number of
# characters of the text itself -- not by the length of the generated
# markup or of the HTML-escaped form -- so that w stays a character
# offset within the line.
def handle_text(e, mode)
  s = e.content().chomp
  return '' if s.empty?
  return '' if e.parent.name == 'app'

  # CBETA XML has superfluous line breaks inside text
  plain = s.gsub(/[\n\r]/, '')

  # escape HTML special characters (e.g. & becomes &amp;)
  r = CGI.escapeHTML(plain)

  # text in the body area is wrapped in a span
  if @pass.last && mode == 'html'
    r = "<span class='t' l='#{@lb}' w='#{@char_count}'>#{r}</span>"
    @char_count += plain.size
  end
  r
end
|
550
|
+
|
551
|
+
# Converts every sutra file of one volume (e.g. "T01").
#
# Only Taisho volumes (ids starting with "T") are supported; any other
# volume aborts the program. The output folder <out_root>/<series>/<vol>
# is emptied and recreated before the conversion starts.
def handle_vol(vol)
  puts 'x2h ' + vol
  abort "未處理底本" unless vol.start_with? 'T'
  @orig = "【大】"

  @vol = vol
  @series = vol[0]
  @out_folder = File.join(@out_root, @series, vol)
  FileUtils.remove_dir(@out_folder, true) # second argument is the force flag
  FileUtils.mkdir_p(@out_folder)

  source = File.join(@xml_root, @series, vol)
  Dir[source+"/*"].each { |xml_path| handle_sutra(xml_path) }
end
|
569
|
+
|
570
|
+
# Converts every volume of a collection whose id lies between +v1+ and
# +v2+ (inclusive, compared as strings), e.g. handle_vols('T01', 'T03').
#
# The collection folder is looked up under @xml_root.
def handle_vols(v1, v2)
  @series = v1[0]
  folder = File.join(@xml_root, @series)
  Dir.foreach(folder) { |vol|
    next if vol < v1
    next if vol > v2
    handle_vol(vol)
  }
end
|
579
|
+
|
580
|
+
# Builds the "revision based on" part of a DILA footnote from the cf
# notes that are direct children of +e+.
#
# ex: T32n1670A.xml, p. 703a16
#   <note type="cf1">K30n1002_p0257a01-a23</note>
#
# Returns '' when there is no such note.
def lem_note_cf(e)
  cf_notes = e.xpath('./note').select do |n|
    n.key?('type') && n['type'].start_with?('cf')
  end
  return '' if cf_notes.empty?

  '修訂依據:' + cf_notes.map { |n| n.content }.join(';') + '。'
end
|
595
|
+
|
596
|
+
# Builds the part of a DILA footnote that quotes the base-text reading:
# for every <rdg> sibling of +lem+ witnessed by the base text (@orig), the
# witness siglum followed by the reading (or MISSING when the reading is
# empty). A closing '。' is added when anything was collected.
def lem_note_rdg(lem)
  app = lem.parent
  @pass << false
  note = app.xpath('rdg').inject('') do |acc, rdg|
    next acc unless rdg['wit'].include? @orig

    reading = traverse(rdg, 'back')
    acc + @orig + (reading.empty? ? MISSING : reading)
  end
  @pass.pop
  note.empty? ? note : note + '。'
end
|
611
|
+
|
612
|
+
# Reads an XML file, applies two textual fix-ups and returns the parsed
# document with all namespaces removed.
def open_xml(fn)
  xml = File.read(fn)

  if fn.include? 'T16n0657'
    # Here an inline (double-line) note spans two verse lines. Move the lb
    # in front of the end of the note, so that the lg-row ends first and
    # the inline note ends afterwards.
    xml.sub!(/(<\/note>)(\n<lb n="0206b29" ed="T"\/>)/, '\2\1')
  end

  # an lb (optionally preceded by a pb) directly in front of
  # <milestone unit="juan"> belongs to the new juan
  xml.gsub!(/((?:<pb [^>]+>\n?)?<lb [^>]+>\n?)(<milestone [^>]*unit="juan"\/>)/, '\2\1')

  Nokogiri::XML(xml).tap { |doc| doc.remove_namespaces!() }
end
|
629
|
+
|
630
|
+
# Records the n attribute of every <note type="mod"> of +doc+ in
# @mod_notes.
def read_mod_notes(doc)
  doc.xpath("//note[@type='mod']").each do |note|
    @mod_notes.add(note['n'])
  end
end
|
635
|
+
|
636
|
+
# Parses one sutra XML file and returns the converted HTML of its body.
#
# Side effects: sets @title to the last whitespace-separated word of the
# titleStmt title, fills @mod_notes, and leaves @pass at [true] for the
# body conversion (it is [false] while the title is read).
def parse_xml(xml_fn)
  @pass = [false]

  doc = open_xml(xml_fn)

  title_node = doc.xpath("//titleStmt/title")[0]
  @title = traverse(title_node, 'txt').split()[-1]

  read_mod_notes(doc)

  body = doc.root().xpath("text/body")[0]
  @pass = [true]
  traverse(body)
end
|
654
|
+
|
655
|
+
# Converts every child of +e+ with handle_node (in document order) and
# returns the concatenated result.
#
# mode:: passed through to handle_node; 'html' by default
def traverse(e, mode='html')
  e.children.inject('') { |out, child| out + handle_node(child, mode) }
end
|
663
|
+
|
664
|
+
end
|