cbeta 0.6.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +16 -2
- data/lib/cbeta/gaiji.rb +56 -1
- data/lib/cbeta/p5a_to_epub.rb +34 -12
- data/lib/cbeta/p5a_to_text.rb +10 -2
- data/lib/data/gaiji.json +74 -28
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e7b3afd9fe14f3c71a3f519aa0a15dc5b2e198ab
|
4
|
+
data.tar.gz: 898fddd049d4f076edb446c6f2751a10e41e248e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bd71006daf93b24dc46bd1afe004d1726cc6ef0b6de6159357ec01a3c255e732642f551037ffc8dc05be7e2997599bbb2ca015dcbc05bc497c029c903a0fca3c
|
7
|
+
data.tar.gz: f860ffec038a6924de638861a46e94c8bcb325a8114fb00aad19f898036c2722657229e4b25f83ac8b19143f65b100212041f4e51a95c3a208f632e163f7c276
|
data/lib/cbeta.rb
CHANGED
@@ -52,11 +52,25 @@ class CBETA
|
|
52
52
|
#
|
53
53
|
# @example
|
54
54
|
# cbeta = CBETA.new
|
55
|
-
# cbeta.
|
56
|
-
def
|
55
|
+
# cbeta.get_canon_symbol('T') # return "【大】"
|
56
|
+
def get_canon_symbol(id)
|
57
57
|
return nil unless @canon_abbr.key? id
|
58
58
|
@canon_abbr[id]
|
59
59
|
end
|
60
|
+
|
61
|
+
# 取得藏經略名
|
62
|
+
#
|
63
|
+
# @param id [String] 藏經 ID, 例如大正藏的 ID 是 "T"
|
64
|
+
# @return [String] 藏經短名,例如 "大"
|
65
|
+
#
|
66
|
+
# @example
|
67
|
+
# cbeta = CBETA.new
|
68
|
+
# cbeta.get_canon_abbr('T') # return "大"
|
69
|
+
def get_canon_abbr(id)
|
70
|
+
r = get_canon_symbol(id)
|
71
|
+
return nil if r.nil?
|
72
|
+
r.sub(/^【(.*?)】$/, '\1')
|
73
|
+
end
|
60
74
|
end
|
61
75
|
|
62
76
|
require 'cbeta/gaiji'
|
data/lib/cbeta/gaiji.rb
CHANGED
@@ -37,10 +37,65 @@ class CBETA::Gaiji
|
|
37
37
|
# @return [Array<String>]
|
38
38
|
#
|
39
39
|
# @example
|
40
|
-
# g =
|
40
|
+
# g = CBETA::Gaiji.new
|
41
41
|
# g.zhuyin("CB00023") # return [ "ㄍㄢˇ", "ㄍㄢ", "ㄧㄤˊ", "ㄇㄧˇ", "ㄇㄧㄝ", "ㄒㄧㄤˊ" ]
|
42
42
|
def zhuyin(cb)
|
43
43
|
return nil unless @gaijis.key? cb
|
44
44
|
@gaijis[cb]['zhuyin']
|
45
45
|
end
|
46
|
+
|
47
|
+
# 讀 XML P5 檔頭的缺字資料,更新現有缺字資料,輸出 JSON
|
48
|
+
def update_from_p5(p5_folder, output_json_filename)
|
49
|
+
update_from_p5_folder(p5_folder)
|
50
|
+
s = JSON.pretty_generate(@gaijis)
|
51
|
+
File.write(output_json_filename, s)
|
52
|
+
end
|
53
|
+
|
54
|
+
private
|
55
|
+
def char_to_hash(char)
|
56
|
+
r = {}
|
57
|
+
id = char['id']
|
58
|
+
char.xpath('charProp').each do |e|
|
59
|
+
prop = e.at('localName').text
|
60
|
+
case prop
|
61
|
+
when 'composition'
|
62
|
+
r['zzs'] = e.at('value').text
|
63
|
+
when 'normalized form'
|
64
|
+
r['normal'] = e.at('value').text
|
65
|
+
else
|
66
|
+
puts "未處理 charProp/localName: #{prop}"
|
67
|
+
end
|
68
|
+
end
|
69
|
+
char.xpath('mapping').each do |e|
|
70
|
+
case e['type']
|
71
|
+
when 'unicode'
|
72
|
+
u = e.text[2..-1]
|
73
|
+
r['unicode'] = u
|
74
|
+
r['unicode-char'] = [u.hex].pack('U')
|
75
|
+
end
|
76
|
+
end
|
77
|
+
r
|
78
|
+
end
|
79
|
+
|
80
|
+
def update_from_p5_file(fn)
|
81
|
+
f = File.open(fn)
|
82
|
+
doc = Nokogiri::XML(f)
|
83
|
+
f.close
|
84
|
+
doc.remove_namespaces!()
|
85
|
+
doc.xpath("//charDecl/char").each do |char|
|
86
|
+
@gaijis[char['id']] = char_to_hash(char)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
def update_from_p5_folder(folder)
|
91
|
+
Dir.entries(folder).each do |f|
|
92
|
+
path = File.join(folder, f)
|
93
|
+
next if f.start_with? '.'
|
94
|
+
if Dir.exist? path
|
95
|
+
update_from_p5_folder path
|
96
|
+
else
|
97
|
+
update_from_p5_file path
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
46
101
|
end
|
data/lib/cbeta/p5a_to_epub.rb
CHANGED
@@ -323,14 +323,6 @@ eos
|
|
323
323
|
abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
|
324
324
|
zzs = g['zzs']
|
325
325
|
|
326
|
-
if mode == 'txt'
|
327
|
-
return g['roman'] if gid.start_with?('SD')
|
328
|
-
if zzs.nil?
|
329
|
-
abort "缺組字式:#{g}"
|
330
|
-
else
|
331
|
-
return zzs
|
332
|
-
end
|
333
|
-
end
|
334
326
|
|
335
327
|
if gid.start_with?('SD')
|
336
328
|
case gid
|
@@ -339,14 +331,43 @@ eos
|
|
339
331
|
when 'SD-E35B'
|
340
332
|
return ')'
|
341
333
|
else
|
342
|
-
return g['roman']
|
334
|
+
return g['roman'] if g.key? 'roman'
|
335
|
+
|
336
|
+
if mode == 'txt'
|
337
|
+
puts "警告:純文字模式出現悉曇字:#{gid}"
|
338
|
+
return gid
|
339
|
+
else
|
340
|
+
# 如果沒有羅馬轉寫就顯示圖檔
|
341
|
+
src = File.join(@settings[:graphic_base], 'sd-gif', gid[3..4], gid+'.gif')
|
342
|
+
basename = File.basename(src)
|
343
|
+
dest = File.join(@temp_folder, 'img', basename)
|
344
|
+
FileUtils.copy(src, dest)
|
345
|
+
return "<img src='../img/#{basename}' />"
|
346
|
+
end
|
343
347
|
end
|
344
348
|
end
|
345
349
|
|
346
350
|
if gid.start_with?('RJ')
|
347
|
-
return g['roman']
|
351
|
+
return g['roman'] if g.key? 'roman'
|
352
|
+
|
353
|
+
if mode == 'txt'
|
354
|
+
puts "警告:純文字模式出現蘭札體:#{gid}"
|
355
|
+
return gid
|
356
|
+
else
|
357
|
+
# 如果沒有羅馬轉寫就顯示圖檔
|
358
|
+
src = File.join(@settings[:graphic_base], 'rj-gif', gid[3..4], gid+'.gif')
|
359
|
+
basename = File.basename(src)
|
360
|
+
dest = File.join(@temp_folder, 'img', basename)
|
361
|
+
FileUtils.copy(src, dest)
|
362
|
+
return "<img src='../img/#{basename}' />"
|
363
|
+
end
|
348
364
|
end
|
349
|
-
|
365
|
+
|
366
|
+
if mode == 'txt'
|
367
|
+
abort "缺組字式:#{g}" if zzs.nil?
|
368
|
+
return zzs
|
369
|
+
end
|
370
|
+
|
350
371
|
default = ''
|
351
372
|
if g.has_key?('unicode')
|
352
373
|
if @unicode1.include?(g['unicode'])
|
@@ -359,7 +380,7 @@ eos
|
|
359
380
|
|
360
381
|
def handle_graphic(e)
|
361
382
|
url = e['url']
|
362
|
-
url.sub!(/^.*figures
|
383
|
+
url.sub!(/^.*(figures\/.*)$/, '\1')
|
363
384
|
|
364
385
|
src = File.join(@settings[:graphic_base], url)
|
365
386
|
basename = File.basename(src)
|
@@ -516,6 +537,7 @@ eos
|
|
516
537
|
return '' if e.comment?
|
517
538
|
return handle_text(e, mode) if e.text?
|
518
539
|
return '' if PASS.include?(e.name)
|
540
|
+
|
519
541
|
r = case e.name
|
520
542
|
when 'anchor' then handle_anchor(e)
|
521
543
|
when 'app' then handle_app(e)
|
data/lib/cbeta/p5a_to_text.rb
CHANGED
@@ -15,6 +15,10 @@ require 'set'
|
|
15
15
|
# c.convert('T01')
|
16
16
|
#
|
17
17
|
class CBETA::P5aToText
|
18
|
+
# 內容不輸出的元素
|
19
|
+
PASS=['back', 'teiHeader']
|
20
|
+
|
21
|
+
private_constant :PASS
|
18
22
|
|
19
23
|
# @param xml_root [String] 來源 CBETA XML P5a 路徑
|
20
24
|
# @param output_root [String] 輸出 Text 路徑
|
@@ -444,7 +448,7 @@ class CBETA::P5aToText
|
|
444
448
|
def handle_vol(vol)
|
445
449
|
puts "convert volumn: #{vol}"
|
446
450
|
|
447
|
-
@orig = @cbeta.
|
451
|
+
@orig = @cbeta.get_canon_symbol(vol[0])
|
448
452
|
abort "未處理底本" if @orig.nil?
|
449
453
|
|
450
454
|
@vol = vol
|
@@ -508,7 +512,11 @@ class CBETA::P5aToText
|
|
508
512
|
text = frag.content
|
509
513
|
text = appify(text) if @format == 'app'
|
510
514
|
|
511
|
-
|
515
|
+
if ed == @orig
|
516
|
+
fn = "#{ed}-orig.txt"
|
517
|
+
else
|
518
|
+
fn = "#{ed}.txt"
|
519
|
+
end
|
512
520
|
output_path = File.join(folder, fn)
|
513
521
|
File.write(output_path, text)
|
514
522
|
end
|
data/lib/data/gaiji.json
CHANGED
@@ -50,10 +50,7 @@
|
|
50
50
|
"CB00178": {
|
51
51
|
"zzs": "[木*奈]",
|
52
52
|
"unicode": "3B88",
|
53
|
-
"unicode-char": "㮈"
|
54
|
-
"zhuyin": [
|
55
|
-
"ㄋㄞˋ"
|
56
|
-
]
|
53
|
+
"unicode-char": "㮈"
|
57
54
|
},
|
58
55
|
"CB00238": {
|
59
56
|
"zzs": "[打-丁+毛]",
|
@@ -107,10 +104,7 @@
|
|
107
104
|
"zzs": "[馬*犬]",
|
108
105
|
"normal": "馱",
|
109
106
|
"unicode": "4B7E",
|
110
|
-
"unicode-char": "䭾"
|
111
|
-
"zhuyin": [
|
112
|
-
"ㄊㄨㄛˊ"
|
113
|
-
]
|
107
|
+
"unicode-char": "䭾"
|
114
108
|
},
|
115
109
|
"CB00509": {
|
116
110
|
"zzs": "[商/衣]",
|
@@ -2933,8 +2927,7 @@
|
|
2933
2927
|
},
|
2934
2928
|
"CB04775": {
|
2935
2929
|
"zzs": "[柷-口+登]",
|
2936
|
-
"normal": "凳"
|
2937
|
-
"normal_unicode": "櫈"
|
2930
|
+
"normal": "凳"
|
2938
2931
|
},
|
2939
2932
|
"CB00144": {
|
2940
2933
|
"zzs": "[少/免]",
|
@@ -3403,10 +3396,7 @@
|
|
3403
3396
|
"CB05105": {
|
3404
3397
|
"zzs": "[契-大+石]",
|
3405
3398
|
"unicode": "40AE",
|
3406
|
-
"unicode-char": "䂮"
|
3407
|
-
"zhuyin": [
|
3408
|
-
"ㄌㄩㄝˋ"
|
3409
|
-
]
|
3399
|
+
"unicode-char": "䂮"
|
3410
3400
|
},
|
3411
3401
|
"SD-CFC5": {
|
3412
3402
|
"sd-char": "狣",
|
@@ -42271,10 +42261,7 @@
|
|
42271
42261
|
"CB06535": {
|
42272
42262
|
"zzs": "[自/本]",
|
42273
42263
|
"unicode": "2690E",
|
42274
|
-
"unicode-char": "𦤎"
|
42275
|
-
"zhuyin": [
|
42276
|
-
"ㄍㄠ"
|
42277
|
-
]
|
42264
|
+
"unicode-char": "𦤎"
|
42278
42265
|
},
|
42279
42266
|
"CB06952": {
|
42280
42267
|
"zzs": "[狂-王+羊]",
|
@@ -45192,11 +45179,7 @@
|
|
45192
45179
|
"CB05711": {
|
45193
45180
|
"zzs": "[颱-台+日]",
|
45194
45181
|
"unicode": "4AFB",
|
45195
|
-
"unicode-char": "䫻"
|
45196
|
-
"zhuyin": [
|
45197
|
-
"ㄒㄩㄝˊ",
|
45198
|
-
"ㄩˋ"
|
45199
|
-
]
|
45182
|
+
"unicode-char": "䫻"
|
45200
45183
|
},
|
45201
45184
|
"CB15400": {
|
45202
45185
|
"zzs": "[身*國]",
|
@@ -49191,11 +49174,7 @@
|
|
49191
49174
|
"zzs": "[仁-二+嶲]",
|
49192
49175
|
"normal": "俊",
|
49193
49176
|
"unicode": "349E",
|
49194
|
-
"unicode-char": "㒞"
|
49195
|
-
"zhuyin": [
|
49196
|
-
"ㄐㄩㄣˋ",
|
49197
|
-
"ㄎㄜˇ"
|
49198
|
-
]
|
49177
|
+
"unicode-char": "㒞"
|
49199
49178
|
},
|
49200
49179
|
"CB13910": {
|
49201
49180
|
"zzs": "[打-丁+閵]"
|
@@ -126184,5 +126163,72 @@
|
|
126184
126163
|
"ㄅㄧㄣˇ",
|
126185
126164
|
"ㄌㄧㄣˇ"
|
126186
126165
|
]
|
126166
|
+
},
|
126167
|
+
"CB32783": {
|
126168
|
+
"zzs": "[(糸*子)/心]"
|
126169
|
+
},
|
126170
|
+
"CB32784": {
|
126171
|
+
"zzs": "[受-又+(撤-育)]"
|
126172
|
+
},
|
126173
|
+
"CB32785": {
|
126174
|
+
"zzs": "[烈-列+((白/匕)*旡)]"
|
126175
|
+
},
|
126176
|
+
"CB32781": {
|
126177
|
+
"zzs": "[虫*雷]",
|
126178
|
+
"unicode": "274BD",
|
126179
|
+
"unicode-char": "𧒽"
|
126180
|
+
},
|
126181
|
+
"CB32792": {
|
126182
|
+
"zzs": "[番*韭]",
|
126183
|
+
"unicode": "2940F",
|
126184
|
+
"unicode-char": "𩐏"
|
126185
|
+
},
|
126186
|
+
"CB32793": {
|
126187
|
+
"zzs": "[米*(產-文+(立-一))]"
|
126188
|
+
},
|
126189
|
+
"CB32821": {
|
126190
|
+
"zzs": "[入/耳]",
|
126191
|
+
"normal": "聞",
|
126192
|
+
"unicode": "26535",
|
126193
|
+
"unicode-char": "𦔵"
|
126194
|
+
},
|
126195
|
+
"CB32825": {
|
126196
|
+
"zzs": "[舟*定]",
|
126197
|
+
"unicode": "26A58",
|
126198
|
+
"unicode-char": "𦩘"
|
126199
|
+
},
|
126200
|
+
"CB32828": {
|
126201
|
+
"zzs": "[打-丁+審]",
|
126202
|
+
"unicode": "22E19",
|
126203
|
+
"unicode-char": "𢸙"
|
126204
|
+
},
|
126205
|
+
"CB32832": {
|
126206
|
+
"zzs": "[卄/(袖-由+任)]"
|
126207
|
+
},
|
126208
|
+
"CB32830": {
|
126209
|
+
"zzs": "[烈-列+毛]",
|
126210
|
+
"unicode": "241AC",
|
126211
|
+
"unicode-char": "𤆬"
|
126212
|
+
},
|
126213
|
+
"CB32983": {
|
126214
|
+
"zzs": "[雨/如]",
|
126215
|
+
"unicode": "290B0",
|
126216
|
+
"unicode-char": "𩂰"
|
126217
|
+
},
|
126218
|
+
"CB32984": {
|
126219
|
+
"zzs": "[貝*昜]",
|
126220
|
+
"normal": "賜",
|
126221
|
+
"unicode": "27DBD",
|
126222
|
+
"unicode-char": "𧶽"
|
126223
|
+
},
|
126224
|
+
"CB32985": {
|
126225
|
+
"zzs": "[怡-台+志]",
|
126226
|
+
"unicode": "2267A",
|
126227
|
+
"unicode-char": "𢙺"
|
126228
|
+
},
|
126229
|
+
"CB32986": {
|
126230
|
+
"zzs": "[棣-木+王]",
|
126231
|
+
"unicode": "3ED6",
|
126232
|
+
"unicode-char": "㻖"
|
126187
126233
|
}
|
126188
126234
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-24 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|