cbeta 0.6.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +16 -2
- data/lib/cbeta/gaiji.rb +56 -1
- data/lib/cbeta/p5a_to_epub.rb +34 -12
- data/lib/cbeta/p5a_to_text.rb +10 -2
- data/lib/data/gaiji.json +74 -28
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e7b3afd9fe14f3c71a3f519aa0a15dc5b2e198ab
|
4
|
+
data.tar.gz: 898fddd049d4f076edb446c6f2751a10e41e248e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bd71006daf93b24dc46bd1afe004d1726cc6ef0b6de6159357ec01a3c255e732642f551037ffc8dc05be7e2997599bbb2ca015dcbc05bc497c029c903a0fca3c
|
7
|
+
data.tar.gz: f860ffec038a6924de638861a46e94c8bcb325a8114fb00aad19f898036c2722657229e4b25f83ac8b19143f65b100212041f4e51a95c3a208f632e163f7c276
|
data/lib/cbeta.rb
CHANGED
@@ -52,11 +52,25 @@ class CBETA
|
|
52
52
|
#
|
53
53
|
# @example
|
54
54
|
# cbeta = CBETA.new
|
55
|
-
# cbeta.
|
56
|
-
def
|
55
|
+
# cbeta.get_canon_symbol('T') # return "【大】"
|
56
|
+
def get_canon_symbol(id)
|
57
57
|
return nil unless @canon_abbr.key? id
|
58
58
|
@canon_abbr[id]
|
59
59
|
end
|
60
|
+
|
61
|
+
# 取得藏經略名
|
62
|
+
#
|
63
|
+
# @param id [String] 藏經 ID, 例如大正藏的 ID 是 "T"
|
64
|
+
# @return [String] 藏經短名,例如 "大"
|
65
|
+
#
|
66
|
+
# @example
|
67
|
+
# cbeta = CBETA.new
|
68
|
+
# cbeta.get_canon_abbr('T') # return "大"
|
69
|
+
def get_canon_abbr(id)
|
70
|
+
r = get_canon_symbol(id)
|
71
|
+
return nil if r.nil?
|
72
|
+
r.sub(/^【(.*?)】$/, '\1')
|
73
|
+
end
|
60
74
|
end
|
61
75
|
|
62
76
|
require 'cbeta/gaiji'
|
data/lib/cbeta/gaiji.rb
CHANGED
@@ -37,10 +37,65 @@ class CBETA::Gaiji
|
|
37
37
|
# @return [Array<String>]
|
38
38
|
#
|
39
39
|
# @example
|
40
|
-
# g =
|
40
|
+
# g = CBETA::Gaiji.new
|
41
41
|
# g.zhuyin("CB00023") # return [ "ㄍㄢˇ", "ㄍㄢ", "ㄧㄤˊ", "ㄇㄧˇ", "ㄇㄧㄝ", "ㄒㄧㄤˊ" ]
|
42
42
|
def zhuyin(cb)
|
43
43
|
return nil unless @gaijis.key? cb
|
44
44
|
@gaijis[cb]['zhuyin']
|
45
45
|
end
|
46
|
+
|
47
|
+
# 讀 XML P5 檔頭的缺字資料,更新現有缺字資料,輸出 JSON
|
48
|
+
def update_from_p5(p5_folder, output_json_filename)
|
49
|
+
update_from_p5_folder(p5_folder)
|
50
|
+
s = JSON.pretty_generate(@gaijis)
|
51
|
+
File.write(output_json_filename, s)
|
52
|
+
end
|
53
|
+
|
54
|
+
private
|
55
|
+
def char_to_hash(char)
|
56
|
+
r = {}
|
57
|
+
id = char['id']
|
58
|
+
char.xpath('charProp').each do |e|
|
59
|
+
prop = e.at('localName').text
|
60
|
+
case prop
|
61
|
+
when 'composition'
|
62
|
+
r['zzs'] = e.at('value').text
|
63
|
+
when 'normalized form'
|
64
|
+
r['normal'] = e.at('value').text
|
65
|
+
else
|
66
|
+
puts "未處理 charProp/localName: #{prop}"
|
67
|
+
end
|
68
|
+
end
|
69
|
+
char.xpath('mapping').each do |e|
|
70
|
+
case e['type']
|
71
|
+
when 'unicode'
|
72
|
+
u = e.text[2..-1]
|
73
|
+
r['unicode'] = u
|
74
|
+
r['unicode-char'] = [u.hex].pack('U')
|
75
|
+
end
|
76
|
+
end
|
77
|
+
r
|
78
|
+
end
|
79
|
+
|
80
|
+
def update_from_p5_file(fn)
|
81
|
+
f = File.open(fn)
|
82
|
+
doc = Nokogiri::XML(f)
|
83
|
+
f.close
|
84
|
+
doc.remove_namespaces!()
|
85
|
+
doc.xpath("//charDecl/char").each do |char|
|
86
|
+
@gaijis[char['id']] = char_to_hash(char)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
def update_from_p5_folder(folder)
|
91
|
+
Dir.entries(folder).each do |f|
|
92
|
+
path = File.join(folder, f)
|
93
|
+
next if f.start_with? '.'
|
94
|
+
if Dir.exist? path
|
95
|
+
update_from_p5_folder path
|
96
|
+
else
|
97
|
+
update_from_p5_file path
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
46
101
|
end
|
data/lib/cbeta/p5a_to_epub.rb
CHANGED
@@ -323,14 +323,6 @@ eos
|
|
323
323
|
abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
|
324
324
|
zzs = g['zzs']
|
325
325
|
|
326
|
-
if mode == 'txt'
|
327
|
-
return g['roman'] if gid.start_with?('SD')
|
328
|
-
if zzs.nil?
|
329
|
-
abort "缺組字式:#{g}"
|
330
|
-
else
|
331
|
-
return zzs
|
332
|
-
end
|
333
|
-
end
|
334
326
|
|
335
327
|
if gid.start_with?('SD')
|
336
328
|
case gid
|
@@ -339,14 +331,43 @@ eos
|
|
339
331
|
when 'SD-E35B'
|
340
332
|
return ')'
|
341
333
|
else
|
342
|
-
return g['roman']
|
334
|
+
return g['roman'] if g.key? 'roman'
|
335
|
+
|
336
|
+
if mode == 'txt'
|
337
|
+
puts "警告:純文字模式出現悉曇字:#{gid}"
|
338
|
+
return gid
|
339
|
+
else
|
340
|
+
# 如果沒有羅馬轉寫就顯示圖檔
|
341
|
+
src = File.join(@settings[:graphic_base], 'sd-gif', gid[3..4], gid+'.gif')
|
342
|
+
basename = File.basename(src)
|
343
|
+
dest = File.join(@temp_folder, 'img', basename)
|
344
|
+
FileUtils.copy(src, dest)
|
345
|
+
return "<img src='../img/#{basename}' />"
|
346
|
+
end
|
343
347
|
end
|
344
348
|
end
|
345
349
|
|
346
350
|
if gid.start_with?('RJ')
|
347
|
-
return g['roman']
|
351
|
+
return g['roman'] if g.key? 'roman'
|
352
|
+
|
353
|
+
if mode == 'txt'
|
354
|
+
puts "警告:純文字模式出現蘭札體:#{gid}"
|
355
|
+
return gid
|
356
|
+
else
|
357
|
+
# 如果沒有羅馬轉寫就顯示圖檔
|
358
|
+
src = File.join(@settings[:graphic_base], 'rj-gif', gid[3..4], gid+'.gif')
|
359
|
+
basename = File.basename(src)
|
360
|
+
dest = File.join(@temp_folder, 'img', basename)
|
361
|
+
FileUtils.copy(src, dest)
|
362
|
+
return "<img src='../img/#{basename}' />"
|
363
|
+
end
|
348
364
|
end
|
349
|
-
|
365
|
+
|
366
|
+
if mode == 'txt'
|
367
|
+
abort "缺組字式:#{g}" if zzs.nil?
|
368
|
+
return zzs
|
369
|
+
end
|
370
|
+
|
350
371
|
default = ''
|
351
372
|
if g.has_key?('unicode')
|
352
373
|
if @unicode1.include?(g['unicode'])
|
@@ -359,7 +380,7 @@ eos
|
|
359
380
|
|
360
381
|
def handle_graphic(e)
|
361
382
|
url = e['url']
|
362
|
-
url.sub!(/^.*figures
|
383
|
+
url.sub!(/^.*(figures\/.*)$/, '\1')
|
363
384
|
|
364
385
|
src = File.join(@settings[:graphic_base], url)
|
365
386
|
basename = File.basename(src)
|
@@ -516,6 +537,7 @@ eos
|
|
516
537
|
return '' if e.comment?
|
517
538
|
return handle_text(e, mode) if e.text?
|
518
539
|
return '' if PASS.include?(e.name)
|
540
|
+
|
519
541
|
r = case e.name
|
520
542
|
when 'anchor' then handle_anchor(e)
|
521
543
|
when 'app' then handle_app(e)
|
data/lib/cbeta/p5a_to_text.rb
CHANGED
@@ -15,6 +15,10 @@ require 'set'
|
|
15
15
|
# c.convert('T01')
|
16
16
|
#
|
17
17
|
class CBETA::P5aToText
|
18
|
+
# 內容不輸出的元素
|
19
|
+
PASS=['back', 'teiHeader']
|
20
|
+
|
21
|
+
private_constant :PASS
|
18
22
|
|
19
23
|
# @param xml_root [String] 來源 CBETA XML P5a 路徑
|
20
24
|
# @param output_root [String] 輸出 Text 路徑
|
@@ -444,7 +448,7 @@ class CBETA::P5aToText
|
|
444
448
|
def handle_vol(vol)
|
445
449
|
puts "convert volumn: #{vol}"
|
446
450
|
|
447
|
-
@orig = @cbeta.
|
451
|
+
@orig = @cbeta.get_canon_symbol(vol[0])
|
448
452
|
abort "未處理底本" if @orig.nil?
|
449
453
|
|
450
454
|
@vol = vol
|
@@ -508,7 +512,11 @@ class CBETA::P5aToText
|
|
508
512
|
text = frag.content
|
509
513
|
text = appify(text) if @format == 'app'
|
510
514
|
|
511
|
-
|
515
|
+
if ed == @orig
|
516
|
+
fn = "#{ed}-orig.txt"
|
517
|
+
else
|
518
|
+
fn = "#{ed}.txt"
|
519
|
+
end
|
512
520
|
output_path = File.join(folder, fn)
|
513
521
|
File.write(output_path, text)
|
514
522
|
end
|
data/lib/data/gaiji.json
CHANGED
@@ -50,10 +50,7 @@
|
|
50
50
|
"CB00178": {
|
51
51
|
"zzs": "[木*奈]",
|
52
52
|
"unicode": "3B88",
|
53
|
-
"unicode-char": "㮈"
|
54
|
-
"zhuyin": [
|
55
|
-
"ㄋㄞˋ"
|
56
|
-
]
|
53
|
+
"unicode-char": "㮈"
|
57
54
|
},
|
58
55
|
"CB00238": {
|
59
56
|
"zzs": "[打-丁+毛]",
|
@@ -107,10 +104,7 @@
|
|
107
104
|
"zzs": "[馬*犬]",
|
108
105
|
"normal": "馱",
|
109
106
|
"unicode": "4B7E",
|
110
|
-
"unicode-char": "䭾"
|
111
|
-
"zhuyin": [
|
112
|
-
"ㄊㄨㄛˊ"
|
113
|
-
]
|
107
|
+
"unicode-char": "䭾"
|
114
108
|
},
|
115
109
|
"CB00509": {
|
116
110
|
"zzs": "[商/衣]",
|
@@ -2933,8 +2927,7 @@
|
|
2933
2927
|
},
|
2934
2928
|
"CB04775": {
|
2935
2929
|
"zzs": "[柷-口+登]",
|
2936
|
-
"normal": "凳"
|
2937
|
-
"normal_unicode": "櫈"
|
2930
|
+
"normal": "凳"
|
2938
2931
|
},
|
2939
2932
|
"CB00144": {
|
2940
2933
|
"zzs": "[少/免]",
|
@@ -3403,10 +3396,7 @@
|
|
3403
3396
|
"CB05105": {
|
3404
3397
|
"zzs": "[契-大+石]",
|
3405
3398
|
"unicode": "40AE",
|
3406
|
-
"unicode-char": "䂮"
|
3407
|
-
"zhuyin": [
|
3408
|
-
"ㄌㄩㄝˋ"
|
3409
|
-
]
|
3399
|
+
"unicode-char": "䂮"
|
3410
3400
|
},
|
3411
3401
|
"SD-CFC5": {
|
3412
3402
|
"sd-char": "狣",
|
@@ -42271,10 +42261,7 @@
|
|
42271
42261
|
"CB06535": {
|
42272
42262
|
"zzs": "[自/本]",
|
42273
42263
|
"unicode": "2690E",
|
42274
|
-
"unicode-char": "𦤎"
|
42275
|
-
"zhuyin": [
|
42276
|
-
"ㄍㄠ"
|
42277
|
-
]
|
42264
|
+
"unicode-char": "𦤎"
|
42278
42265
|
},
|
42279
42266
|
"CB06952": {
|
42280
42267
|
"zzs": "[狂-王+羊]",
|
@@ -45192,11 +45179,7 @@
|
|
45192
45179
|
"CB05711": {
|
45193
45180
|
"zzs": "[颱-台+日]",
|
45194
45181
|
"unicode": "4AFB",
|
45195
|
-
"unicode-char": "䫻"
|
45196
|
-
"zhuyin": [
|
45197
|
-
"ㄒㄩㄝˊ",
|
45198
|
-
"ㄩˋ"
|
45199
|
-
]
|
45182
|
+
"unicode-char": "䫻"
|
45200
45183
|
},
|
45201
45184
|
"CB15400": {
|
45202
45185
|
"zzs": "[身*國]",
|
@@ -49191,11 +49174,7 @@
|
|
49191
49174
|
"zzs": "[仁-二+嶲]",
|
49192
49175
|
"normal": "俊",
|
49193
49176
|
"unicode": "349E",
|
49194
|
-
"unicode-char": "㒞"
|
49195
|
-
"zhuyin": [
|
49196
|
-
"ㄐㄩㄣˋ",
|
49197
|
-
"ㄎㄜˇ"
|
49198
|
-
]
|
49177
|
+
"unicode-char": "㒞"
|
49199
49178
|
},
|
49200
49179
|
"CB13910": {
|
49201
49180
|
"zzs": "[打-丁+閵]"
|
@@ -126184,5 +126163,72 @@
|
|
126184
126163
|
"ㄅㄧㄣˇ",
|
126185
126164
|
"ㄌㄧㄣˇ"
|
126186
126165
|
]
|
126166
|
+
},
|
126167
|
+
"CB32783": {
|
126168
|
+
"zzs": "[(糸*子)/心]"
|
126169
|
+
},
|
126170
|
+
"CB32784": {
|
126171
|
+
"zzs": "[受-又+(撤-育)]"
|
126172
|
+
},
|
126173
|
+
"CB32785": {
|
126174
|
+
"zzs": "[烈-列+((白/匕)*旡)]"
|
126175
|
+
},
|
126176
|
+
"CB32781": {
|
126177
|
+
"zzs": "[虫*雷]",
|
126178
|
+
"unicode": "274BD",
|
126179
|
+
"unicode-char": "𧒽"
|
126180
|
+
},
|
126181
|
+
"CB32792": {
|
126182
|
+
"zzs": "[番*韭]",
|
126183
|
+
"unicode": "2940F",
|
126184
|
+
"unicode-char": "𩐏"
|
126185
|
+
},
|
126186
|
+
"CB32793": {
|
126187
|
+
"zzs": "[米*(產-文+(立-一))]"
|
126188
|
+
},
|
126189
|
+
"CB32821": {
|
126190
|
+
"zzs": "[入/耳]",
|
126191
|
+
"normal": "聞",
|
126192
|
+
"unicode": "26535",
|
126193
|
+
"unicode-char": "𦔵"
|
126194
|
+
},
|
126195
|
+
"CB32825": {
|
126196
|
+
"zzs": "[舟*定]",
|
126197
|
+
"unicode": "26A58",
|
126198
|
+
"unicode-char": "𦩘"
|
126199
|
+
},
|
126200
|
+
"CB32828": {
|
126201
|
+
"zzs": "[打-丁+審]",
|
126202
|
+
"unicode": "22E19",
|
126203
|
+
"unicode-char": "𢸙"
|
126204
|
+
},
|
126205
|
+
"CB32832": {
|
126206
|
+
"zzs": "[卄/(袖-由+任)]"
|
126207
|
+
},
|
126208
|
+
"CB32830": {
|
126209
|
+
"zzs": "[烈-列+毛]",
|
126210
|
+
"unicode": "241AC",
|
126211
|
+
"unicode-char": "𤆬"
|
126212
|
+
},
|
126213
|
+
"CB32983": {
|
126214
|
+
"zzs": "[雨/如]",
|
126215
|
+
"unicode": "290B0",
|
126216
|
+
"unicode-char": "𩂰"
|
126217
|
+
},
|
126218
|
+
"CB32984": {
|
126219
|
+
"zzs": "[貝*昜]",
|
126220
|
+
"normal": "賜",
|
126221
|
+
"unicode": "27DBD",
|
126222
|
+
"unicode-char": "𧶽"
|
126223
|
+
},
|
126224
|
+
"CB32985": {
|
126225
|
+
"zzs": "[怡-台+志]",
|
126226
|
+
"unicode": "2267A",
|
126227
|
+
"unicode-char": "𢙺"
|
126228
|
+
},
|
126229
|
+
"CB32986": {
|
126230
|
+
"zzs": "[棣-木+王]",
|
126231
|
+
"unicode": "3ED6",
|
126232
|
+
"unicode-char": "㻖"
|
126187
126233
|
}
|
126188
126234
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-24 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|