cbeta 0.6.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 20467d7b166bf2c6daf7ec2a174472ba7cbd6a56
4
- data.tar.gz: efcbf9c7d8c3cf5d71b371d0ef0745c6d3988466
3
+ metadata.gz: e7b3afd9fe14f3c71a3f519aa0a15dc5b2e198ab
4
+ data.tar.gz: 898fddd049d4f076edb446c6f2751a10e41e248e
5
5
  SHA512:
6
- metadata.gz: 41b01556ca22270c458d806a0b45833d4eb64b0c38d64110a39232e09dc925d3c5081fd382f75da4857d3c05177250541a03c22d5236ea1c799ee73a33fcdec7
7
- data.tar.gz: b4081c2d5e1f68a26072eba3409bd12372c28765487c8e12217a7e9c7389a1bf002e42af63c10a267e4dde7a87c4537654a5b146506d50263ec09b6a6842ddc0
6
+ metadata.gz: bd71006daf93b24dc46bd1afe004d1726cc6ef0b6de6159357ec01a3c255e732642f551037ffc8dc05be7e2997599bbb2ca015dcbc05bc497c029c903a0fca3c
7
+ data.tar.gz: f860ffec038a6924de638861a46e94c8bcb325a8114fb00aad19f898036c2722657229e4b25f83ac8b19143f65b100212041f4e51a95c3a208f632e163f7c276
data/lib/cbeta.rb CHANGED
@@ -52,11 +52,25 @@ class CBETA
52
52
  #
53
53
  # @example
54
54
  # cbeta = CBETA.new
55
- # cbeta.get_canon_abbr('T') # return "【大】"
56
- def get_canon_abbr(id)
55
+ # cbeta.get_canon_symbol('T') # return "【大】"
56
+ def get_canon_symbol(id)
57
57
  return nil unless @canon_abbr.key? id
58
58
  @canon_abbr[id]
59
59
  end
60
+
61
+ # 取得藏經略名
62
+ #
63
+ # @param id [String] 藏經 ID, 例如大正藏的 ID 是 "T"
64
+ # @return [String] 藏經短名,例如 "大"
65
+ #
66
+ # @example
67
+ # cbeta = CBETA.new
68
+ # cbeta.get_canon_abbr('T') # return "大"
69
+ def get_canon_abbr(id)
70
+ r = get_canon_symbol(id)
71
+ return nil if r.nil?
72
+ r.sub(/^【(.*?)】$/, '\1')
73
+ end
60
74
  end
61
75
 
62
76
  require 'cbeta/gaiji'
data/lib/cbeta/gaiji.rb CHANGED
@@ -37,10 +37,65 @@ class CBETA::Gaiji
37
37
  # @return [Array<String>]
38
38
  #
39
39
  # @example
40
- # g = Cbeta::Gaiji.new
40
+ # g = CBETA::Gaiji.new
41
41
  # g.zhuyin("CB00023") # return [ "ㄍㄢˇ", "ㄍㄢ", "ㄧㄤˊ", "ㄇㄧˇ", "ㄇㄧㄝ", "ㄒㄧㄤˊ" ]
42
42
  def zhuyin(cb)
43
43
  return nil unless @gaijis.key? cb
44
44
  @gaijis[cb]['zhuyin']
45
45
  end
46
+
47
+ # 讀 XML P5 檔頭的缺字資料,更新現有缺字資料,輸出 JSON
48
+ def update_from_p5(p5_folder, output_json_filename)
49
+ update_from_p5_folder(p5_folder)
50
+ s = JSON.pretty_generate(@gaijis)
51
+ File.write(output_json_filename, s)
52
+ end
53
+
54
+ private
55
+ def char_to_hash(char)
56
+ r = {}
57
+ id = char['id']
58
+ char.xpath('charProp').each do |e|
59
+ prop = e.at('localName').text
60
+ case prop
61
+ when 'composition'
62
+ r['zzs'] = e.at('value').text
63
+ when 'normalized form'
64
+ r['normal'] = e.at('value').text
65
+ else
66
+ puts "未處理 charProp/localName: #{prop}"
67
+ end
68
+ end
69
+ char.xpath('mapping').each do |e|
70
+ case e['type']
71
+ when 'unicode'
72
+ u = e.text[2..-1]
73
+ r['unicode'] = u
74
+ r['unicode-char'] = [u.hex].pack('U')
75
+ end
76
+ end
77
+ r
78
+ end
79
+
80
+ def update_from_p5_file(fn)
81
+ f = File.open(fn)
82
+ doc = Nokogiri::XML(f)
83
+ f.close
84
+ doc.remove_namespaces!()
85
+ doc.xpath("//charDecl/char").each do |char|
86
+ @gaijis[char['id']] = char_to_hash(char)
87
+ end
88
+ end
89
+
90
+ def update_from_p5_folder(folder)
91
+ Dir.entries(folder).each do |f|
92
+ path = File.join(folder, f)
93
+ next if f.start_with? '.'
94
+ if Dir.exist? path
95
+ update_from_p5_folder path
96
+ else
97
+ update_from_p5_file path
98
+ end
99
+ end
100
+ end
46
101
  end
@@ -323,14 +323,6 @@ eos
323
323
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
324
324
  zzs = g['zzs']
325
325
 
326
- if mode == 'txt'
327
- return g['roman'] if gid.start_with?('SD')
328
- if zzs.nil?
329
- abort "缺組字式:#{g}"
330
- else
331
- return zzs
332
- end
333
- end
334
326
 
335
327
  if gid.start_with?('SD')
336
328
  case gid
@@ -339,14 +331,43 @@ eos
339
331
  when 'SD-E35B'
340
332
  return ')'
341
333
  else
342
- return g['roman']
334
+ return g['roman'] if g.key? 'roman'
335
+
336
+ if mode == 'txt'
337
+ puts "警告:純文字模式出現悉曇字:#{gid}"
338
+ return gid
339
+ else
340
+ # 如果沒有羅馬轉寫就顯示圖檔
341
+ src = File.join(@settings[:graphic_base], 'sd-gif', gid[3..4], gid+'.gif')
342
+ basename = File.basename(src)
343
+ dest = File.join(@temp_folder, 'img', basename)
344
+ FileUtils.copy(src, dest)
345
+ return "<img src='../img/#{basename}' />"
346
+ end
343
347
  end
344
348
  end
345
349
 
346
350
  if gid.start_with?('RJ')
347
- return g['roman']
351
+ return g['roman'] if g.key? 'roman'
352
+
353
+ if mode == 'txt'
354
+ puts "警告:純文字模式出現蘭札體:#{gid}"
355
+ return gid
356
+ else
357
+ # 如果沒有羅馬轉寫就顯示圖檔
358
+ src = File.join(@settings[:graphic_base], 'rj-gif', gid[3..4], gid+'.gif')
359
+ basename = File.basename(src)
360
+ dest = File.join(@temp_folder, 'img', basename)
361
+ FileUtils.copy(src, dest)
362
+ return "<img src='../img/#{basename}' />"
363
+ end
348
364
  end
349
-
365
+
366
+ if mode == 'txt'
367
+ abort "缺組字式:#{g}" if zzs.nil?
368
+ return zzs
369
+ end
370
+
350
371
  default = ''
351
372
  if g.has_key?('unicode')
352
373
  if @unicode1.include?(g['unicode'])
@@ -359,7 +380,7 @@ eos
359
380
 
360
381
  def handle_graphic(e)
361
382
  url = e['url']
362
- url.sub!(/^.*figures\/(.*)$/, '\1')
383
+ url.sub!(/^.*(figures\/.*)$/, '\1')
363
384
 
364
385
  src = File.join(@settings[:graphic_base], url)
365
386
  basename = File.basename(src)
@@ -516,6 +537,7 @@ eos
516
537
  return '' if e.comment?
517
538
  return handle_text(e, mode) if e.text?
518
539
  return '' if PASS.include?(e.name)
540
+
519
541
  r = case e.name
520
542
  when 'anchor' then handle_anchor(e)
521
543
  when 'app' then handle_app(e)
@@ -15,6 +15,10 @@ require 'set'
15
15
  # c.convert('T01')
16
16
  #
17
17
  class CBETA::P5aToText
18
+ # 內容不輸出的元素
19
+ PASS=['back', 'teiHeader']
20
+
21
+ private_constant :PASS
18
22
 
19
23
  # @param xml_root [String] 來源 CBETA XML P5a 路徑
20
24
  # @param output_root [String] 輸出 Text 路徑
@@ -444,7 +448,7 @@ class CBETA::P5aToText
444
448
  def handle_vol(vol)
445
449
  puts "convert volumn: #{vol}"
446
450
 
447
- @orig = @cbeta.get_canon_abbr(vol[0])
451
+ @orig = @cbeta.get_canon_symbol(vol[0])
448
452
  abort "未處理底本" if @orig.nil?
449
453
 
450
454
  @vol = vol
@@ -508,7 +512,11 @@ class CBETA::P5aToText
508
512
  text = frag.content
509
513
  text = appify(text) if @format == 'app'
510
514
 
511
- fn = "#{ed}.txt"
515
+ if ed == @orig
516
+ fn = "#{ed}-orig.txt"
517
+ else
518
+ fn = "#{ed}.txt"
519
+ end
512
520
  output_path = File.join(folder, fn)
513
521
  File.write(output_path, text)
514
522
  end
data/lib/data/gaiji.json CHANGED
@@ -50,10 +50,7 @@
50
50
  "CB00178": {
51
51
  "zzs": "[木*奈]",
52
52
  "unicode": "3B88",
53
- "unicode-char": "㮈",
54
- "zhuyin": [
55
- "ㄋㄞˋ"
56
- ]
53
+ "unicode-char": "㮈"
57
54
  },
58
55
  "CB00238": {
59
56
  "zzs": "[打-丁+毛]",
@@ -107,10 +104,7 @@
107
104
  "zzs": "[馬*犬]",
108
105
  "normal": "馱",
109
106
  "unicode": "4B7E",
110
- "unicode-char": "䭾",
111
- "zhuyin": [
112
- "ㄊㄨㄛˊ"
113
- ]
107
+ "unicode-char": "䭾"
114
108
  },
115
109
  "CB00509": {
116
110
  "zzs": "[商/衣]",
@@ -2933,8 +2927,7 @@
2933
2927
  },
2934
2928
  "CB04775": {
2935
2929
  "zzs": "[柷-口+登]",
2936
- "normal": "凳",
2937
- "normal_unicode": "櫈"
2930
+ "normal": "凳"
2938
2931
  },
2939
2932
  "CB00144": {
2940
2933
  "zzs": "[少/免]",
@@ -3403,10 +3396,7 @@
3403
3396
  "CB05105": {
3404
3397
  "zzs": "[契-大+石]",
3405
3398
  "unicode": "40AE",
3406
- "unicode-char": "䂮",
3407
- "zhuyin": [
3408
- "ㄌㄩㄝˋ"
3409
- ]
3399
+ "unicode-char": "䂮"
3410
3400
  },
3411
3401
  "SD-CFC5": {
3412
3402
  "sd-char": "狣",
@@ -42271,10 +42261,7 @@
42271
42261
  "CB06535": {
42272
42262
  "zzs": "[自/本]",
42273
42263
  "unicode": "2690E",
42274
- "unicode-char": "𦤎",
42275
- "zhuyin": [
42276
- "ㄍㄠ"
42277
- ]
42264
+ "unicode-char": "𦤎"
42278
42265
  },
42279
42266
  "CB06952": {
42280
42267
  "zzs": "[狂-王+羊]",
@@ -45192,11 +45179,7 @@
45192
45179
  "CB05711": {
45193
45180
  "zzs": "[颱-台+日]",
45194
45181
  "unicode": "4AFB",
45195
- "unicode-char": "䫻",
45196
- "zhuyin": [
45197
- "ㄒㄩㄝˊ",
45198
- "ㄩˋ"
45199
- ]
45182
+ "unicode-char": "䫻"
45200
45183
  },
45201
45184
  "CB15400": {
45202
45185
  "zzs": "[身*國]",
@@ -49191,11 +49174,7 @@
49191
49174
  "zzs": "[仁-二+嶲]",
49192
49175
  "normal": "俊",
49193
49176
  "unicode": "349E",
49194
- "unicode-char": "㒞",
49195
- "zhuyin": [
49196
- "ㄐㄩㄣˋ",
49197
- "ㄎㄜˇ"
49198
- ]
49177
+ "unicode-char": "㒞"
49199
49178
  },
49200
49179
  "CB13910": {
49201
49180
  "zzs": "[打-丁+閵]"
@@ -126184,5 +126163,72 @@
126184
126163
  "ㄅㄧㄣˇ",
126185
126164
  "ㄌㄧㄣˇ"
126186
126165
  ]
126166
+ },
126167
+ "CB32783": {
126168
+ "zzs": "[(糸*子)/心]"
126169
+ },
126170
+ "CB32784": {
126171
+ "zzs": "[受-又+(撤-育)]"
126172
+ },
126173
+ "CB32785": {
126174
+ "zzs": "[烈-列+((白/匕)*旡)]"
126175
+ },
126176
+ "CB32781": {
126177
+ "zzs": "[虫*雷]",
126178
+ "unicode": "274BD",
126179
+ "unicode-char": "𧒽"
126180
+ },
126181
+ "CB32792": {
126182
+ "zzs": "[番*韭]",
126183
+ "unicode": "2940F",
126184
+ "unicode-char": "𩐏"
126185
+ },
126186
+ "CB32793": {
126187
+ "zzs": "[米*(產-文+(立-一))]"
126188
+ },
126189
+ "CB32821": {
126190
+ "zzs": "[入/耳]",
126191
+ "normal": "聞",
126192
+ "unicode": "26535",
126193
+ "unicode-char": "𦔵"
126194
+ },
126195
+ "CB32825": {
126196
+ "zzs": "[舟*定]",
126197
+ "unicode": "26A58",
126198
+ "unicode-char": "𦩘"
126199
+ },
126200
+ "CB32828": {
126201
+ "zzs": "[打-丁+審]",
126202
+ "unicode": "22E19",
126203
+ "unicode-char": "𢸙"
126204
+ },
126205
+ "CB32832": {
126206
+ "zzs": "[卄/(袖-由+任)]"
126207
+ },
126208
+ "CB32830": {
126209
+ "zzs": "[烈-列+毛]",
126210
+ "unicode": "241AC",
126211
+ "unicode-char": "𤆬"
126212
+ },
126213
+ "CB32983": {
126214
+ "zzs": "[雨/如]",
126215
+ "unicode": "290B0",
126216
+ "unicode-char": "𩂰"
126217
+ },
126218
+ "CB32984": {
126219
+ "zzs": "[貝*昜]",
126220
+ "normal": "賜",
126221
+ "unicode": "27DBD",
126222
+ "unicode-char": "𧶽"
126223
+ },
126224
+ "CB32985": {
126225
+ "zzs": "[怡-台+志]",
126226
+ "unicode": "2267A",
126227
+ "unicode-char": "𢙺"
126228
+ },
126229
+ "CB32986": {
126230
+ "zzs": "[棣-木+王]",
126231
+ "unicode": "3ED6",
126232
+ "unicode-char": "㻖"
126187
126233
  }
126188
126234
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-11 00:00:00.000000000 Z
11
+ date: 2015-09-24 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
14
14
  email: zhoubx@gmail.com