cbeta 0.6.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 20467d7b166bf2c6daf7ec2a174472ba7cbd6a56
4
- data.tar.gz: efcbf9c7d8c3cf5d71b371d0ef0745c6d3988466
3
+ metadata.gz: e7b3afd9fe14f3c71a3f519aa0a15dc5b2e198ab
4
+ data.tar.gz: 898fddd049d4f076edb446c6f2751a10e41e248e
5
5
  SHA512:
6
- metadata.gz: 41b01556ca22270c458d806a0b45833d4eb64b0c38d64110a39232e09dc925d3c5081fd382f75da4857d3c05177250541a03c22d5236ea1c799ee73a33fcdec7
7
- data.tar.gz: b4081c2d5e1f68a26072eba3409bd12372c28765487c8e12217a7e9c7389a1bf002e42af63c10a267e4dde7a87c4537654a5b146506d50263ec09b6a6842ddc0
6
+ metadata.gz: bd71006daf93b24dc46bd1afe004d1726cc6ef0b6de6159357ec01a3c255e732642f551037ffc8dc05be7e2997599bbb2ca015dcbc05bc497c029c903a0fca3c
7
+ data.tar.gz: f860ffec038a6924de638861a46e94c8bcb325a8114fb00aad19f898036c2722657229e4b25f83ac8b19143f65b100212041f4e51a95c3a208f632e163f7c276
data/lib/cbeta.rb CHANGED
@@ -52,11 +52,25 @@ class CBETA
52
52
  #
53
53
  # @example
54
54
  # cbeta = CBETA.new
55
- # cbeta.get_canon_abbr('T') # return "【大】"
56
- def get_canon_abbr(id)
55
+ # cbeta.get_canon_symbol('T') # return "【大】"
56
+ def get_canon_symbol(id)
57
57
  return nil unless @canon_abbr.key? id
58
58
  @canon_abbr[id]
59
59
  end
60
+
61
+ # 取得藏經略名
62
+ #
63
+ # @param id [String] 藏經 ID, 例如大正藏的 ID 是 "T"
64
+ # @return [String] 藏經短名,例如 "大"
65
+ #
66
+ # @example
67
+ # cbeta = CBETA.new
68
+ # cbeta.get_canon_abbr('T') # return "大"
69
+ def get_canon_abbr(id)
70
+ r = get_canon_symbol(id)
71
+ return nil if r.nil?
72
+ r.sub(/^【(.*?)】$/, '\1')
73
+ end
60
74
  end
61
75
 
62
76
  require 'cbeta/gaiji'
data/lib/cbeta/gaiji.rb CHANGED
@@ -37,10 +37,65 @@ class CBETA::Gaiji
37
37
  # @return [Array<String>]
38
38
  #
39
39
  # @example
40
- # g = Cbeta::Gaiji.new
40
+ # g = CBETA::Gaiji.new
41
41
  # g.zhuyin("CB00023") # return [ "ㄍㄢˇ", "ㄍㄢ", "ㄧㄤˊ", "ㄇㄧˇ", "ㄇㄧㄝ", "ㄒㄧㄤˊ" ]
42
42
  def zhuyin(cb)
43
43
  return nil unless @gaijis.key? cb
44
44
  @gaijis[cb]['zhuyin']
45
45
  end
46
+
47
+ # 讀 XML P5 檔頭的缺字資料,更新現有缺字資料,輸出 JSON
48
+ def update_from_p5(p5_folder, output_json_filename)
49
+ update_from_p5_folder(p5_folder)
50
+ s = JSON.pretty_generate(@gaijis)
51
+ File.write(output_json_filename, s)
52
+ end
53
+
54
+ private
55
+ def char_to_hash(char)
56
+ r = {}
57
+ id = char['id']
58
+ char.xpath('charProp').each do |e|
59
+ prop = e.at('localName').text
60
+ case prop
61
+ when 'composition'
62
+ r['zzs'] = e.at('value').text
63
+ when 'normalized form'
64
+ r['normal'] = e.at('value').text
65
+ else
66
+ puts "未處理 charProp/localName: #{prop}"
67
+ end
68
+ end
69
+ char.xpath('mapping').each do |e|
70
+ case e['type']
71
+ when 'unicode'
72
+ u = e.text[2..-1]
73
+ r['unicode'] = u
74
+ r['unicode-char'] = [u.hex].pack('U')
75
+ end
76
+ end
77
+ r
78
+ end
79
+
80
+ def update_from_p5_file(fn)
81
+ f = File.open(fn)
82
+ doc = Nokogiri::XML(f)
83
+ f.close
84
+ doc.remove_namespaces!()
85
+ doc.xpath("//charDecl/char").each do |char|
86
+ @gaijis[char['id']] = char_to_hash(char)
87
+ end
88
+ end
89
+
90
+ def update_from_p5_folder(folder)
91
+ Dir.entries(folder).each do |f|
92
+ path = File.join(folder, f)
93
+ next if f.start_with? '.'
94
+ if Dir.exist? path
95
+ update_from_p5_folder path
96
+ else
97
+ update_from_p5_file path
98
+ end
99
+ end
100
+ end
46
101
  end
@@ -323,14 +323,6 @@ eos
323
323
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
324
324
  zzs = g['zzs']
325
325
 
326
- if mode == 'txt'
327
- return g['roman'] if gid.start_with?('SD')
328
- if zzs.nil?
329
- abort "缺組字式:#{g}"
330
- else
331
- return zzs
332
- end
333
- end
334
326
 
335
327
  if gid.start_with?('SD')
336
328
  case gid
@@ -339,14 +331,43 @@ eos
339
331
  when 'SD-E35B'
340
332
  return ')'
341
333
  else
342
- return g['roman']
334
+ return g['roman'] if g.key? 'roman'
335
+
336
+ if mode == 'txt'
337
+ puts "警告:純文字模式出現悉曇字:#{gid}"
338
+ return gid
339
+ else
340
+ # 如果沒有羅馬轉寫就顯示圖檔
341
+ src = File.join(@settings[:graphic_base], 'sd-gif', gid[3..4], gid+'.gif')
342
+ basename = File.basename(src)
343
+ dest = File.join(@temp_folder, 'img', basename)
344
+ FileUtils.copy(src, dest)
345
+ return "<img src='../img/#{basename}' />"
346
+ end
343
347
  end
344
348
  end
345
349
 
346
350
  if gid.start_with?('RJ')
347
- return g['roman']
351
+ return g['roman'] if g.key? 'roman'
352
+
353
+ if mode == 'txt'
354
+ puts "警告:純文字模式出現蘭札體:#{gid}"
355
+ return gid
356
+ else
357
+ # 如果沒有羅馬轉寫就顯示圖檔
358
+ src = File.join(@settings[:graphic_base], 'rj-gif', gid[3..4], gid+'.gif')
359
+ basename = File.basename(src)
360
+ dest = File.join(@temp_folder, 'img', basename)
361
+ FileUtils.copy(src, dest)
362
+ return "<img src='../img/#{basename}' />"
363
+ end
348
364
  end
349
-
365
+
366
+ if mode == 'txt'
367
+ abort "缺組字式:#{g}" if zzs.nil?
368
+ return zzs
369
+ end
370
+
350
371
  default = ''
351
372
  if g.has_key?('unicode')
352
373
  if @unicode1.include?(g['unicode'])
@@ -359,7 +380,7 @@ eos
359
380
 
360
381
  def handle_graphic(e)
361
382
  url = e['url']
362
- url.sub!(/^.*figures\/(.*)$/, '\1')
383
+ url.sub!(/^.*(figures\/.*)$/, '\1')
363
384
 
364
385
  src = File.join(@settings[:graphic_base], url)
365
386
  basename = File.basename(src)
@@ -516,6 +537,7 @@ eos
516
537
  return '' if e.comment?
517
538
  return handle_text(e, mode) if e.text?
518
539
  return '' if PASS.include?(e.name)
540
+
519
541
  r = case e.name
520
542
  when 'anchor' then handle_anchor(e)
521
543
  when 'app' then handle_app(e)
@@ -15,6 +15,10 @@ require 'set'
15
15
  # c.convert('T01')
16
16
  #
17
17
  class CBETA::P5aToText
18
+ # 內容不輸出的元素
19
+ PASS=['back', 'teiHeader']
20
+
21
+ private_constant :PASS
18
22
 
19
23
  # @param xml_root [String] 來源 CBETA XML P5a 路徑
20
24
  # @param output_root [String] 輸出 Text 路徑
@@ -444,7 +448,7 @@ class CBETA::P5aToText
444
448
  def handle_vol(vol)
445
449
  puts "convert volumn: #{vol}"
446
450
 
447
- @orig = @cbeta.get_canon_abbr(vol[0])
451
+ @orig = @cbeta.get_canon_symbol(vol[0])
448
452
  abort "未處理底本" if @orig.nil?
449
453
 
450
454
  @vol = vol
@@ -508,7 +512,11 @@ class CBETA::P5aToText
508
512
  text = frag.content
509
513
  text = appify(text) if @format == 'app'
510
514
 
511
- fn = "#{ed}.txt"
515
+ if ed == @orig
516
+ fn = "#{ed}-orig.txt"
517
+ else
518
+ fn = "#{ed}.txt"
519
+ end
512
520
  output_path = File.join(folder, fn)
513
521
  File.write(output_path, text)
514
522
  end
data/lib/data/gaiji.json CHANGED
@@ -50,10 +50,7 @@
50
50
  "CB00178": {
51
51
  "zzs": "[木*奈]",
52
52
  "unicode": "3B88",
53
- "unicode-char": "㮈",
54
- "zhuyin": [
55
- "ㄋㄞˋ"
56
- ]
53
+ "unicode-char": "㮈"
57
54
  },
58
55
  "CB00238": {
59
56
  "zzs": "[打-丁+毛]",
@@ -107,10 +104,7 @@
107
104
  "zzs": "[馬*犬]",
108
105
  "normal": "馱",
109
106
  "unicode": "4B7E",
110
- "unicode-char": "䭾",
111
- "zhuyin": [
112
- "ㄊㄨㄛˊ"
113
- ]
107
+ "unicode-char": "䭾"
114
108
  },
115
109
  "CB00509": {
116
110
  "zzs": "[商/衣]",
@@ -2933,8 +2927,7 @@
2933
2927
  },
2934
2928
  "CB04775": {
2935
2929
  "zzs": "[柷-口+登]",
2936
- "normal": "凳",
2937
- "normal_unicode": "櫈"
2930
+ "normal": "凳"
2938
2931
  },
2939
2932
  "CB00144": {
2940
2933
  "zzs": "[少/免]",
@@ -3403,10 +3396,7 @@
3403
3396
  "CB05105": {
3404
3397
  "zzs": "[契-大+石]",
3405
3398
  "unicode": "40AE",
3406
- "unicode-char": "䂮",
3407
- "zhuyin": [
3408
- "ㄌㄩㄝˋ"
3409
- ]
3399
+ "unicode-char": "䂮"
3410
3400
  },
3411
3401
  "SD-CFC5": {
3412
3402
  "sd-char": "狣",
@@ -42271,10 +42261,7 @@
42271
42261
  "CB06535": {
42272
42262
  "zzs": "[自/本]",
42273
42263
  "unicode": "2690E",
42274
- "unicode-char": "𦤎",
42275
- "zhuyin": [
42276
- "ㄍㄠ"
42277
- ]
42264
+ "unicode-char": "𦤎"
42278
42265
  },
42279
42266
  "CB06952": {
42280
42267
  "zzs": "[狂-王+羊]",
@@ -45192,11 +45179,7 @@
45192
45179
  "CB05711": {
45193
45180
  "zzs": "[颱-台+日]",
45194
45181
  "unicode": "4AFB",
45195
- "unicode-char": "䫻",
45196
- "zhuyin": [
45197
- "ㄒㄩㄝˊ",
45198
- "ㄩˋ"
45199
- ]
45182
+ "unicode-char": "䫻"
45200
45183
  },
45201
45184
  "CB15400": {
45202
45185
  "zzs": "[身*國]",
@@ -49191,11 +49174,7 @@
49191
49174
  "zzs": "[仁-二+嶲]",
49192
49175
  "normal": "俊",
49193
49176
  "unicode": "349E",
49194
- "unicode-char": "㒞",
49195
- "zhuyin": [
49196
- "ㄐㄩㄣˋ",
49197
- "ㄎㄜˇ"
49198
- ]
49177
+ "unicode-char": "㒞"
49199
49178
  },
49200
49179
  "CB13910": {
49201
49180
  "zzs": "[打-丁+閵]"
@@ -126184,5 +126163,72 @@
126184
126163
  "ㄅㄧㄣˇ",
126185
126164
  "ㄌㄧㄣˇ"
126186
126165
  ]
126166
+ },
126167
+ "CB32783": {
126168
+ "zzs": "[(糸*子)/心]"
126169
+ },
126170
+ "CB32784": {
126171
+ "zzs": "[受-又+(撤-育)]"
126172
+ },
126173
+ "CB32785": {
126174
+ "zzs": "[烈-列+((白/匕)*旡)]"
126175
+ },
126176
+ "CB32781": {
126177
+ "zzs": "[虫*雷]",
126178
+ "unicode": "274BD",
126179
+ "unicode-char": "𧒽"
126180
+ },
126181
+ "CB32792": {
126182
+ "zzs": "[番*韭]",
126183
+ "unicode": "2940F",
126184
+ "unicode-char": "𩐏"
126185
+ },
126186
+ "CB32793": {
126187
+ "zzs": "[米*(產-文+(立-一))]"
126188
+ },
126189
+ "CB32821": {
126190
+ "zzs": "[入/耳]",
126191
+ "normal": "聞",
126192
+ "unicode": "26535",
126193
+ "unicode-char": "𦔵"
126194
+ },
126195
+ "CB32825": {
126196
+ "zzs": "[舟*定]",
126197
+ "unicode": "26A58",
126198
+ "unicode-char": "𦩘"
126199
+ },
126200
+ "CB32828": {
126201
+ "zzs": "[打-丁+審]",
126202
+ "unicode": "22E19",
126203
+ "unicode-char": "𢸙"
126204
+ },
126205
+ "CB32832": {
126206
+ "zzs": "[卄/(袖-由+任)]"
126207
+ },
126208
+ "CB32830": {
126209
+ "zzs": "[烈-列+毛]",
126210
+ "unicode": "241AC",
126211
+ "unicode-char": "𤆬"
126212
+ },
126213
+ "CB32983": {
126214
+ "zzs": "[雨/如]",
126215
+ "unicode": "290B0",
126216
+ "unicode-char": "𩂰"
126217
+ },
126218
+ "CB32984": {
126219
+ "zzs": "[貝*昜]",
126220
+ "normal": "賜",
126221
+ "unicode": "27DBD",
126222
+ "unicode-char": "𧶽"
126223
+ },
126224
+ "CB32985": {
126225
+ "zzs": "[怡-台+志]",
126226
+ "unicode": "2267A",
126227
+ "unicode-char": "𢙺"
126228
+ },
126229
+ "CB32986": {
126230
+ "zzs": "[棣-木+王]",
126231
+ "unicode": "3ED6",
126232
+ "unicode-char": "㻖"
126187
126233
  }
126188
126234
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-11 00:00:00.000000000 Z
11
+ date: 2015-09-24 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
14
14
  email: zhoubx@gmail.com