cbeta 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/cbeta.rb +1 -0
  3. data/lib/cbeta/p5a_to_text.rb +434 -0
  4. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4534f862bd8b4825c28655db9d68d47d0f42bfc0
4
- data.tar.gz: 9dec47ef7f7989cc7c4e1c8f821fd74d38e78c79
3
+ metadata.gz: 8fa52e9b0b8aedcc963fb3fe04e7671e11195136
4
+ data.tar.gz: ce0abdc6a26880da654608dd5e23bdb227e28e54
5
5
  SHA512:
6
- metadata.gz: dc320239449683133eb32a0a7c357fca63ddb39243ce800d5c8063396efcd59f7ff81cb51b163af2eea6dc7771ba2ac4c640ebdf2daf5e7b67716ebab2b478e5
7
- data.tar.gz: 583873fba9159f24d3cd006122c15e752a072aa58df3dd2f8c8d1e15e26a4a3b1ba4454f56cb8d91ebcde685ee7a37bbc05a768a96e1e92f426dcc193c7877ad
6
+ metadata.gz: e201a6601286381216794fd9cf704785a01339782813395f51331037028b57d498a3c58852e1ce5ffcd2597ae5eec356be86f10c55c3e08619cd230e78019bb4
7
+ data.tar.gz: b3487594d36f4e698f2bd61ce75a69c7d057975fa20e68683293782e580f43ede41afc4619822a49447aad5bac8f1a6d19d22f7e7e0c9d68b3f7f6f62d1b89e9
data/lib/cbeta.rb CHANGED
@@ -51,4 +51,5 @@ end
51
51
  require 'cbeta/gaiji'
52
52
  require 'cbeta/bm_to_text'
53
53
  require 'cbeta/p5a_to_html'
54
+ require 'cbeta/p5a_to_text'
54
55
  require 'cbeta/html_to_text'
@@ -0,0 +1,434 @@
1
+ require 'cgi'
2
+ require 'date'
3
+ require 'fileutils'
4
+ require 'json'
5
+ require 'nokogiri'
6
+ require 'set'
7
+
8
+ # Convert CBETA XML P5a to Text
9
+ #
10
+ # CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
11
+ class CBETA::P5aToText
12
+
13
# Set up a converter from CBETA XML P5a to plain text.
#
# @param xml_root [String] path of the source CBETA XML P5a tree
# @param output_root [String] path under which the text output is written
def initialize(xml_root, output_root)
  @xml_root, @output_root = xml_root, output_root
  @cbeta = CBETA.new
  @gaijis = CBETA::Gaiji.new

  # Load the Unicode 1.1 character list stored in the same directory as this file.
  unicode_list = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(unicode_list))
end
26
+
27
# Convert CBETA XML P5a to text.
#
# @example convert volume 1 of the Taisho canon:
#
#   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T01')
#
# @example convert the whole Taisho canon:
#
#   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T')
#
# @example convert Taisho volumes 5 through 7:
#
#   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T05..T07')
#
# T is the ID of the Taisho canon; for CBETA's canon ID scheme see:
# http://www.cbeta.org/format/id.php
#
# @param target [String, nil] a one-letter collection ID, a volume ID, or a
#   "FIRST..LAST" volume range; nil converts everything under the XML root.
#   The value is upcased before it is interpreted. A value containing '..'
#   that does not match the range pattern is ignored.
def convert(target=nil)
  return convert_all if target.nil?

  arg = target.upcase
  return handle_collection(arg) if arg.size == 1
  return handle_vol(arg) unless arg.include?('..')

  arg.match(/^([^\.]+?)\.\.([^\.]+)$/) { |m| handle_vols(m[1], m[2]) }
end
61
+
62
+ private
63
+
64
# Convert every collection under the XML root. Only directory entries whose
# name matches /^[A-Z]$/ (a single capital letter) are treated as collections.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
70
+
71
# Render an <anchor> element: an anchor whose type attribute is 'circle'
# becomes the '◎' mark; every other anchor produces no text.
#
# @param e the anchor node (must respond to has_attribute? and [])
# @return [String]
def handle_anchor(e)
  circle = e.has_attribute?('type') && e['type'] == 'circle'
  circle ? '◎' : ''
end
80
+
81
# Render an <app> element by rendering its children (handle_text drops text
# nodes that sit directly under <app>).
#
# @return [String]
def handle_app(e)
  traverse(e)
end
84
+
85
# Render a <byline> element: its children's text followed by a newline.
#
# @return [String]
def handle_byline(e)
  "#{traverse(e)}\n"
end
88
+
89
# Render a table <cell>: its children's text followed by a newline.
#
# @return [String]
def handle_cell(e)
  "#{traverse(e)}\n"
end
92
+
93
# Convert every volume folder of one collection.
#
# Records the collection in @series, logs it, and calls handle_vol for each
# entry of <xml_root>/<collection> except '.', '..' and '.DS_Store'.
#
# @param c [String] collection ID, used as the folder name under the XML root
def handle_collection(c)
  @series = c
  puts "handle_collection #{c}"
  skipped = ['.', '..', '.DS_Store']
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless skipped.include?(vol)
  end
end
102
+
103
# Render a <corr> element: wrap its text in an <r> marker attributed to the
# 【CBETA】 edition, so write_juan can keep or drop it per edition.
#
# @return [String]
def handle_corr(e)
  corrected = traverse(e)
  "<r w='【CBETA】'>#{corrected}</r>"
end
106
+
107
# Render a <div> element: the concatenated text of its children, with
# nothing added.
#
# @return [String]
def handle_div(e)
  traverse(e)
end
110
+
111
# Render a <figure> element: its children's text followed by a newline.
#
# @return [String]
def handle_figure(e)
  "#{traverse(e)}\n"
end
114
+
115
# Render a <g> (gaiji, i.e. non-standard character) reference as text.
#
# The gaiji ID is the element's 'ref' attribute without its first character.
# Its record is looked up in @gaijis; the process aborts when there is none.
# The representation is chosen in this order:
#
#   1. IDs starting with 'SD': 'SD-E35A' => '(', 'SD-E35B' => ')',
#      any other => the record's 'roman' value
#   2. IDs starting with 'RJ': the record's 'roman' value
#   3. 'unicode-char', when the record has a 'unicode' key
#   4. 'normal_unicode'
#   5. 'normal' (normalized form)
#   6. a Unicode private-use character: 0xF0000 + the number after the
#      two-letter ID prefix
#
# Steps 3-5 are skipped when the stored value is nil, so a record that has
# the key but no value falls through to the next representation instead of
# returning nil (the caller concatenates the result into a String).
#
# @param e the <g> node (must respond to [])
# @return [String]
def handle_g(e)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  if gid.start_with?('SD')
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return g['roman']
  end
  return g['roman'] if gid.start_with?('RJ')

  return g['unicode-char'] if g.has_key?('unicode') && g['unicode-char']
  return g['normal_unicode'] if g['normal_unicode']
  return g['normal'] if g['normal']

  # Unicode PUA
  [0xf0000 + gid[2..-1].to_i].pack 'U'
end
148
+
149
# A <graphic> element contributes no text to the output.
#
# @return [String] always the empty string
def handle_graphic(e)
  return ''
end
152
+
153
# Render a <head> element: its children's text followed by a newline.
#
# @return [String]
def handle_head(e)
  "#{traverse(e)}\n"
end
156
+
157
# Render a list <item>: its children's text followed by a newline.
#
# @return [String]
def handle_item(e)
  "#{traverse(e)}\n"
end
160
+
161
# Render a <juan> element: its children's text followed by a newline.
#
# @return [String]
def handle_juan(e)
  "#{traverse(e)}\n"
end
164
+
165
# Render a verse line <l>: its children's text, followed by a newline unless
# @lg_type is 'abnormal'.
#
# NOTE(review): nothing in the visible code assigns @lg_type, so as written
# the newline appears to be appended always -- confirm.
#
# @return [String]
def handle_l(e)
  line = traverse(e)
  @lg_type == 'abnormal' ? line : line + "\n"
end
172
+
173
# Handle a line break <lb>: flush the text that handle_t queued in
# @next_line_buf, followed by a newline, and clear the buffer. When nothing
# is queued the line break produces no text.
#
# @return [String]
def handle_lb(e)
  return '' if @next_line_buf.empty?

  flushed = "#{@next_line_buf}\n"
  @next_line_buf = ''
  flushed
end
181
+
182
# Render a <lem> (lemma) element.
#
# The 【...】 witness sigla found in the 'wit' attribute are added to
# @editions, and the element's text is wrapped in an <r> marker whose w
# attribute lists those sigla joined by single spaces.
#
# @return [String]
def handle_lem(e)
  body = traverse(e)
  witnesses = e['wit'].scan(/【.*?】/)
  @editions.merge(witnesses)
  "<r w='#{witnesses.join(' ')}'>#{body}</r>"
end
190
+
191
# Render a verse group <lg>: the concatenated text of its children, with
# nothing added.
#
# @return [String]
def handle_lg(e)
  traverse(e)
end
194
+
195
# Render a <list> element: a newline followed by its children's text.
#
# @return [String]
def handle_list(e)
  "\n#{traverse(e)}"
end
198
+
199
# Handle a <milestone> element. A milestone whose unit is 'juan' stores its
# number in @juan and emits a "<juan N>" marker, which handle_sutra later
# uses to split the text into one file per juan. Other milestones produce no
# text.
#
# @param e the milestone node (must respond to [])
# @return [String]
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  "<juan #{@juan}>"
end
207
+
208
# A <mulu> element contributes no text to the output.
#
# @return [String] always the empty string
def handle_mulu(e)
  return ''
end
211
+
212
# Render a single XML node as text by dispatching on its kind and name.
#
# Comments produce nothing and text nodes go to handle_text. Elements are
# routed to their handle_* method; 'back', 'foreign', 'reg' and 'teiHeader'
# are dropped, and any element without a dedicated handler is rendered by
# traversing its children.
#
# The PASS skip-list is consulted only when such a constant is actually
# defined: this file does not define it, and an unguarded reference would
# raise NameError on the first element node.
#
# @param e the node (must respond to comment?, text? and name)
# @return [String]
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?
  return '' if defined?(PASS) && PASS.include?(e.name)

  case e.name
  when 'back', 'foreign', 'reg', 'teiHeader' then ''
  when 'anchor'    then handle_anchor(e)
  when 'app'       then handle_app(e)
  when 'byline'    then handle_byline(e)
  when 'cell'      then handle_cell(e)
  when 'corr'      then handle_corr(e)
  when 'div'       then handle_div(e)
  when 'figure'    then handle_figure(e)
  when 'g'         then handle_g(e)
  when 'graphic'   then handle_graphic(e)
  when 'head'      then handle_head(e)
  when 'item'      then handle_item(e)
  when 'juan'      then handle_juan(e)
  when 'l'         then handle_l(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'lg'        then handle_lg(e)
  when 'list'      then handle_list(e)
  when 'mulu'      then handle_mulu(e)
  when 'note'      then handle_note(e)
  when 'milestone' then handle_milestone(e)
  when 'p'         then handle_p(e)
  when 'rdg'       then handle_rdg(e)
  when 'row'       then handle_row(e)
  when 'sic'       then handle_sic(e)
  when 'sg'        then handle_sg(e)
  when 't'         then handle_t(e)
  when 'table'     then handle_table(e)
  else traverse(e)
  end
end
252
+
253
# Render a <note> element. Only notes whose place attribute is 'inline' are
# kept, with their text wrapped in parentheses; every other note is dropped.
#
# @return [String]
def handle_note(e)
  inline = e.has_attribute?('place') && e['place']=='inline'
  inline ? "(#{traverse(e)})" : ''
end
260
+
261
# Render a paragraph <p>: its children's text followed by a newline.
#
# @return [String]
def handle_p(e)
  "#{traverse(e)}\n"
end
264
+
265
# Render a <rdg> (variant reading) element.
#
# The 【...】 witness sigla found in the 'wit' attribute are added to
# @editions, and the element's text is wrapped in an <r> marker whose w
# attribute is the raw 'wit' value.
#
# @return [String]
def handle_rdg(e)
  body = traverse(e)
  @editions.merge(e['wit'].scan(/【.*?】/))
  "<r w='#{e['wit']}'>#{body}</r>"
end
271
+
272
# Render a table <row>: the concatenated text of its children, with nothing
# added.
#
# @return [String]
def handle_row(e)
  traverse(e)
end
275
+
276
# Render an <sg> element: its children's text wrapped in parentheses.
#
# @return [String]
def handle_sg(e)
  "(#{traverse(e)})"
end
279
+
280
# Render a <sic> element: wrap its text in an <r> marker attributed to
# @orig, the canon abbreviation that handle_vol looked up for the volume.
#
# @return [String]
def handle_sic(e)
  "<r w='#{@orig}'>#{traverse(e)}</r>"
end
283
+
284
# Convert one sutra XML file and write it out as one text file per juan.
#
# Resets the per-sutra state, renders the XML body to a single string with
# parse_xml, creates the sutra's output folder under @out_vol, then splits
# the rendered text on the "<juan N>" markers and passes each juan to
# write_juan. Text that precedes the first marker is prepended to the first
# juan. When the text contains no marker at all, nothing is written.
#
# @param xml_fn [String] path of the sutra's XML file; its basename without
#   ".xml" becomes the sutra number
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"
  @dila_note = 0
  @div_count = 0
  @editions = Set.new ["【CBETA】"]
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Taisho No. 220 (the Mahaprajnaparamita Sutra) spans several volumes, so
  # CBETA splits it into multiple files suffixed a, b, c, ...
  # Drop that suffix for the output name.
  if @sutra_no.match(/^(T05|T06|T07)n0220/)
    @sutra_no = "#{$1}n0220"
  end

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  # One file per juan.
  juan_no = nil
  pending = ''
  text.split(/(<juan \d+>)/).each do |piece|
    if piece =~ /<juan (\d+)>$/
      juan_no = $1.to_i
    elsif juan_no.nil?
      # Text before the first juan marker: hold it for the first juan.
      pending = piece
    else
      write_juan(juan_no, pending + piece)
      pending = ''
    end
  end
end
328
+
329
# Render a <t> element, used for two-line parallel text.
#
# A <t> whose place attribute contains 'foot' is dropped. Otherwise the
# result depends on the element's position among the <t> children of its
# parent:
#   * first:  its text plus a one-character space separator, emitted in place
#   * second: its text plus the separator is queued in @next_line_buf (to be
#     flushed by the next <lb>) and nothing is emitted here
#   * others: its text, emitted in place
#
# @return [String]
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  text = traverse(e)

  case e.xpath('../t').index(e)
  when 0
    text + '　'
  when 1
    @next_line_buf += text + '　'
    ''
  else
    text
  end
end
347
+
348
# Render a <table> element: the concatenated text of its children, with
# nothing added.
#
# @return [String]
def handle_table(e)
  traverse(e)
end
351
+
352
# Render a text node.
#
# Returns '' when the content is empty after removing one trailing line
# break, or when the node sits directly under an <app> element. Otherwise
# all line breaks are stripped (CBETA XML contains superfluous ones inside
# running text) and the result is HTML-escaped, e.g. & becomes &amp;.
#
# @param e the text node (must respond to content and parent)
# @return [String]
def handle_text(e)
  raw = e.content.chomp
  return '' if raw.empty? || e.parent.name == 'app'

  CGI.escapeHTML(raw.gsub(/[\n\r]/, ''))
end
363
+
364
# Convert every sutra file of one volume.
#
# Looks up the canon abbreviation for the volume's first character (stored
# in @orig; the process aborts when there is none), recreates the output
# folder <output_root>/<series>/<vol> from scratch, and calls handle_sutra
# for each entry of <xml_root>/<series>/<vol>.
#
# @param vol [String] volume ID; its first character is the series ID
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  series = vol[0]
  @orig = @cbeta.get_canon_abbr(series)
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @series = series
  @out_vol = File.join(@output_root, @series, vol)
  # Second argument is FileUtils.remove_dir's force flag.
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  source = File.join(@xml_root, @series, vol)
  Dir[source+"/*"].each { |xml_fn| handle_sutra(xml_fn) }
end
381
+
382
# Convert an inclusive range of volumes of one series.
#
# Every entry of <xml_root>/<series> whose name compares, as a string,
# between v1 and v2 inclusive is passed to handle_vol.
#
# @param v1 [String] first volume ID; its first character is the series ID
# @param v2 [String] last volume ID
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = v1[0]
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) if vol.between?(v1, v2)
  end
end
392
+
393
# Read and parse an XML file, then strip all namespaces from the document so
# that later XPath queries can use bare element names.
#
# @param fn [String] path of the XML file
# @return the parsed Nokogiri document
def open_xml(fn)
  doc = Nokogiri::XML(File.read(fn))
  doc.remove_namespaces!
  doc
end
399
+
400
# Parse a sutra XML file and render its text/body element as one string.
#
# @param xml_fn [String] path of the XML file
# @return [String] the rendered text of the first text/body element
def parse_xml(xml_fn)
  body = open_xml(xml_fn).root.xpath("text/body")[0]
  traverse(body)
end
407
+
408
# Render all children of a node and concatenate the results in document
# order.
#
# Results are appended in place to one buffer instead of building a new
# String for every child, so the cost stays linear in the output size.
#
# @param e a node responding to children
# @return [String] a fresh String, '' when there are no children
def traverse(e)
  buf = ''.dup
  e.children.each { |c| buf << handle_node(c) }
  buf
end
416
+
417
# Write one juan of the current sutra, once per edition in @editions.
#
# For each edition the marked-up text is parsed as an XML fragment, every
# <r> marker that does not belong to that edition is removed, and the
# remaining text content is written to
# <@out_sutra>/<edition>/<sutra_no>_<NNN>.txt.
#
# An <r> marker belongs to an edition when its w attribute contains the
# edition's siglum. handle_lem and handle_rdg put several sigla into one w
# attribute, so an exact comparison would drop such readings from every
# edition.
#
# @param juan_no [Integer] juan number, zero-padded to three digits in the
#   file name
# @param txt [String] text of the juan, still containing <r w='...'> markers
def write_juan(juan_no, txt)
  fn = "#{@sutra_no}_%03d.txt" % juan_no

  @editions.each do |ed|
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      node.remove unless node['w'].to_s.include?(ed)
    end

    folder = File.join(@out_sutra, ed)
    FileUtils.makedirs(folder)
    File.write(File.join(folder, fn), frag.content)
  end
end
434
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-11 00:00:00.000000000 Z
11
+ date: 2015-06-23 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
14
14
  email: zhoubx@gmail.com
@@ -23,6 +23,7 @@ files:
23
23
  - lib/cbeta/gaiji.rb
24
24
  - lib/cbeta/html_to_text.rb
25
25
  - lib/cbeta/p5a_to_html.rb
26
+ - lib/cbeta/p5a_to_text.rb
26
27
  - lib/cbeta/unicode-1.1.json
27
28
  homepage: https://github.com/RayCHOU/ruby-cbeta
28
29
  licenses: