cbeta 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/cbeta.rb +1 -0
  3. data/lib/cbeta/p5a_to_text.rb +434 -0
  4. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4534f862bd8b4825c28655db9d68d47d0f42bfc0
4
- data.tar.gz: 9dec47ef7f7989cc7c4e1c8f821fd74d38e78c79
3
+ metadata.gz: 8fa52e9b0b8aedcc963fb3fe04e7671e11195136
4
+ data.tar.gz: ce0abdc6a26880da654608dd5e23bdb227e28e54
5
5
  SHA512:
6
- metadata.gz: dc320239449683133eb32a0a7c357fca63ddb39243ce800d5c8063396efcd59f7ff81cb51b163af2eea6dc7771ba2ac4c640ebdf2daf5e7b67716ebab2b478e5
7
- data.tar.gz: 583873fba9159f24d3cd006122c15e752a072aa58df3dd2f8c8d1e15e26a4a3b1ba4454f56cb8d91ebcde685ee7a37bbc05a768a96e1e92f426dcc193c7877ad
6
+ metadata.gz: e201a6601286381216794fd9cf704785a01339782813395f51331037028b57d498a3c58852e1ce5ffcd2597ae5eec356be86f10c55c3e08619cd230e78019bb4
7
+ data.tar.gz: b3487594d36f4e698f2bd61ce75a69c7d057975fa20e68683293782e580f43ede41afc4619822a49447aad5bac8f1a6d19d22f7e7e0c9d68b3f7f6f62d1b89e9
data/lib/cbeta.rb CHANGED
@@ -51,4 +51,5 @@ end
51
51
  require 'cbeta/gaiji'
52
52
  require 'cbeta/bm_to_text'
53
53
  require 'cbeta/p5a_to_html'
54
+ require 'cbeta/p5a_to_text'
54
55
  require 'cbeta/html_to_text'
@@ -0,0 +1,434 @@
1
+ require 'cgi'
2
+ require 'date'
3
+ require 'fileutils'
4
+ require 'json'
5
+ require 'nokogiri'
6
+ require 'set'
7
+
8
+ # Convert CBETA XML P5a to Text
9
+ #
10
+ # CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
11
+ class CBETA::P5aToText
12
+
13
+ # @param xml_root [String] 來源 CBETA XML P5a 路徑
14
+ # @param output_root [String] 輸出 Text 路徑
15
def initialize(xml_root, output_root)
  # Remember the source (P5a XML) and destination (text) root folders.
  @xml_root, @output_root = xml_root, output_root

  # Project helpers: @cbeta is queried for the canon abbreviation in
  # handle_vol, @gaijis is indexed by gaiji id in handle_g.
  @cbeta  = CBETA.new
  @gaijis = CBETA::Gaiji.new

  # Load the Unicode 1.1 character list stored next to this source file.
  unicode_list_path = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(unicode_list_path))
end
26
+
27
+ # 將 CBETA XML P5a 轉為 Text
28
+ #
29
+ # @example for convert 大正藏第一冊:
30
+ #
31
+ # x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
32
+ # x2h.convert('T01')
33
+ #
34
+ # @example for convert 大正藏全部:
35
+ #
36
+ # x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
37
+ # x2h.convert('T')
38
+ #
39
+ # @example for convert 大正藏第五冊至第七冊:
40
+ #
41
+ # x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
42
+ # x2h.convert('T05..T07')
43
+ #
44
+ # T 是大正藏的 ID, CBETA 的藏經 ID 系統請參考: http://www.cbeta.org/format/id.php
45
def convert(target=nil)
  # Without an argument every collection under the XML root is converted.
  return convert_all if target.nil?

  spec = target.upcase

  # A single letter names a whole collection, e.g. "T".
  return handle_collection(spec) if spec.size == 1

  # Anything that does not contain ".." is treated as one volume, e.g. "T01".
  return handle_vol(spec) unless spec.include?('..')

  # "T05..T07" is a volume range. A spec that contains ".." but does not fit
  # the pattern is ignored (match returns nil and the block never runs).
  spec.match(/^([^\.]+?)\.\.([^\.]+)$/) { |m| handle_vols(m[1], m[2]) }
end
61
+
62
+ private
63
+
64
# Converts every collection found directly under @xml_root.
# Only entries whose name is a single uppercase letter are treated as
# collections; everything else in the folder is skipped.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
70
+
71
# Renders an <anchor> element.
# Returns '◎' when the element has type="circle", otherwise an empty string.
def handle_anchor(e)
  e.has_attribute?('type') && e['type'] == 'circle' ? '◎' : ''
end
80
+
81
# Renders an <app> element: the output is exactly what its children produce.
def handle_app(e)
  return traverse(e)
end
84
+
85
# Renders a <byline> element: its children's text followed by a newline.
def handle_byline(e)
  "#{traverse(e)}\n"
end
88
+
89
# Renders a table <cell>: its children's text followed by a newline.
def handle_cell(e)
  cell_text = traverse(e)
  cell_text + "\n"
end
92
+
93
# Converts every volume folder of one collection.
#
# @param c [String] collection id (the folder name under @xml_root)
#
# Records the id in @series, logs it, then calls handle_vol for each
# directory entry except '.', '..' and '.DS_Store'.
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c
  skipped = ['.', '..', '.DS_Store']
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless skipped.include?(vol)
  end
end
102
+
103
# Renders a <corr> element: wraps the children's text in an <r> marker
# tagged with the 【CBETA】 edition, which write_juan later filters on.
def handle_corr(e)
  "<r w='【CBETA】'>#{traverse(e)}</r>"
end
106
+
107
# Renders a <div> element: the output is exactly what its children produce.
def handle_div(e)
  return traverse(e)
end
110
+
111
# Renders a <figure> element: its children's text followed by a newline.
def handle_figure(e)
  "#{traverse(e)}\n"
end
114
+
115
# Resolves a <g> (gaiji reference) element to the text that stands for it.
#
# Order of preference, as implemented below:
#   1. ids starting with 'SD': '(' for SD-E35A, ')' for SD-E35B, otherwise
#      the entry's 'roman' value
#   2. ids starting with 'RJ': the entry's 'roman' value
#   3. 'unicode-char' when the entry has a 'unicode' key
#   4. 'normal_unicode', then 'normal', when present
#   5. otherwise a code point at 0xF0000 plus the numeric part of the id
#
# Aborts the process when @gaijis has no entry for the id.
def handle_g(e)
  # Drop the first character of the ref attribute to get the gaiji id.
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  if gid.start_with?('SD')
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return g['roman']
  end

  return g['roman'] if gid.start_with?('RJ')

  # [key whose presence is tested, key whose value is returned]
  [%w[unicode unicode-char], %w[normal_unicode normal_unicode], %w[normal normal]].each do |flag_key, value_key|
    return g[value_key] if g.has_key?(flag_key)
  end

  # Fallback: the digits after the two-letter id prefix, read as a decimal
  # number and added to 0xF0000.
  [0xf0000 + gid[2..-1].to_i].pack 'U'
end
148
+
149
# <graphic> elements contribute nothing to the text output.
def handle_graphic(e)
  return ""
end
152
+
153
# Renders a <head> element: its children's text followed by a newline.
def handle_head(e)
  heading = traverse(e)
  heading + "\n"
end
156
+
157
# Renders a list <item>: its children's text followed by a newline.
def handle_item(e)
  "#{traverse(e)}\n"
end
160
+
161
# Renders a <juan> element: its children's text followed by a newline.
def handle_juan(e)
  juan_text = traverse(e)
  juan_text + "\n"
end
164
+
165
# Renders a verse line <l>: its children's text, followed by a newline
# unless @lg_type is 'abnormal'.
#
# NOTE(review): no method visible in this file assigns @lg_type, so as far
# as can be seen here the newline is always appended — confirm.
def handle_l(e)
  line = traverse(e)
  @lg_type == 'abnormal' ? line : line + "\n"
end
172
+
173
# Handles a line break <lb>.
# If handle_t parked text in @next_line_buf, emit it now followed by a
# newline and clear the buffer; otherwise emit nothing.
def handle_lb(e)
  return '' if @next_line_buf.empty?

  flushed = @next_line_buf + "\n"
  @next_line_buf = ''
  flushed
end
181
+
182
# Renders a <lem> (lemma) element.
# Every 【…】 witness found in the wit attribute is added to @editions, and
# the children's text is wrapped in an <r> marker whose w attribute lists
# those witnesses separated by single spaces.
def handle_lem(e)
  body = traverse(e)
  witnesses = e['wit'].scan(/【.*?】/)
  @editions.merge witnesses
  "<r w='#{witnesses.join(' ')}'>#{body}</r>"
end
190
+
191
# Renders an <lg> element: the output is exactly what its children produce.
def handle_lg(e)
  return traverse(e)
end
194
+
195
# Renders a <list> element: a newline followed by its children's text.
def handle_list(e)
  "\n#{traverse(e)}"
end
198
+
199
# Handles a <milestone> element.
# For unit="juan" the n attribute is stored in @juan as an integer and a
# "<juan N>" marker is returned; handle_sutra later splits the text on these
# markers. Any other milestone yields an empty string.
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  "<juan #{@juan}>"
end
207
+
208
# <mulu> elements contribute nothing to the text output.
def handle_mulu(e)
  return ""
end
211
+
212
# Dispatches one XML node to the handler for its element name and returns
# the text that node contributes.
#
# @param e [Nokogiri::XML::Node] any node met while walking the document
# @return [String] text for this node (may be empty)
#
# Comments produce nothing, text nodes go to handle_text, and element names
# without a dedicated handler fall back to traversing their children.
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  # PASS is not defined anywhere in this class, so a bare reference raises
  # NameError on the first element node. Consult it only when it resolves;
  # 'back' and 'teiHeader' are suppressed by the case below either way.
  return '' if defined?(PASS) && PASS.include?(e.name)

  case e.name
  when 'anchor'    then handle_anchor(e)
  when 'app'       then handle_app(e)
  when 'back'      then ''
  when 'byline'    then handle_byline(e)
  when 'cell'      then handle_cell(e)
  when 'corr'      then handle_corr(e)
  when 'div'       then handle_div(e)
  when 'figure'    then handle_figure(e)
  when 'foreign'   then ''
  when 'g'         then handle_g(e)
  when 'graphic'   then handle_graphic(e)
  when 'head'      then handle_head(e)
  when 'item'      then handle_item(e)
  when 'juan'      then handle_juan(e)
  when 'l'         then handle_l(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'lg'        then handle_lg(e)
  when 'list'      then handle_list(e)
  when 'mulu'      then handle_mulu(e)
  when 'note'      then handle_note(e)
  when 'milestone' then handle_milestone(e)
  when 'p'         then handle_p(e)
  when 'rdg'       then handle_rdg(e)
  when 'reg'       then ''
  when 'row'       then handle_row(e)
  when 'sic'       then handle_sic(e)
  when 'sg'        then handle_sg(e)
  when 't'         then handle_t(e)
  when 'table'     then handle_table(e)
  when 'teiHeader' then ''
  else traverse(e)
  end
end
252
+
253
# Renders a <note> element.
# Only notes with place="inline" are kept; their text is wrapped in
# parentheses. Every other note yields an empty string.
def handle_note(e)
  inline = e.has_attribute?('place') && e['place'] == 'inline'
  inline ? "(#{traverse(e)})" : ''
end
260
+
261
# Renders a paragraph <p>: its children's text followed by a newline.
def handle_p(e)
  "#{traverse(e)}\n"
end
264
+
265
# Renders an <rdg> (variant reading) element.
# Every 【…】 witness found in the wit attribute is added to @editions, and
# the children's text is wrapped in an <r> marker whose w attribute is the
# wit attribute verbatim.
def handle_rdg(e)
  body = traverse(e)
  @editions.merge e['wit'].scan(/【.*?】/)
  "<r w='#{e['wit']}'>#{body}</r>"
end
271
+
272
# Renders a table <row>: the output is exactly what its children produce.
def handle_row(e)
  return traverse(e)
end
275
+
276
# Renders an <sg> element: its children's text wrapped in parentheses.
def handle_sg(e)
  "(#{traverse(e)})"
end
279
+
280
# Renders a <sic> element: wraps the children's text in an <r> marker whose
# w attribute is @orig (the canon abbreviation set by handle_vol).
def handle_sic(e)
  "<r w='#{@orig}'>#{traverse(e)}</r>"
end
283
+
284
# Converts one sutra XML file into per-juan text files.
#
# @param xml_fn [String] path of the source XML file
#
# Resets the per-sutra state, renders the whole body to one string via
# parse_xml, then splits that string on the "<juan N>" markers emitted by
# handle_milestone and hands each juan to write_juan. Text that precedes the
# first marker is prepended to the first juan.
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"

  # Per-sutra state.
  @dila_note = 0
  @div_count = 0
  @editions = Set.new ["【CBETA】"]
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes; CBETA splits it into multiple
  # files whose names end in a, b, c... Drop that suffix for the output.
  if (m = @sutra_no.match(/^(T05|T06|T07)n0220/))
    @sutra_no = "#{m[1]}n0220"
  end

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  # One output file per juan. split keeps the markers because the pattern
  # is a capture group, so markers and text pieces alternate.
  juan_no = nil
  pending = ''
  text.split(/(<juan \d+>)/).each do |piece|
    if piece =~ /\A<juan (\d+)>\z/
      juan_no = $1.to_i
    elsif juan_no.nil?
      # Text before the first marker: hold it for the first juan.
      pending = piece
    else
      write_juan(juan_no, pending + piece)
      pending = ''
    end
  end

  # Held text was previously dropped silently when no text piece followed a
  # marker. Write it if a juan number is known, otherwise say so.
  unless pending.strip.empty?
    if juan_no.nil?
      warn "no juan milestone in #{xml_fn}; text not written"
    else
      write_juan(juan_no, pending)
    end
  end
end
328
+
329
# Renders a <t> element, used for text set as two parallel lines.
#
# A <t> whose place attribute contains 'foot' is dropped. Otherwise the
# element's position among its sibling <t> elements decides the output:
#   first  -> its text plus a full-width space
#   second -> nothing now; the text plus a full-width space is appended to
#             @next_line_buf, which handle_lb emits at the next line break
#   others -> its text unchanged
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  text = traverse(e)
  position = e.xpath('../t').index(e)

  if position == 0
    text + ' '
  elsif position == 1
    @next_line_buf += text + ' '
    ''
  else
    text
  end
end
347
+
348
# Renders a <table> element: the output is exactly what its children produce.
def handle_table(e)
  return traverse(e)
end
351
+
352
# Renders a text node.
#
# Returns an empty string for empty text and for text that sits directly
# inside an <app> element. Otherwise all line breaks are removed (the CBETA
# XML has extra newlines inside running text) and the result is HTML-escaped
# so that characters such as & become &amp;.
def handle_text(e)
  raw = e.content().chomp
  return '' if raw.empty? || e.parent.name == 'app'

  CGI.escapeHTML(raw.delete("\n\r"))
end
363
+
364
# Converts every sutra file of one volume.
#
# @param vol [String] volume id, e.g. "T01"; its first character is the
#   canon id
#
# Looks up the canon abbreviation (aborting when there is none), recreates
# the volume's output folder from scratch, then calls handle_sutra for each
# entry in the volume's source folder.
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  canon_id = vol[0]
  @orig = @cbeta.get_canon_abbr(canon_id)
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @series = canon_id
  @out_vol = File.join(@output_root, @series, vol)

  # Second argument is the force flag: a missing folder is not an error.
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  Dir[File.join(@xml_root, @series, vol) + "/*"].each { |f| handle_sutra(f) }
end
381
+
382
# Converts an inclusive range of volumes of one collection.
#
# @param v1 [String] first volume id, e.g. "T05"
# @param v2 [String] last volume id, e.g. "T07"
#
# The collection is the first character of v1. Every entry of the collection
# folder whose name sorts between v1 and v2 (string comparison, inclusive)
# is passed to handle_vol.
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = v1[0]
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) if vol.between?(v1, v2)
  end
end
392
+
393
# Reads an XML file and returns it as a Nokogiri document with all
# namespaces removed, so later XPath queries can use bare element names.
#
# @param fn [String] path of the XML file
def open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |doc| doc.remove_namespaces!() }
end
399
+
400
# Renders the body of one XML file to a single string.
#
# @param xml_fn [String] path of the XML file
# @return [String] result of traversing the first text/body element under
#   the document root
def parse_xml(xml_fn)
  body = open_xml(xml_fn).root().xpath("text/body").first
  traverse(body)
end
407
+
408
# Concatenates the text produced by handle_node for each child of e, in
# document order. Returns an empty string when e has no children.
def traverse(e)
  e.children.inject('') { |text, child| text + handle_node(child) }
end
416
+
417
# Writes one juan once per edition, each edition to its own sub-folder of
# @out_sutra.
#
# @param juan_no [Integer] juan number, used in the output file name
# @param txt [String] rendered text containing <r w='…'> markers
#
# For each edition in @editions the text is parsed as an XML fragment, every
# <r> marker that does not belong to that edition is removed, and the
# remaining text content is written to "<sutra_no>_NNN.txt".
def write_juan(juan_no, txt)
  fn = "#{@sutra_no}_%03d.txt" % juan_no

  @editions.each do |ed|
    # Parse afresh for every edition because nodes are removed in place.
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      # w may list several witnesses ("【大】 【宋】" from handle_lem,
      # "【大】【宋】" from handle_rdg). Comparing the whole attribute to one
      # edition dropped such readings from every edition, so test membership.
      witnesses = node['w'].to_s.scan(/【.*?】/)
      node.remove unless witnesses.include?(ed)
    end

    folder = File.join(@out_sutra, ed)
    FileUtils.makedirs(folder)
    File.write(File.join(folder, fn), frag.content)
  end
end
434
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-11 00:00:00.000000000 Z
11
+ date: 2015-06-23 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
14
14
  email: zhoubx@gmail.com
@@ -23,6 +23,7 @@ files:
23
23
  - lib/cbeta/gaiji.rb
24
24
  - lib/cbeta/html_to_text.rb
25
25
  - lib/cbeta/p5a_to_html.rb
26
+ - lib/cbeta/p5a_to_text.rb
26
27
  - lib/cbeta/unicode-1.1.json
27
28
  homepage: https://github.com/RayCHOU/ruby-cbeta
28
29
  licenses: