cbeta 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +1 -0
- data/lib/cbeta/p5a_to_text.rb +434 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8fa52e9b0b8aedcc963fb3fe04e7671e11195136
|
4
|
+
data.tar.gz: ce0abdc6a26880da654608dd5e23bdb227e28e54
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e201a6601286381216794fd9cf704785a01339782813395f51331037028b57d498a3c58852e1ce5ffcd2597ae5eec356be86f10c55c3e08619cd230e78019bb4
|
7
|
+
data.tar.gz: b3487594d36f4e698f2bd61ce75a69c7d057975fa20e68683293782e580f43ede41afc4619822a49447aad5bac8f1a6d19d22f7e7e0c9d68b3f7f6f62d1b89e9
|
data/lib/cbeta/p5a_to_text.rb
CHANGED
@@ -0,0 +1,434 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'date'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'json'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
# Convert CBETA XML P5a to Text
|
9
|
+
#
|
10
|
+
# CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
|
11
|
+
class CBETA::P5aToText
|
12
|
+
|
13
|
+
# Sets up a converter and loads the Unicode 1.1 character list that is
# stored next to this source file.
#
# @param xml_root [String] path of the source CBETA XML P5a folder
# @param output_root [String] path of the folder that receives the text output
def initialize(xml_root, output_root)
  @xml_root = xml_root
  @output_root = output_root
  @cbeta = CBETA.new
  @gaijis = CBETA::Gaiji.new

  # Load the Unicode 1.1 character set list.
  list_path = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(list_path))
end
|
26
|
+
|
27
|
+
# Converts CBETA XML P5a to plain text.
#
# @example convert volume 1 of the Taisho canon:
#
#   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T01')
#
# @example convert the whole Taisho canon:
#
#   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T')
#
# @example convert Taisho volumes 5 through 7:
#
#   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T05..T07')
#
# T is the ID of the Taisho canon; for the CBETA canon ID scheme see
# http://www.cbeta.org/format/id.php
#
# @param target [String, nil] a one-letter collection ID, a volume ID, or a
#   "FIRST..LAST" volume range; nil converts every collection
def convert(target=nil)
  return convert_all if target.nil?

  arg = target.upcase
  return handle_collection(arg) if arg.size == 1
  return handle_vol(arg) unless arg.include?('..')

  # A range that does not have the exact "A..B" shape is silently ignored.
  range = arg.match(/^([^\.]+?)\.\.([^\.]+)$/)
  handle_vols(range[1], range[2]) if range
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
# Converts every collection under @xml_root. Only directory entries whose
# name is a single upper-case ASCII letter are treated as collections.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
|
70
|
+
|
71
|
+
# Renders an <anchor> element: a circle mark when its type attribute is
# "circle", otherwise nothing.
#
# @param e [#has_attribute?, #[]] the anchor node
# @return [String] '◎' or ''
def handle_anchor(e)
  circle = e.has_attribute?('type') && e['type'] == 'circle'
  circle ? '◎' : ''
end
|
80
|
+
|
81
|
+
# Renders an <app> element as the concatenated output of its children.
#
# @return [String]
def handle_app(e)
  traverse(e)
end
|
84
|
+
|
85
|
+
# Renders a <byline> element: its children's output followed by a newline.
#
# @return [String]
def handle_byline(e)
  "#{traverse(e)}\n"
end
|
88
|
+
|
89
|
+
# Renders a table <cell>: its children's output followed by a newline.
#
# @return [String]
def handle_cell(e)
  "#{traverse(e)}\n"
end
|
92
|
+
|
93
|
+
# Converts every volume folder of one collection.
#
# @param c [String] collection ID, used as the folder name under @xml_root
def handle_collection(c)
  @series = c
  puts "handle_collection #{c}"

  skipped = ['.', '..', '.DS_Store']
  # The folder path is computed once, before handle_vol reassigns @series.
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless skipped.include?(vol)
  end
end
|
102
|
+
|
103
|
+
# Wraps the content of a <corr> element in an <r> marker attributed to the
# 【CBETA】 edition.
#
# @return [String]
def handle_corr(e)
  "<r w='【CBETA】'>#{traverse(e)}</r>"
end
|
106
|
+
|
107
|
+
# Renders a <div> element as the concatenated output of its children.
#
# @return [String]
def handle_div(e)
  traverse(e)
end
|
110
|
+
|
111
|
+
# Renders a <figure> element: its children's output followed by a newline.
#
# @return [String]
def handle_figure(e)
  "#{traverse(e)}\n"
end
|
114
|
+
|
115
|
+
# Resolves a <g> (gaiji, i.e. non-standard character) reference to text.
#
# Resolution order:
#   1. 'SD-E35A' / 'SD-E35B' become '(' / ')'.
#   2. Any other id starting with 'SD' or 'RJ' uses the entry's 'roman' value.
#   3. An entry with a 'unicode' key uses its 'unicode-char' value.
#   4. Otherwise 'normal_unicode', then 'normal', if present.
#   5. Otherwise a code point at 0xF0000 plus the digits after the
#      two-character id prefix.
#
# Aborts the process when the id has no entry in @gaijis.
#
# @param e [#[]] node whose 'ref' attribute is the gaiji id preceded by one
#   character (that first character is dropped)
# @return [String, nil] nil when the selected key holds no value
def handle_g(e)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  case gid
  when 'SD-E35A' then return '('
  when 'SD-E35B' then return ')'
  end

  return g['roman'] if gid.start_with?('SD', 'RJ')
  return g['unicode-char'] if g.has_key?('unicode')
  return g['normal_unicode'] if g.has_key?('normal_unicode')
  return g['normal'] if g.has_key?('normal')

  # Unicode PUA
  [0xf0000 + gid[2..-1].to_i].pack 'U'
end
|
148
|
+
|
149
|
+
# <graphic> elements produce no text output.
#
# @return [String] always ''
def handle_graphic(e)
  ''
end
|
152
|
+
|
153
|
+
# Renders a <head> element: its children's output followed by a newline.
#
# @return [String]
def handle_head(e)
  "#{traverse(e)}\n"
end
|
156
|
+
|
157
|
+
# Renders a list <item>: its children's output followed by a newline.
#
# @return [String]
def handle_item(e)
  "#{traverse(e)}\n"
end
|
160
|
+
|
161
|
+
# Renders a <juan> element: its children's output followed by a newline.
#
# @return [String]
def handle_juan(e)
  "#{traverse(e)}\n"
end
|
164
|
+
|
165
|
+
# Renders a verse line <l>. A newline is appended unless @lg_type is
# 'abnormal'.
#
# NOTE(review): @lg_type is not assigned anywhere in the visible part of this
# class, so the newline is presumably always appended - confirm.
#
# @return [String]
def handle_l(e)
  line = traverse(e)
  @lg_type == 'abnormal' ? line : line + "\n"
end
|
172
|
+
|
173
|
+
# Handles a line break <lb>: flushes the text buffered in @next_line_buf
# (filled by handle_t), followed by a newline, and clears the buffer.
#
# @return [String] the flushed text, or '' when the buffer is empty
def handle_lb(e)
  return '' if @next_line_buf.empty?

  flushed = @next_line_buf + "\n"
  @next_line_buf = ''
  flushed
end
|
181
|
+
|
182
|
+
# Renders a <lem> (lemma) element as an <r> marker that carries its witness
# list, and records those witnesses in @editions.
#
# The 'wit' attribute is scanned for 【...】 groups; the groups are joined
# with single spaces to form the marker's w attribute.
#
# @param e [#[]] the lem node; its 'wit' attribute must be present
# @return [String] "<r w='...'>content</r>"
def handle_lem(e)
  content = traverse(e)
  witnesses = e['wit'].scan(/【.*?】/)
  @editions.merge witnesses
  "<r w='#{witnesses.join(' ')}'>#{content}</r>"
end
|
190
|
+
|
191
|
+
# Renders a verse group <lg> as the concatenated output of its children.
#
# @return [String]
def handle_lg(e)
  traverse(e)
end
|
194
|
+
|
195
|
+
# Renders a <list> element: a leading newline followed by its children's
# output.
#
# @return [String]
def handle_list(e)
  "\n#{traverse(e)}"
end
|
198
|
+
|
199
|
+
# Handles a <milestone>. For unit="juan" it records the juan number in @juan
# and emits a "<juan N>" marker, which handle_sutra later splits on. Other
# milestones produce nothing.
#
# @param e [#[]] node providing the 'unit' and 'n' attributes
# @return [String]
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  "<juan #{@juan}>"
end
|
207
|
+
|
208
|
+
# <mulu> elements produce no text output.
#
# @return [String] always ''
def handle_mulu(e)
  ''
end
|
211
|
+
|
212
|
+
# Dispatches one XML node to its handler and returns the resulting text.
#
# Comments yield '', text nodes go to handle_text, and element names listed
# in PASS yield ''. The remaining elements are either ignored, routed to the
# handle_<name> method of the same name, or rendered by traversing their
# children.
#
# NOTE(review): PASS is not defined in the visible part of this class;
# confirm it is defined somewhere reachable, otherwise this raises NameError.
#
# @param e the node to render
# @return [String]
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?
  return '' if PASS.include?(e.name)

  case e.name
  when 'back', 'foreign', 'reg', 'teiHeader'
    ''
  when 'anchor', 'app', 'byline', 'cell', 'corr', 'div', 'figure', 'g',
       'graphic', 'head', 'item', 'juan', 'l', 'lb', 'lem', 'lg', 'list',
       'mulu', 'note', 'milestone', 'p', 'rdg', 'row', 'sic', 'sg', 't',
       'table'
    # Every name listed here has a handler method called handle_<name>.
    send("handle_#{e.name}", e)
  else
    traverse(e)
  end
end
|
252
|
+
|
253
|
+
# Renders a <note>. Only notes with place="inline" are kept, wrapped in
# parentheses; every other note is dropped.
#
# @return [String]
def handle_note(e)
  return '' unless e.has_attribute?('place') && e['place']=='inline'

  "(#{traverse(e)})"
end
|
260
|
+
|
261
|
+
# Renders a paragraph <p>: its children's output followed by a newline.
#
# @return [String]
def handle_p(e)
  "#{traverse(e)}\n"
end
|
264
|
+
|
265
|
+
# Renders a variant reading <rdg> as an <r> marker and records its witnesses
# in @editions.
#
# The marker's w attribute is the raw 'wit' attribute, while @editions
# receives the individual 【...】 groups found in it.
#
# @param e [#[]] the rdg node; its 'wit' attribute must be present
# @return [String] "<r w='...'>content</r>"
def handle_rdg(e)
  content = traverse(e)
  @editions.merge(e['wit'].scan(/【.*?】/))
  "<r w='#{e['wit']}'>#{content}</r>"
end
|
271
|
+
|
272
|
+
# Renders a table <row> as the concatenated output of its children.
#
# @return [String]
def handle_row(e)
  traverse(e)
end
|
275
|
+
|
276
|
+
# Renders an <sg> element: its children's output wrapped in parentheses.
#
# @return [String]
def handle_sg(e)
  "(#{traverse(e)})"
end
|
279
|
+
|
280
|
+
# Wraps the content of a <sic> element in an <r> marker attributed to @orig,
# the canon abbreviation that handle_vol stores for the current volume.
#
# @return [String]
def handle_sic(e)
  "<r w='#{@orig}'>#{traverse(e)}</r>"
end
|
283
|
+
|
284
|
+
# Converts one sutra XML file and writes one output per juan (fascicle).
#
# The per-sutra state is reset, the file is rendered to a single string by
# parse_xml, and that string is split on the "<juan N>" markers emitted by
# handle_milestone. Each chunk after a marker is handed to write_juan.
# Text that precedes the first marker is prepended to the first juan.
#
# NOTE(review): if the rendered text contains no "<juan N>" marker at all,
# nothing is written for the file - confirm this cannot happen in the data.
#
# @param xml_fn [String] path of the sutra XML file
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"
  @dila_note = 0
  @div_count = 0
  @editions = Set.new ["【CBETA】"]
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes; CBETA splits it into multiple
  # files with a trailing a, b, c... Drop that suffix for the output name.
  split_volume = @sutra_no.match(/^(T05|T06|T07)n0220/)
  @sutra_no = "#{split_volume[1]}n0220" if split_volume

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  juan_no = nil
  pending = ''
  # One file per juan.
  text.split(/(<juan \d+>)/).each do |chunk|
    marker = chunk.match(/<juan (\d+)>$/)
    if marker
      juan_no = marker[1].to_i
    elsif juan_no.nil?
      # Text before the first marker: hold it for the first juan.
      pending = chunk
    else
      write_juan(juan_no, pending + chunk)
      pending = ''
    end
  end
end
|
328
|
+
|
329
|
+
# Renders a <t> element, used for two-line parallel text.
#
# A <t> whose place attribute contains 'foot' is dropped. Otherwise the
# result depends on the element's position among its sibling <t> elements:
# the first is emitted immediately with a trailing space, the second is
# appended to @next_line_buf (flushed by handle_lb), and any other is emitted
# as is.
#
# @return [String]
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  text = traverse(e)

  # Handle two-line parallel layout.
  position = e.xpath('../t').index(e)
  if position == 0
    text + ' '
  elsif position == 1
    @next_line_buf += text + ' '
    ''
  else
    text
  end
end
|
347
|
+
|
348
|
+
# Renders a <table> as the concatenated output of its children.
#
# @return [String]
def handle_table(e)
  traverse(e)
end
|
351
|
+
|
352
|
+
# Renders a text node. Text directly under an <app> element is dropped;
# otherwise line breaks are removed and the result is HTML-escaped.
#
# @param e [#content, #parent] the text node
# @return [String]
def handle_text(e)
  raw = e.content().chomp
  return '' if raw.empty?
  return '' if e.parent.name == 'app'

  # CBETA XML has superfluous line breaks inside running text.
  flattened = raw.delete("\n\r")

  # Escape special characters (e.g. & becomes &amp;); write_juan later
  # parses the accumulated text as an XML fragment.
  CGI.escapeHTML(flattened)
end
|
363
|
+
|
364
|
+
# Converts every file of one volume.
#
# Looks up the canon abbreviation for the volume's first character (kept in
# @orig), recreates the volume's output folder from scratch, then converts
# each file in the volume's source folder in sorted order.
#
# Aborts the process when the canon abbreviation is unknown.
#
# @param vol [String] volume ID, e.g. 'T01'; its first character is the
#   collection ID
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  @orig = @cbeta.get_canon_abbr(vol[0])
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @series = vol[0]
  @out_vol = File.join(@output_root, @series, vol)
  # Second argument is `force`: ignore errors such as a missing folder.
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  source = File.join(@xml_root, @series, vol)
  # Sort explicitly: glob order is not guaranteed on Ruby < 3.0.
  Dir[source+"/*"].sort.each { |f| handle_sutra(f) }
end
|
381
|
+
|
382
|
+
# Converts an inclusive range of volumes within one collection.
#
# Every entry of the collection folder whose name compares (as a string)
# between v1 and v2 is converted.
#
# @param v1 [String] first volume ID, e.g. 'T05'
# @param v2 [String] last volume ID, e.g. 'T07'
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = v1[0]
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) if vol.between?(v1, v2)
  end
end
|
392
|
+
|
393
|
+
# Reads an XML file and parses it with Nokogiri, stripping namespaces so
# that later XPath queries can use bare element names.
#
# @param fn [String] path of the XML file
# @return [Nokogiri::XML::Document]
def open_xml(fn)
  document = Nokogiri::XML(File.read(fn))
  document.remove_namespaces!
  document
end
|
399
|
+
|
400
|
+
# Parses one sutra file and renders its text/body element to a string.
#
# @param xml_fn [String] path of the XML file
# @return [String] rendered text, still containing <juan N> and <r> markers
def parse_xml(xml_fn)
  root = open_xml(xml_fn).root
  body = root.xpath("text/body").first
  traverse(body)
end
|
407
|
+
|
408
|
+
# Renders all children of a node and concatenates the results in order.
#
# The output is accumulated in a single buffer with <<, so a node with many
# children does not copy the whole accumulated string on every step.
#
# @param e [#children] the parent node
# @return [String] a new string; '' when there are no children
# @raise [TypeError] if handle_node returns nil for a child
def traverse(e)
  # +'' gives a mutable buffer even under frozen string literals.
  e.children.each_with_object(+'') do |child, out|
    out << handle_node(child)
  end
end
|
416
|
+
|
417
|
+
# Writes one juan (fascicle) once per edition recorded in @editions.
#
# The text carries <r w='...'> markers produced by handle_corr, handle_lem,
# handle_rdg and handle_sic. For each edition, every marker whose w attribute
# does not mention that edition is removed, and the remaining text content is
# written to <out_sutra>/<edition>/<sutra_no>_NNN.txt.
#
# A marker may name several witnesses at once (handle_lem joins them with
# spaces, handle_rdg passes the raw wit attribute), so membership is tested
# with include? rather than equality; an equality test would drop such
# readings from every edition.
#
# @param juan_no [Integer] juan number, zero-padded to three digits in the
#   file name
# @param txt [String] rendered text of the juan, parsed as an XML fragment
def write_juan(juan_no, txt)
  @editions.each do |ed|
    # Re-parse for every edition because nodes are removed in place.
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      node.remove unless node['w'].to_s.include?(ed)
    end

    folder = File.join(@out_sutra, ed)
    FileUtils.makedirs(folder)

    fn = "#{@sutra_no}_%03d.txt" % juan_no
    File.write(File.join(folder, fn), frag.content)
  end
end
|
434
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|
@@ -23,6 +23,7 @@ files:
|
|
23
23
|
- lib/cbeta/gaiji.rb
|
24
24
|
- lib/cbeta/html_to_text.rb
|
25
25
|
- lib/cbeta/p5a_to_html.rb
|
26
|
+
- lib/cbeta/p5a_to_text.rb
|
26
27
|
- lib/cbeta/unicode-1.1.json
|
27
28
|
homepage: https://github.com/RayCHOU/ruby-cbeta
|
28
29
|
licenses:
|