cbeta 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +1 -0
- data/lib/cbeta/p5a_to_simple_html.rb +381 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9f5658a7ccfa7ca6434b02a5ef71f576a92553da
|
4
|
+
data.tar.gz: 8cfa8c42a807f60c578880606085ab10cc166aea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88a7cd77d5c37d6071d9260f957c17bbf1cd58c43f8cc0447ab0b6ee19ecf837ce1a729eeda9c6086ff3130b94ce01fabeff3865ea390b74a756188ec6b6fe35
|
7
|
+
data.tar.gz: 0a2fb196c6d18a3e24c0b4e1b6305b3790709f37539d8922b743a139955f0e0e2ebec08ec7ff2658ef2e2c032dd77dbc24341418dab3135aa4ca2303e1e438fe
|
data/lib/cbeta.rb
CHANGED
@@ -0,0 +1,381 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'set'
|
6
|
+
|
7
|
+
# Convert CBETA XML P5a to simple HTML
|
8
|
+
#
|
9
|
+
# CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
|
10
|
+
#
|
11
|
+
# @example for convert 大正藏第一冊:
|
12
|
+
#
|
13
|
+
# c = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
|
14
|
+
# c.convert('T01')
|
15
|
+
#
|
16
|
+
class CBETA::P5aToSimpleHTML
|
17
|
+
|
18
|
+
# Set up a converter from CBETA XML P5a to simple HTML.
#
# @param xml_root [String] path of the CBETA XML P5a source tree
# @param output_root [String] path of the folder that receives the output
def initialize(xml_root, output_root)
  @xml_root, @output_root = xml_root, output_root
  @cbeta = CBETA.new
  @gaijis = CBETA::Gaiji.new

  # Load the Unicode 1.1 character list stored next to this source file.
  list_path = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(list_path))
end
|
31
|
+
|
32
|
+
# Convert CBETA XML P5a into simple HTML.
#
# @example convert volume 1 of the Taisho canon:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T01')
#
# @example convert the whole Taisho canon:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T')
#
# @example convert Taisho volumes 5 through 7:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T05..T07')
#
# T is the ID of the Taisho canon; for CBETA's canon ID system see
# http://www.cbeta.org/format/id.php
#
# @param target [String, nil] nil converts everything, a single letter
#   converts one collection, "FROM..TO" converts a range of volumes, and
#   anything else is treated as one volume ID. Matching is case-insensitive
#   because the argument is upcased first.
# @raise [ArgumentError] if target contains '..' but is not of the form "FROM..TO"
def convert(target = nil)
  return convert_all if target.nil?

  arg = target.upcase
  return handle_collection(arg) if arg.size == 1
  return handle_vol(arg) unless arg.include?('..')

  # A range must be exactly two dot-free IDs separated by '..'.
  range = arg.match(/\A([^.]+)\.\.([^.]+)\z/)
  raise ArgumentError, "invalid volume range: #{target}" if range.nil?

  handle_vols(range[1], range[2])
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
# Convert every collection found directly under the XML root.
#
# An entry counts as a collection only when its whole name is a single
# capital letter A-Z and it is a directory; everything else is skipped so a
# stray file cannot be handed to handle_collection.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    next unless entry =~ /\A[A-Z]\z/
    next unless File.directory?(File.join(@xml_root, entry))

    handle_collection(entry)
  end
end
|
75
|
+
|
76
|
+
# Render an <anchor> element.
#
# @param e element whose 'type' attribute is inspected
# @return [String] '◎' when the type is 'circle', otherwise an empty string
def handle_anchor(e)
  # A missing attribute reads as nil, so no separate existence check is needed.
  e['type'] == 'circle' ? '◎' : ''
end
|
85
|
+
|
86
|
+
# Convert every volume folder of one collection.
#
# @param c [String] collection ID; also the folder name under the XML root
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c

  skipped = ['.', '..', '.DS_Store']
  volumes_dir = File.join(@xml_root, @series)
  Dir.foreach(volumes_dir) do |entry|
    handle_vol(entry) unless skipped.include?(entry)
  end
end
|
95
|
+
|
96
|
+
# Wrap the rendered content of a <corr> element in an <r> marker that is
# attributed to the CBETA edition.
def handle_corr(e)
  inner = traverse(e)
  "<r w='【CBETA】'>#{inner}</r>"
end
|
99
|
+
|
100
|
+
# Render a <g> (gaiji) reference as a string.
#
# Resolution order, first match wins:
#   1. IDs starting with 'SD': 'SD-E35A' => '(', 'SD-E35B' => ')',
#      any other 'SD' ID => the entry's 'roman' value
#   2. IDs starting with 'RJ' => the entry's 'roman' value
#   3. entry has 'unicode'        => its 'unicode-char' value
#   4. entry has 'normal_unicode' => that value
#   5. entry has 'normal'         => that value
#   6. otherwise a Unicode private-use character computed from the ID
#
# Aborts the process when the ID has no entry in @gaijis.
#
# @param e element whose 'ref' attribute holds the gaiji ID after one leading character
# @return [String]
def handle_g(e)
  gid = e['ref'][1..-1] # drop the first character of the reference (presumably '#')
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  if gid.start_with?('SD')
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return g['roman']
  end

  return g['roman'] if gid.start_with?('RJ')
  return g['unicode-char'] if g.has_key?('unicode')
  return g['normal_unicode'] if g.has_key?('normal_unicode')
  return g['normal'] if g.has_key?('normal')

  # Unicode PUA: offset 0xF0000 plus the number that follows the two-letter prefix.
  [0xf0000 + gid[2..-1].to_i].pack('U')
end
|
133
|
+
|
134
|
+
# Render an <lb> element as an anchor whose id is 'lb' followed by the 'n'
# attribute. Any text queued in @next_line_buf is emitted right after the
# anchor, followed by a newline, and the queue is cleared.
def handle_lb(e)
  anchor = "<a id='lb#{e['n']}'/>"
  return anchor if @next_line_buf.empty?

  queued = @next_line_buf
  @next_line_buf = ''
  anchor + queued + "\n"
end
|
142
|
+
|
143
|
+
# Render a <lem> element as an <r> marker tagged with its witnesses.
#
# Every 【...】 token in the 'wit' attribute is added to @editions, and the
# tokens are joined with single spaces to form the marker's 'w' attribute.
# A missing 'wit' attribute is treated as having no witnesses.
#
# @param e the <lem> element
# @return [String] "<r w='...'>rendered children</r>"
def handle_lem(e)
  content = traverse(e)
  witnesses = e['wit'].to_s.scan(/【.*?】/)
  @editions.merge(witnesses)
  "<r w='#{witnesses.join(' ')}'>#{content}</r>"
end
|
151
|
+
|
152
|
+
# Render a <milestone> element.
#
# For unit 'juan' the integer value of 'n' is stored in @juan and a
# "<juan N>" marker is returned; any other unit yields an empty string.
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  "<juan #{@juan}>"
end
|
160
|
+
|
161
|
+
# Dispatch one node to its handler and return the rendered string.
#
# Comments render as ''; text nodes go to handle_text; the elements listed
# in the first `when` are dropped entirely; elements without a dedicated
# handler are rendered by traversing their children.
#
# The former `PASS.include?(e.name)` guard is gone: no PASS constant is
# defined in this file, and 'back' and 'teiHeader' are already mapped to ''
# below, so the guard could only raise NameError or duplicate this case.
#
# @param e the node to render
# @return [String]
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  case e.name
  when 'back', 'foreign', 'graphic', 'mulu', 'reg', 'teiHeader' then ''
  when 'anchor'    then handle_anchor(e)
  when 'corr'      then handle_corr(e)
  when 'g'         then handle_g(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'note'      then handle_note(e)
  when 'milestone' then handle_milestone(e)
  when 'rdg'       then handle_rdg(e)
  when 'sic'       then handle_sic(e)
  when 'sg'        then handle_sg(e)
  when 'tt'        then handle_tt(e)
  when 't'         then handle_t(e)
  else traverse(e)
  end
end
|
188
|
+
|
189
|
+
# Render a <note> element: an inline note (place='inline') becomes its
# rendered content wrapped in parentheses; every other note yields ''.
def handle_note(e)
  return '' unless e.has_attribute?('place') && e['place'] == 'inline'

  "(#{traverse(e)})"
end
|
196
|
+
|
197
|
+
# Render an <rdg> element as an <r> marker carrying the raw 'wit' attribute,
# and add every 【...】 token found in that attribute to @editions.
def handle_rdg(e)
  content = traverse(e)
  @editions.merge(e['wit'].scan(/【.*?】/))
  "<r w='#{e['wit']}'>#{content}</r>"
end
|
203
|
+
|
204
|
+
# Render an <sg> element as its rendered content wrapped in parentheses.
def handle_sg(e)
  "(#{traverse(e)})"
end
|
207
|
+
|
208
|
+
# Render a <sic> element as an <r> marker whose 'w' attribute is @orig
# (the value handle_vol obtains from get_canon_abbr for the current volume).
def handle_sic(e)
  "<r w='#{@orig}'>#{traverse(e)}</r>"
end
|
211
|
+
|
212
|
+
# Convert one sutra XML file and write one output folder per juan.
#
# Resets the per-file state, renders the document through parse_xml, then
# splits the result on the "<juan N>" markers and hands each part to
# write_juan. Text that precedes the first marker is prepended to the first
# juan; if the rendered text contains no marker at all, nothing is written.
#
# @param xml_fn [String] path of the source XML file
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"
  @dila_note = 0
  @div_count = 0
  @editions = Set.new ["【CBETA】"]
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes, so CBETA splits it into several
  # files whose names end in a, b, c, ...; drop that suffix for the output.
  split_work = @sutra_no.match(/^(T05|T06|T07)n0220/)
  @sutra_no = "#{split_work[1]}n0220" if split_work

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  # One output per juan. Splitting on a captured group keeps every marker as
  # its own element, so pieces alternate between text and markers.
  juan_no = nil
  leading = ''
  text.split(/(<juan \d+>)/).each do |piece|
    marker = piece.match(/<juan (\d+)>$/)
    if marker
      juan_no = marker[1].to_i
    elsif juan_no.nil?
      leading = piece
    else
      write_juan(juan_no, leading + piece)
      leading = ''
    end
  end
end
|
256
|
+
|
257
|
+
# Render a <t> element.
#
# A <t> whose 'place' attribute contains 'foot' is dropped. When the
# current @tt_type is 'app' the content is returned as is (not a two-row
# parallel layout). Otherwise the position of this <t> among its sibling
# <t> elements decides: the first is emitted followed by a space, the second
# is queued in @next_line_buf (followed by a space) and emits nothing here,
# and any other is emitted unchanged.
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  content = traverse(e)
  return content if @tt_type == 'app'

  position = e.xpath('../t').index(e)
  if position == 0
    content + ' '
  elsif position == 1
    @next_line_buf += content + ' '
    ''
  else
    content
  end
end
|
278
|
+
|
279
|
+
# Render a text node.
#
# Returns '' for text that is empty after removing one trailing line break
# and for text sitting directly inside <app>. Otherwise all CR/LF
# characters are removed (CBETA XML has superfluous line breaks inside
# running text) and the result is HTML-escaped, e.g. & becomes &amp;.
def handle_text(e)
  raw = e.content.chomp
  return '' if raw.empty? || e.parent.name == 'app'

  CGI.escapeHTML(raw.delete("\n\r"))
end
|
290
|
+
|
291
|
+
# Render a <tt> element: remember its 'type' attribute in @tt_type (read by
# handle_t) and return the rendered children.
def handle_tt(e)
  @tt_type = e['type']
  traverse e
end
|
295
|
+
|
296
|
+
# Convert every sutra file of one volume.
#
# Looks up the base-edition abbreviation for the volume's collection (the
# first character of the ID) and aborts the process if there is none, then
# recreates the volume's output folder from scratch and converts each file
# found in the volume's source folder, in sorted order.
#
# @param vol [String] volume ID such as 'T01'
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  @orig = @cbeta.get_canon_abbr(vol[0])
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @series = vol[0]
  @out_vol = File.join(@output_root, @series, vol)
  # Second argument is the positional `force` flag, so a folder that does
  # not exist yet is not an error.
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  source = File.join(@xml_root, @series, vol)
  # Sort so the processing order does not depend on the file system or Ruby version.
  Dir[source + "/*"].sort.each { |f| handle_sutra(f) }
end
|
313
|
+
|
314
|
+
# Convert every volume whose folder name lies between v1 and v2 inclusive
# (plain string comparison) inside the collection named by v1's first
# character.
#
# @param v1 [String] first volume ID of the range
# @param v2 [String] last volume ID of the range
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = v1[0]
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) if vol >= v1 && vol <= v2
  end
end
|
324
|
+
|
325
|
+
# Read an XML file and return it as a Nokogiri document with all
# namespaces removed.
#
# @param fn [String] path of the XML file
def open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |doc| doc.remove_namespaces! }
end
|
331
|
+
|
332
|
+
# Open an XML file and return the rendered content of the first element
# matched by "text/body" under the document root.
#
# @param xml_fn [String] path of the XML file
# @return [String]
def parse_xml(xml_fn)
  body = open_xml(xml_fn).root.xpath("text/body").first
  traverse(body)
end
|
339
|
+
|
340
|
+
# Render every child of a node with handle_node and return the results
# concatenated in document order.
#
# @param e node whose children are rendered
# @return [String] '' when the node has no children
def traverse(e)
  # Append in place: `+=` would copy the whole accumulated string per child.
  out = ''.dup
  e.children.each { |child| out << handle_node(child) }
  out
end
|
348
|
+
|
349
|
+
# Write one juan as one HTML file per edition.
#
# The files go to a folder named after the zero-padded juan number under
# @out_sutra. For each edition in @editions the intermediate markup is
# parsed afresh; every <r> marker whose 'w' attribute names that edition is
# replaced by its text, every other <r> marker is removed, and the result
# is written to "<edition name without brackets>.html".
#
# @param juan_no [Integer] juan number
# @param txt [String] intermediate markup for this juan
def write_juan(juan_no, txt)
  folder = File.join(@out_sutra, "%03d" % juan_no)
  FileUtils.makedirs(folder)

  # The page header is the same for every edition, so build it once.
  header = <<-END.gsub(/^\s+\|/, '')
    |<!DOCTYPE html>
    |<html>
    |<head>
    | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    |</head>
    |<body>
  END

  @editions.each do |ed|
    # Parse per edition because the fragment is modified below.
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      # 'w' may list several witnesses, so test for membership rather than
      # equality; otherwise multi-witness readings match no edition at all.
      if node['w'].to_s.include?(ed)
        # Insert a real text node: passing a String would be re-parsed as
        # markup and mangle text containing & or <.
        node.add_previous_sibling(Nokogiri::XML::Text.new(node.text, frag.document))
      end
      node.remove
    end

    text = header + to_html(frag) + "</body></html>"

    fn = ed.sub(/^【(.*?)】$/, '\1')
    File.write(File.join(folder, "#{fn}.html"), text)
  end
end
|
377
|
+
|
378
|
+
# Serialize a node as UTF-8 XML using the AS_XML save option.
#
# @param e node or fragment to serialize
# @return [String]
def to_html(e)
  save_options = {
    :encoding => 'UTF-8',
    :save_with => Nokogiri::XML::Node::SaveOptions::AS_XML
  }
  e.to_xml(save_options)
end
|
381
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-08-
|
11
|
+
date: 2015-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|
@@ -23,6 +23,7 @@ files:
|
|
23
23
|
- lib/cbeta/gaiji.rb
|
24
24
|
- lib/cbeta/html_to_text.rb
|
25
25
|
- lib/cbeta/p5a_to_html.rb
|
26
|
+
- lib/cbeta/p5a_to_simple_html.rb
|
26
27
|
- lib/cbeta/p5a_to_text.rb
|
27
28
|
- lib/cbeta/unicode-1.1.json
|
28
29
|
homepage: https://github.com/RayCHOU/ruby-cbeta
|