cbeta 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2c7d0e6840d7cffe12511711a2eceb70238eb409
4
- data.tar.gz: c1cd4e44a14ff12d1cc064070755108ba038b584
3
+ metadata.gz: 9f5658a7ccfa7ca6434b02a5ef71f576a92553da
4
+ data.tar.gz: 8cfa8c42a807f60c578880606085ab10cc166aea
5
5
  SHA512:
6
- metadata.gz: 90d25d4388575498f7ddfc80fcd5ef93e651a120142d915cf5c3f58e32344d55ec51eebc6f7d19b87a44dd051d8bb1d3394f48a15b4e5487ffd060077fafecfe
7
- data.tar.gz: 98d334a45c027e06aaea09ee0dfcf68eefc1e9d96e83016b346c794b145b9bff33ad0f564c824e1f6dc4fb8eec99a168a12aacde9cd84059cb1fcb8bbed3dcbf
6
+ metadata.gz: 88a7cd77d5c37d6071d9260f957c17bbf1cd58c43f8cc0447ab0b6ee19ecf837ce1a729eeda9c6086ff3130b94ce01fabeff3865ea390b74a756188ec6b6fe35
7
+ data.tar.gz: 0a2fb196c6d18a3e24c0b4e1b6305b3790709f37539d8922b743a139955f0e0e2ebec08ec7ff2658ef2e2c032dd77dbc24341418dab3135aa4ca2303e1e438fe
@@ -50,5 +50,6 @@ end
50
50
  require 'cbeta/gaiji'
51
51
  require 'cbeta/bm_to_text'
52
52
  require 'cbeta/p5a_to_html'
53
+ require 'cbeta/p5a_to_simple_html'
53
54
  require 'cbeta/p5a_to_text'
54
55
  require 'cbeta/html_to_text'
@@ -0,0 +1,381 @@
1
+ require 'cgi'
2
+ require 'fileutils'
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'set'
6
+
7
+ # Convert CBETA XML P5a to simple HTML
8
+ #
9
+ # CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
10
+ #
11
+ # @example for convert 大正藏第一冊:
12
+ #
13
+ # c = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
14
+ # c.convert('T01')
15
+ #
16
+ class CBETA::P5aToSimpleHTML
17
+
18
# Set up a converter from CBETA XML P5a to simple HTML.
#
# @param xml_root [String] root folder of the CBETA XML P5a source files
# @param output_root [String] root folder that receives the generated files
def initialize(xml_root, output_root)
  @xml_root, @output_root = xml_root, output_root
  @cbeta  = CBETA.new
  @gaijis = CBETA::Gaiji.new

  # Load the Unicode 1.1 character list stored next to this source file.
  unicode_list = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(unicode_list))
end
31
+
32
# Convert CBETA XML P5a into simple HTML.
#
# @example convert volume 1 of the Taisho canon:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T01')
#
# @example convert the whole Taisho canon:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T')
#
# @example convert volumes 5 through 7 of the Taisho canon:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T05..T07')
#
# T is the ID of the Taisho canon; for the CBETA canon ID scheme see
# http://www.cbeta.org/format/id.php
#
# @param target [String, nil] a one-letter canon ID, a volume ID, a volume
#   range written as 'FIRST..LAST', or nil to convert everything found under
#   the XML root. The value is upper-cased before use.
def convert(target=nil)
  return convert_all if target.nil?

  arg = target.upcase
  return handle_collection(arg) if arg.size == 1
  return handle_vol(arg) unless arg.include? '..'

  # A range that does not fit FIRST..LAST (e.g. 'T05..') matches nothing and
  # is silently ignored.
  range = arg.match(/^([^\.]+?)\.\.([^\.]+)$/)
  handle_vols(range[1], range[2]) if range
end
66
+
67
+ private
68
+
69
# Convert every collection found directly under the XML root.
# Only directory entries whose name is a single uppercase letter are handed
# to handle_collection; everything else is skipped.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
75
+
76
# Render an <anchor> element.
#
# @param e the anchor element (must respond to has_attribute? and [])
# @return [String] '◎' when the type attribute is 'circle', otherwise ''
def handle_anchor(e)
  circle = e.has_attribute?('type') && e['type'] == 'circle'
  circle ? '◎' : ''
end
85
+
86
# Convert every volume folder of one collection.
#
# Records the collection as @series, then passes each entry of the
# collection folder to handle_vol, skipping '.', '..' and '.DS_Store'.
#
# @param c [String] collection ID, used as the folder name under the XML root
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c
  skipped = ['.', '..', '.DS_Store']
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless skipped.include?(vol)
  end
end
95
+
96
# Render a <corr> element: its converted content wrapped in an <r> marker
# whose w attribute is the 【CBETA】 edition.
#
# @param e the corr element
# @return [String]
def handle_corr(e)
  "<r w='【CBETA】'>#{traverse(e)}</r>"
end
99
+
100
# Render a <g> (gaiji) element as text.
#
# The gaiji ID is the ref attribute without its first character. Lookup
# order:
#   1. IDs starting with 'SD': 'SD-E35A' => '(', 'SD-E35B' => ')',
#      anything else => the entry's 'roman' value
#   2. IDs starting with 'RJ' => the entry's 'roman' value
#   3. entry has 'unicode' => its 'unicode-char' value
#   4. entry has 'normal_unicode' => that value
#   5. entry has 'normal' (normalized form) => that value
#   6. otherwise a Unicode PUA character: 0xF0000 + the integer parsed from
#      the ID after its first two characters
#
# Aborts the process when the gaiji table has no entry for the ID.
#
# @param e the g element; only e['ref'] is read
# @return [String]
def handle_g(e)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  if gid.start_with?('SD')
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return g['roman']
  end

  return g['roman'] if gid.start_with?('RJ')
  return g['unicode-char'] if g.has_key?('unicode')
  return g['normal_unicode'] if g.has_key?('normal_unicode')
  return g['normal'] if g.has_key?('normal')

  # Unicode PUA
  [0xf0000 + gid[2..-1].to_i].pack 'U'
end
133
+
134
# Render an <lb> (line break) element as an anchor whose id is 'lb' plus the
# element's n attribute.
#
# If text was queued in @next_line_buf (handle_t queues the second <t> of a
# group there), it is emitted right after the anchor followed by a newline,
# and the buffer is reset.
#
# @param e the lb element; only e['n'] is read
# @return [String]
def handle_lb(e)
  anchor = "<a id='lb#{e['n']}'/>"
  return anchor if @next_line_buf.empty?

  pending = @next_line_buf
  @next_line_buf = ''
  anchor + pending + "\n"
end
142
+
143
# Render a <lem> element.
#
# The content is converted first; then every 【…】 witness found in the wit
# attribute is added to @editions, and the content is wrapped in an <r>
# marker whose w attribute lists those witnesses joined by single spaces.
#
# NOTE(review): write_juan keeps an <r> only when its w equals one edition
# exactly, so a space-joined list of several witnesses never matches —
# confirm this is intended.
#
# @param e the lem element; e['wit'] must be present
# @return [String]
def handle_lem(e)
  r = traverse(e)
  w = e['wit'].scan(/【.*?】/)
  @editions.merge w
  "<r w='#{w.join(' ')}'>#{r}</r>"
end
151
+
152
# Render a <milestone> element.
#
# A milestone whose unit is 'juan' stores its number in @juan and emits a
# "<juan N>" marker; handle_sutra later splits the text on these markers.
# Any other milestone produces an empty string.
#
# @param e the milestone element; e['unit'] and e['n'] are read
# @return [String]
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  "<juan #{@juan}>"
end
160
+
161
# Convert one XML node to its output string by dispatching on node kind and
# element name.
#
# Comments produce ''. Text nodes go to handle_text. Elements with a
# dedicated handler are passed to it; 'back', 'foreign', 'graphic', 'mulu',
# 'reg' and 'teiHeader' are dropped; every other element is replaced by the
# conversion of its children.
#
# @param e the node (must respond to comment?, text? and name)
# @return [String]
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  # PASS is not defined anywhere in this file, so an unguarded reference
  # raises NameError on the first element node. Honour it only when some
  # other file provides it; 'back' and 'teiHeader' are covered below anyway.
  return '' if defined?(PASS) && PASS.include?(e.name)

  case e.name
  when 'anchor'    then handle_anchor(e)
  when 'corr'      then handle_corr(e)
  when 'g'         then handle_g(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'note'      then handle_note(e)
  when 'milestone' then handle_milestone(e)
  when 'rdg'       then handle_rdg(e)
  when 'sic'       then handle_sic(e)
  when 'sg'        then handle_sg(e)
  when 'tt'        then handle_tt(e)
  when 't'         then handle_t(e)
  when 'back', 'foreign', 'graphic', 'mulu', 'reg', 'teiHeader' then ''
  else traverse(e)
  end
end
188
+
189
# Render a <note> element.
#
# Only notes whose place attribute is 'inline' are kept; their converted
# content is wrapped in parentheses. All other notes produce ''.
#
# @param e the note element
# @return [String]
def handle_note(e)
  inline = e.has_attribute?('place') && e['place']=='inline'
  return '' unless inline

  "(#{traverse(e)})"
end
196
+
197
# Render an <rdg> (variant reading) element.
#
# The content is converted first; every 【…】 witness found in the wit
# attribute is added to @editions; the content is then wrapped in an <r>
# marker whose w attribute is the raw wit value.
#
# NOTE(review): handle_lem writes the witnesses joined by spaces while this
# method writes the raw attribute — confirm which form write_juan expects.
#
# @param e the rdg element; e['wit'] must be present
# @return [String]
def handle_rdg(e)
  content = traverse(e)
  @editions.merge(e['wit'].scan(/【.*?】/))
  "<r w='#{e['wit']}'>#{content}</r>"
end
203
+
204
# Render an <sg> element: its converted content wrapped in parentheses.
#
# @param e the sg element
# @return [String]
def handle_sg(e)
  "(#{traverse(e)})"
end
207
+
208
# Render a <sic> element: its converted content wrapped in an <r> marker
# whose w attribute is @orig (set by handle_vol for the current volume).
#
# @param e the sic element
# @return [String]
def handle_sic(e)
  "<r w='#{@orig}'>#{traverse(e)}</r>"
end
211
+
212
# Convert one sutra XML file and write one output folder per juan.
#
# Resets the per-sutra state, converts the file with parse_xml, creates the
# sutra folder under @out_vol, then splits the converted text on the
# "<juan N>" markers emitted by handle_milestone and passes each juan to
# write_juan. Text found before the first marker is prepended to the first
# juan.
#
# NOTE(review): if the converted text contains no juan marker at all,
# write_juan is never called and nothing is written — confirm intended.
#
# @param xml_fn [String] path of the sutra XML file
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"
  @dila_note = 0
  @div_count = 0
  @editions = Set.new ["【CBETA】"]
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes; CBETA splits it into multiple
  # files with a trailing a, b, c... on the file name. Drop that suffix for
  # the output folder.
  if @sutra_no.match(/^(T05|T06|T07)n0220/)
    @sutra_no = "#{$1}n0220"
  end

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  juan_no = nil
  buf = ''
  # One juan per output folder.
  text.split(/(<juan \d+>)/).each do |part|
    if part =~ /<juan (\d+)>$/
      juan_no = $1.to_i
    elsif juan_no.nil?
      # Text before the first marker: hold it for the first juan.
      buf = part
    else
      write_juan(juan_no, buf + part)
      buf = ''
    end
  end
end
256
+
257
# Render a <t> element.
#
# A <t> whose place attribute contains 'foot' is dropped. When the current
# @tt_type is 'app' the converted content is returned as is. Otherwise the
# element's position among its sibling <t> elements decides:
#   first  - content followed by a space
#   second - content plus a space is queued in @next_line_buf (emitted by
#            handle_lb) and '' is returned
#   other  - content unchanged
#
# @param e the t element
# @return [String]
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  r = traverse(e)

  # Not a two-line parallel text.
  return r if @tt_type == 'app'

  # Two-line parallel text.
  position = e.xpath('../t').index(e)
  if position == 0
    r + ' '
  elsif position == 1
    @next_line_buf += r + ' '
    ''
  else
    r
  end
end
278
+
279
# Render a text node.
#
# Returns '' when the chomped content is empty or when the node is a direct
# child of an <app> element. Otherwise all line breaks are removed (the
# CBETA XML contains superfluous newlines inside text) and the result is
# HTML-escaped, so that e.g. & becomes &amp;.
#
# @param e the text node (must respond to content and parent)
# @return [String]
def handle_text(e)
  s = e.content().chomp
  return '' if s.empty? || e.parent.name == 'app'

  CGI.escapeHTML(s.gsub(/[\n\r]/, ''))
end
290
+
291
# Render a <tt> element: remember its type attribute in @tt_type (read by
# handle_t) and convert its children.
#
# NOTE(review): @tt_type is not restored after the element has been
# processed — confirm that nested <tt> elements cannot occur.
#
# @param e the tt element
# @return [String]
def handle_tt(e)
  kind = e['type']
  @tt_type = kind
  traverse(e)
end
295
+
296
# Convert every sutra file of one volume.
#
# Looks up the canon abbreviation for the volume's first character (stored
# in @orig) and aborts the process when there is none. The volume's output
# folder is deleted and recreated, so output from an earlier run is
# discarded, and each entry of the source volume folder is passed to
# handle_sutra.
#
# @param vol [String] volume ID; its first character is the collection ID
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  @orig = @cbeta.get_canon_abbr(vol[0])
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @series = vol[0]
  @out_vol = File.join(@output_root, @series, vol)
  # Second argument is `force`: do not fail when the folder does not exist.
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  source = File.join(@xml_root, @series, vol)
  Dir[source+"/*"].each { |f| handle_sutra(f) }
end
313
+
314
# Convert a range of volumes of one collection.
#
# The collection is the first character of v1. Every entry of the
# collection folder whose name sorts between v1 and v2 (inclusive, plain
# string comparison) is passed to handle_vol.
#
# @param v1 [String] first volume ID of the range
# @param v2 [String] last volume ID of the range
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = v1[0]
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless vol < v1 || vol > v2
  end
end
324
+
325
# Read an XML file and parse it with Nokogiri, with all namespaces removed
# from the resulting document.
#
# @param fn [String] path of the XML file
# @return the parsed document
def open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |doc| doc.remove_namespaces! }
end
331
+
332
# Open a sutra XML file and convert the first text/body element found under
# the document root.
#
# @param xml_fn [String] path of the XML file
# @return [String] the converted body
def parse_xml(xml_fn)
  body = open_xml(xml_fn).root.xpath("text/body")[0]
  traverse(body)
end
339
+
340
# Convert all children of a node, in document order, and concatenate the
# results.
#
# @param e the parent node (must respond to children)
# @return [String]
def traverse(e)
  e.children.inject('') { |out, child| out + handle_node(child) }
end
348
+
349
# Write one juan as a set of HTML files, one per edition in @editions.
#
# The files go into a folder named after the zero-padded three-digit juan
# number under @out_sutra. For each edition the text is parsed again as an
# XML fragment; every <r> marker whose w attribute equals the edition is
# replaced by its text, all other <r> markers are removed, and the result is
# wrapped in a minimal HTML page. The file is named after the edition with
# its surrounding 【 】 stripped, plus '.html'.
#
# NOTE(review): the comparison is an exact match, so an <r> whose w lists
# several witnesses never matches a single edition — confirm intended.
#
# @param juan_no [Integer] juan number
# @param txt [String] converted text of the juan, still containing <r> markers
def write_juan(juan_no, txt)
  folder = File.join(@out_sutra, "%03d" % juan_no)
  FileUtils.makedirs(folder)

  # Page header shared by every edition.
  header = <<-END.gsub(/^\s+\|/, '')
    |<!DOCTYPE html>
    |<html>
    |<head>
    | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    |</head>
    |<body>
  END

  @editions.each do |ed|
    # Parse afresh for each edition because the fragment is modified in place.
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |reading|
      reading.add_previous_sibling(reading.text) if reading['w'] == ed
      reading.remove
    end

    html = header + to_html(frag) + "</body></html>"
    basename = ed.sub(/^【(.*?)】$/, '\1')
    File.write(File.join(folder, "#{basename}.html"), html)
  end
end
377
+
378
# Serialize a node with to_xml, using UTF-8 encoding and Nokogiri's AS_XML
# save option.
#
# @param e the node or fragment to serialize
# @return [String]
def to_html(e)
  as_xml = Nokogiri::XML::Node::SaveOptions::AS_XML
  e.to_xml(encoding: 'UTF-8', save_with: as_xml)
end
381
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-12 00:00:00.000000000 Z
11
+ date: 2015-08-13 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
14
14
  email: zhoubx@gmail.com
@@ -23,6 +23,7 @@ files:
23
23
  - lib/cbeta/gaiji.rb
24
24
  - lib/cbeta/html_to_text.rb
25
25
  - lib/cbeta/p5a_to_html.rb
26
+ - lib/cbeta/p5a_to_simple_html.rb
26
27
  - lib/cbeta/p5a_to_text.rb
27
28
  - lib/cbeta/unicode-1.1.json
28
29
  homepage: https://github.com/RayCHOU/ruby-cbeta