cbeta 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2c7d0e6840d7cffe12511711a2eceb70238eb409
4
- data.tar.gz: c1cd4e44a14ff12d1cc064070755108ba038b584
3
+ metadata.gz: 9f5658a7ccfa7ca6434b02a5ef71f576a92553da
4
+ data.tar.gz: 8cfa8c42a807f60c578880606085ab10cc166aea
5
5
  SHA512:
6
- metadata.gz: 90d25d4388575498f7ddfc80fcd5ef93e651a120142d915cf5c3f58e32344d55ec51eebc6f7d19b87a44dd051d8bb1d3394f48a15b4e5487ffd060077fafecfe
7
- data.tar.gz: 98d334a45c027e06aaea09ee0dfcf68eefc1e9d96e83016b346c794b145b9bff33ad0f564c824e1f6dc4fb8eec99a168a12aacde9cd84059cb1fcb8bbed3dcbf
6
+ metadata.gz: 88a7cd77d5c37d6071d9260f957c17bbf1cd58c43f8cc0447ab0b6ee19ecf837ce1a729eeda9c6086ff3130b94ce01fabeff3865ea390b74a756188ec6b6fe35
7
+ data.tar.gz: 0a2fb196c6d18a3e24c0b4e1b6305b3790709f37539d8922b743a139955f0e0e2ebec08ec7ff2658ef2e2c032dd77dbc24341418dab3135aa4ca2303e1e438fe
@@ -50,5 +50,6 @@ end
50
50
  require 'cbeta/gaiji'
51
51
  require 'cbeta/bm_to_text'
52
52
  require 'cbeta/p5a_to_html'
53
+ require 'cbeta/p5a_to_simple_html'
53
54
  require 'cbeta/p5a_to_text'
54
55
  require 'cbeta/html_to_text'
@@ -0,0 +1,381 @@
1
+ require 'cgi'
2
+ require 'fileutils'
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'set'
6
+
7
+ # Convert CBETA XML P5a to simple HTML
8
+ #
9
+ # CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
10
+ #
11
+ # @example for convert 大正藏第一冊:
12
+ #
13
+ # c = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
14
+ # c.convert('T01')
15
+ #
16
+ class CBETA::P5aToSimpleHTML
17
+
18
# Creates a converter that reads CBETA XML P5a files and writes simple HTML.
#
# @param xml_root [String] root folder of the CBETA XML P5a source files
# @param output_root [String] root folder that receives the generated HTML
def initialize(xml_root, output_root)
  @xml_root = xml_root
  @output_root = output_root
  @cbeta = CBETA.new
  @gaijis = CBETA::Gaiji.new

  # Load the Unicode 1.1 character list stored next to this source file.
  unicode_list = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
  @unicode1 = JSON.parse(File.read(unicode_list))
end
31
+
32
# Converts CBETA XML P5a into simple HTML.
#
# @example Convert volume 1 of the Taisho canon:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T01')
#
# @example Convert the whole Taisho canon:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T')
#
# @example Convert Taisho volumes 5 through 7:
#
#   x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T05..T07')
#
# T is the ID of the Taisho canon; for the CBETA canon ID scheme see
# http://www.cbeta.org/format/id.php
#
# @param target [String, nil] a one-letter canon ID, a volume ID, a
#   "FIRST..LAST" volume range, or nil to convert everything under the XML root
def convert(target=nil)
  return convert_all if target.nil?

  arg = target.upcase
  # A single letter names a whole canon.
  return handle_collection(arg) if arg.size == 1
  # Anything without ".." is a single volume.
  return handle_vol(arg) unless arg.include?('..')

  # A range that does not have the exact FIRST..LAST shape is ignored.
  range = arg.match(/^([^\.]+?)\.\.([^\.]+)$/)
  handle_vols(range[1], range[2]) if range
end
66
+
67
+ private
68
+
69
# Converts every canon found under the XML root. Only directory entries
# whose name is a single uppercase letter are treated as canons.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
75
+
76
# Renders an <anchor> element: a circle mark when its type attribute is
# "circle", otherwise nothing.
#
# @param e the anchor element
# @return [String]
def handle_anchor(e)
  circle = e.has_attribute?('type') && e['type'] == 'circle'
  circle ? '◎' : ''
end
85
+
86
# Converts every volume folder of one canon.
#
# @param c [String] canon ID, used as the folder name under the XML root
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c
  skipped = ['.', '..', '.DS_Store']
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless skipped.include?(vol)
  end
end
95
+
96
# Wraps the content of a <corr> element in an <r> element attributed to the
# CBETA edition.
#
# @param e the corr element
# @return [String]
def handle_corr(e)
  inner = traverse(e)
  "<r w='【CBETA】'>#{inner}</r>"
end
99
+
100
# Resolves a <g> (gaiji) reference to the text that represents it.
#
# Lookup order:
# 1. IDs starting with "SD": SD-E35A is "(", SD-E35B is ")", every other ID
#    uses the record's 'roman' value.
# 2. IDs starting with "RJ": the record's 'roman' value.
# 3. A record with a 'unicode' key: its 'unicode-char' value.
# 4. A record with a 'normal_unicode' key: that value.
# 5. A record with a 'normal' key: that value.
# 6. Otherwise a private-use code point, 0xF0000 plus the decimal number
#    that follows the two-letter ID prefix.
#
# Aborts the process when the ID has no gaiji record.
#
# @param e the g element; its 'ref' attribute is the gaiji ID preceded by
#   one extra character, which is stripped
# @return [String]
def handle_g(e)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  if gid.start_with?('SD')
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return g['roman']
  end

  return g['roman'] if gid.start_with?('RJ')
  return g['unicode-char'] if g.has_key?('unicode')
  return g['normal_unicode'] if g.has_key?('normal_unicode')
  return g['normal'] if g.has_key?('normal')

  # Unicode PUA
  [0xf0000 + gid[2..-1].to_i].pack 'U'
end
133
+
134
# Renders a line-break marker as an anchor named after the line number and
# flushes any text that was queued for the start of the next line.
#
# @param e the lb element; its 'n' attribute is the line number
# @return [String]
def handle_lb(e)
  anchor = "<a id='lb#{e['n']}'/>"
  return anchor if @next_line_buf.empty?

  queued = @next_line_buf
  @next_line_buf = ''
  anchor + queued + "\n"
end
142
+
143
# Wraps the content of a <lem> element in an <r> element whose 'w'
# attribute lists its witnesses separated by spaces, and records each
# witness in @editions.
#
# @param e the lem element; witnesses are the 【…】 groups in its 'wit' attribute
# @return [String]
def handle_lem(e)
  inner = traverse(e)
  sigla = e['wit'].scan(/【.*?】/)
  @editions.merge sigla
  "<r w='#{sigla.join(' ')}'>#{inner}</r>"
end
151
+
152
# Emits a "<juan N>" marker for a juan milestone and remembers N in @juan.
# Milestones of any other unit produce no output.
#
# @param e the milestone element
# @return [String]
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  "<juan #{@juan}>"
end
160
+
161
# Converts one XML node to output text by dispatching on its kind and name.
#
# Comments produce nothing, text nodes go to handle_text, elements with a
# dedicated handler go to it, a fixed set of elements is dropped, and every
# other element is replaced by the conversion of its children.
#
# @param e the node to convert
# @return [String]
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  # PASS is not defined in this class, so only consult it when another file
  # has made it visible here; an unguarded reference would raise NameError.
  # 'back' and 'teiHeader' are dropped by the case below either way.
  return '' if defined?(PASS) && PASS.include?(e.name)

  case e.name
  when 'anchor'    then handle_anchor(e)
  when 'corr'      then handle_corr(e)
  when 'g'         then handle_g(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'note'      then handle_note(e)
  when 'milestone' then handle_milestone(e)
  when 'rdg'       then handle_rdg(e)
  when 'sic'       then handle_sic(e)
  when 'sg'        then handle_sg(e)
  when 'tt'        then handle_tt(e)
  when 't'         then handle_t(e)
  when 'back', 'foreign', 'graphic', 'mulu', 'reg', 'teiHeader' then ''
  else traverse(e)
  end
end
188
+
189
# Renders an inline note in parentheses; every other note is dropped.
#
# @param e the note element
# @return [String]
def handle_note(e)
  inline = e.has_attribute?('place') && e['place']=='inline'
  inline ? "(#{traverse(e)})" : ''
end
196
+
197
# Wraps the content of an <rdg> element in an <r> element carrying the raw
# 'wit' attribute, and records each 【…】 witness in @editions.
#
# @param e the rdg element
# @return [String]
def handle_rdg(e)
  inner = traverse(e)
  @editions.merge(e['wit'].scan(/【.*?】/))
  "<r w='#{e['wit']}'>#{inner}</r>"
end
203
+
204
# Renders the content of an <sg> element inside parentheses.
#
# @param e the sg element
# @return [String]
def handle_sg(e)
  inner = traverse(e)
  '(' + inner + ')'
end
207
+
208
# Wraps the content of a <sic> element in an <r> element attributed to the
# value of @orig.
#
# @param e the sic element
# @return [String]
def handle_sic(e)
  opening = "<r w='#{@orig}'>"
  opening + traverse(e) + "</r>"
end
211
+
212
# Converts one sutra XML file and writes one output per juan.
#
# Resets the per-sutra state, converts the XML to a single string, creates
# the sutra's output folder under @out_vol, then splits the string on the
# "<juan N>" markers and passes each juan to write_juan. Text that precedes
# the first marker is prepended to the first juan. A text without any marker
# produces no output.
#
# @param xml_fn [String] path of the sutra XML file
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"
  @dila_note = 0
  @div_count = 0
  @editions = Set.new ["【CBETA】"]
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes; CBETA splits it into multiple
  # files with a trailing a, b, c... Drop that suffix for the output name.
  if (multi_volume = @sutra_no.match(/^(T05|T06|T07)n0220/))
    @sutra_no = "#{multi_volume[1]}n0220"
  end

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  juan_no = nil
  buf = ''
  # One output per juan. The capturing split keeps the markers as their own
  # pieces, so pieces alternate between text and "<juan N>".
  text.split(/(<juan \d+>)/).each do |piece|
    if piece =~ /<juan (\d+)>$/
      juan_no = $1.to_i
    elsif juan_no.nil?
      # Text before the first marker: hold it for the first juan.
      buf = piece
    else
      write_juan(juan_no, buf + piece)
      buf = ''
    end
  end
end
256
+
257
# Renders a <t> element.
#
# Elements whose 'place' attribute contains "foot" are dropped. When the
# current @tt_type is "app" the content is returned unchanged. Otherwise the
# element's position among the <t> children of its parent decides the
# output: the first is emitted followed by an ideographic space, the second
# is queued in @next_line_buf (with the same space) instead of being
# emitted, and any later one is emitted unchanged.
#
# @param e the t element
# @return [String]
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  content = traverse(e)

  # Not a two-line parallel layout.
  return content if @tt_type == 'app'

  # Two-line parallel layout.
  position = e.xpath('../t').index(e)
  if position == 0
    content + ' '
  elsif position == 1
    @next_line_buf += content + ' '
    ''
  else
    content
  end
end
278
+
279
# Converts a text node to HTML-escaped text with line breaks removed.
# Empty text and text that sits directly inside an <app> element produce
# nothing.
#
# @param e the text node
# @return [String]
def handle_text(e)
  raw = e.content().chomp
  return '' if raw.empty? || e.parent.name == 'app'

  # CBETA XML contains superfluous line breaks inside running text.
  single_line = raw.gsub(/[\n\r]/, '')

  # Escape & and the other HTML special characters.
  CGI.escapeHTML(single_line)
end
290
+
291
# Records the 'type' attribute of a <tt> element in @tt_type, then converts
# the element's children.
#
# @param e the tt element
# @return [String]
def handle_tt(e)
  kind = e['type']
  @tt_type = kind
  traverse(e)
end
295
+
296
# Converts every sutra file of one volume.
#
# Looks up the base-edition label for the volume's canon (the first
# character of the volume ID) and aborts when there is none. The volume's
# output folder is deleted and recreated before conversion.
#
# @param vol [String] volume ID, e.g. "T01"
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  canon = vol[0]
  @orig = @cbeta.get_canon_abbr(canon)
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @series = canon
  @out_vol = File.join(@output_root, @series, vol)
  # Second argument is remove_dir's force flag.
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  pattern = File.join(@xml_root, @series, vol) + "/*"
  Dir[pattern].each { |f| handle_sutra(f) }
end
313
+
314
# Converts every volume of one canon whose folder name lies between v1 and
# v2 inclusive, compared as strings.
#
# @param v1 [String] first volume ID; its first character selects the canon
# @param v2 [String] last volume ID
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = v1[0]
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless vol < v1 || vol > v2
  end
end
324
+
325
# Reads an XML file and parses it with Nokogiri, with all namespaces
# removed from the document.
#
# @param fn [String] path of the XML file
# @return the parsed document
def open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |doc| doc.remove_namespaces!() }
end
331
+
332
# Opens an XML file and converts the first text/body element under its root.
#
# @param xml_fn [String] path of the XML file
# @return [String] the converted body
def parse_xml(xml_fn)
  root = open_xml(xml_fn).root()
  traverse(root.xpath("text/body").first)
end
339
+
340
# Converts every child of a node and concatenates the results in order.
#
# @param e the parent node
# @return [String]
def traverse(e)
  e.children.inject('') { |out, child| out + handle_node(child) }
end
348
+
349
# Writes one HTML file per edition for a single juan.
#
# The juan text contains <r w='…'> elements that hold edition-specific
# readings. For each edition in @editions, every <r> whose 'w' attribute
# names that edition is replaced by its text, every other <r> is removed,
# and the result is saved as "<edition>.html" in a folder named after the
# zero-padded juan number.
#
# @param juan_no [Integer] juan number
# @param txt [String] converted text of the juan
def write_juan(juan_no, txt)
  folder = File.join(@out_sutra, "%03d" % juan_no)
  FileUtils.makedirs(folder)

  # The page header is the same for every edition, so build it once.
  header = <<-END.gsub(/^\s+\|/, '')
    |<!DOCTYPE html>
    |<html>
    |<head>
    | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    |</head>
    |<body>
  END

  @editions.each do |ed|
    # Parse afresh for each edition because the fragment is edited in place.
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      # 'w' may list several witnesses (space-separated from handle_lem, or
      # the raw wit attribute from handle_rdg), so test for membership
      # rather than equality.
      if node['w'].to_s.include?(ed)
        # Insert a real text node: a plain string would be parsed as markup,
        # which mangles text containing & or <.
        node.add_previous_sibling(Nokogiri::XML::Text.new(node.text, node.document))
      end
      node.remove
    end

    html = header + to_html(frag) + "</body></html>"

    # File name is the edition label without its 【】 brackets.
    name = ed.sub(/^【(.*?)】$/, '\1')
    File.write(File.join(folder, "#{name}.html"), html)
  end
end
377
+
378
# Serializes a node as UTF-8 XML using Nokogiri's AS_XML save option.
#
# @param e the node or fragment to serialize
# @return [String]
def to_html(e)
  save_mode = Nokogiri::XML::Node::SaveOptions::AS_XML
  e.to_xml(encoding: 'UTF-8', save_with: save_mode)
end
381
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-12 00:00:00.000000000 Z
11
+ date: 2015-08-13 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
14
14
  email: zhoubx@gmail.com
@@ -23,6 +23,7 @@ files:
23
23
  - lib/cbeta/gaiji.rb
24
24
  - lib/cbeta/html_to_text.rb
25
25
  - lib/cbeta/p5a_to_html.rb
26
+ - lib/cbeta/p5a_to_simple_html.rb
26
27
  - lib/cbeta/p5a_to_text.rb
27
28
  - lib/cbeta/unicode-1.1.json
28
29
  homepage: https://github.com/RayCHOU/ruby-cbeta