cbeta 2.2.6 → 2.2.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +38 -5
- data/lib/cbeta/gaiji.rb +0 -1
- data/lib/cbeta/html_to_text.rb +1 -1
- data/lib/cbeta/p5a_to_html.rb +8 -16
- data/lib/cbeta/p5a_to_html_for_every_edition.rb +10 -21
- data/lib/cbeta/p5a_to_html_for_pdf.rb +3 -10
- data/lib/cbeta/p5a_to_simple_html.rb +3 -7
- data/lib/cbeta/p5a_to_text.rb +116 -120
- data/lib/cbeta/p5a_validator.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a3f8edd88730de817c9ecec347b516f552cd148a
|
4
|
+
data.tar.gz: 3f9b2eb3e8010094ce3ce47b84e261319d8ccac5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7e05cadab483733f7a5c531966fca41fcd453e1da9ae44f64433bf04e72d3ae9b129ea7b4dc0f94c3e7f304d9d5755ec192a572cd463a6528d3088ed4bcab5f0
|
7
|
+
data.tar.gz: 15f31074085847f332c3633f7de85706167c059709ae48a889081b9ebbe26f9e32cf626a07c8492cb29c6aadd2089b11e2dae16a66406d8c44038961047a55eb
|
data/lib/cbeta.rb
CHANGED
@@ -6,8 +6,41 @@
|
|
6
6
|
require 'csv'
|
7
7
|
|
8
8
|
class CBETA
|
9
|
+
CANON = 'DA|GA|GB|[A-Z]'
|
9
10
|
DATA = File.join(File.dirname(__FILE__), 'data')
|
10
11
|
PUNCS = '.[]。,、?「」『』《》<>〈〉〔〕[]【】〖〗'
|
12
|
+
|
13
|
+
# Extracts the canon ID from a linehead string.
#
# @param linehead [String] linehead, e.g. "T01n0001_p0001a01" or "GA009n0008_p0003a01"
# @return [String] canon ID, e.g. "T" or "GA"; a copy of the input when it
#   does not start with a known canon prefix
def self.get_canon_id_from_linehead(linehead)
  # \A anchors at the very start of the string, so a trailing newline or any
  # following lines are dropped instead of leaking into the returned ID.
  linehead[/\A(?:#{CANON})/] || linehead.dup
end
|
19
|
+
|
20
|
+
# Extracts the canon ID from a volume number.
#
# @param vol [String] volume number, e.g. "T01" or "GA009"
# @return [String] canon ID, e.g. "T" or "GA"; a copy of the input when it
#   does not start with a known canon prefix
def self.get_canon_from_vol(vol)
  # \A anchors at the very start of the string, so a trailing newline or any
  # following lines are dropped instead of leaking into the returned ID.
  vol[/\A(?:#{CANON})/] || vol.dup
end
|
26
|
+
|
27
|
+
# Builds the relative path of the XML file that a linehead belongs to.
#
# @param linehead [String] linehead, e.g. "GA009n0008_p0003a01"
# @return [String, nil] relative XML path, e.g. "GA/GA009/GA009n0008.xml";
#   nil when the string does not start with canon + volume digits + "n" + work digits
def self.linehead_to_xml_file_path(linehead)
  # \A (not ^) so that only the beginning of the string can match, never a
  # later line of a multi-line value.
  m = linehead.match(/\A(?<work>(?<vol>(?<canon>#{CANON})\d+)n\d+[a-zA-Z]?)/)
  return nil if m.nil?

  File.join(m[:canon], m[:vol], "#{m[:work]}.xml")
end
|
37
|
+
|
38
|
+
# Derives the work ID from an XML file basename by dropping the volume part.
#
# @param fn [String] file basename, e.g. "T01n0001" or "GA009n0008"
# @return [String] work ID, e.g. "T0001" or "GA0008"; the input is returned
#   unchanged (as a copy) when it does not start with canon + 2-3 digits + "n"
def self.get_work_id_from_file_basename(fn)
  # \A (not ^) so that only the beginning of the string is rewritten.
  fn.sub(/\A(#{CANON})\d{2,3}n(.*)$/, '\1\2')
end
|
11
44
|
|
12
45
|
# 將行首資訊轉為引用格式
|
13
46
|
#
|
@@ -18,7 +51,7 @@ class CBETA
|
|
18
51
|
# CBETA.linehead_to_s('T85n2838_p1291a03')
|
19
52
|
# # return "T85, no. 2838, p. 1291, a03"
|
20
53
|
def self.linehead_to_s(linehead)
|
21
|
-
linehead.match(/^(
|
54
|
+
linehead.match(/^((?:#{CANON})\d+)n(.*)_p(\d+)([a-z]\d+)$/) {
|
22
55
|
return "#{$1}, no. #{$2}, p. #{$3}, #{$4}"
|
23
56
|
}
|
24
57
|
nil
|
@@ -68,12 +101,12 @@ class CBETA
|
|
68
101
|
s = File.read(fn)
|
69
102
|
@categories = JSON.parse(s)
|
70
103
|
end
|
71
|
-
|
104
|
+
|
72
105
|
# @param id [String] 藏經 ID, 例如大正藏的 ID 是 "T"
|
73
106
|
# @return [String] 藏經短名,例如 "大正藏"
|
74
|
-
|
75
|
-
|
76
|
-
|
107
|
+
# Looks up the short name stored for a canon.
#
# @param id [String] canon ID, e.g. "T"
# @return [String, nil] the entry of @canon_nickname for id, e.g. "大正藏";
#   nil when no entry exists
def get_canon_nickname(id)
  # One lookup instead of key? followed by []. The explicit nil fallback
  # keeps the old guarantee that a Hash default is never returned.
  @canon_nickname.fetch(id, nil)
end
|
78
111
|
|
79
112
|
# 取得藏經略符
|
data/lib/cbeta/gaiji.rb
CHANGED
data/lib/cbeta/html_to_text.rb
CHANGED
data/lib/cbeta/p5a_to_html.rb
CHANGED
@@ -540,10 +540,7 @@ class CBETA::P5aToHTML
|
|
540
540
|
text.gsub!(/(<a class='noteAnchor'[^>]*><\/a>)(<div class="lg-cell"[^>]*>)/, '\2\1')
|
541
541
|
|
542
542
|
juans = text.split(/(<juan \d+>)/)
|
543
|
-
open = false
|
544
|
-
fo = nil
|
545
543
|
juan_no = nil
|
546
|
-
fn = ''
|
547
544
|
buf = ''
|
548
545
|
# 一卷一檔
|
549
546
|
juans.each { |j|
|
@@ -618,9 +615,9 @@ class CBETA::P5aToHTML
|
|
618
615
|
abort "未處理底本" if @orig.nil?
|
619
616
|
|
620
617
|
@vol = vol
|
621
|
-
@series = vol
|
618
|
+
@series = CBETA.get_canon_from_vol(vol)
|
622
619
|
@out_folder = File.join(@out_root, @series, vol)
|
623
|
-
FileUtils.remove_dir(@out_folder,
|
620
|
+
FileUtils.remove_dir(@out_folder, true)
|
624
621
|
FileUtils::mkdir_p @out_folder
|
625
622
|
|
626
623
|
source = File.join(@xml_root, @series, vol)
|
@@ -631,7 +628,7 @@ class CBETA::P5aToHTML
|
|
631
628
|
|
632
629
|
def handle_vols(v1, v2)
|
633
630
|
puts "convert volumns: #{v1}..#{v2}"
|
634
|
-
@series = v1
|
631
|
+
@series = CBETA.get_canon_from_vol(v1)
|
635
632
|
folder = File.join(@xml_root, @series)
|
636
633
|
Dir.foreach(folder) { |vol|
|
637
634
|
next if vol < v1
|
@@ -677,16 +674,11 @@ class CBETA::P5aToHTML
|
|
677
674
|
end
|
678
675
|
|
679
676
|
def linehead_exist_in_cbeta(s)
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
path = File.join(@xml_root, corpus, vol, sutra+'.xml')
|
686
|
-
return File.exist? path
|
687
|
-
else
|
688
|
-
return false
|
689
|
-
end
|
677
|
+
fn = CBETA.linehead_to_xml_file_path(s)
|
678
|
+
return false if fn.nil?
|
679
|
+
|
680
|
+
path = File.join(@xml_root, fn)
|
681
|
+
File.exist? path
|
690
682
|
end
|
691
683
|
|
692
684
|
def open_xml(fn)
|
@@ -106,10 +106,7 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
106
106
|
text.gsub!(/(<a class='noteAnchor'[^>]*><\/a>)(<div class="lg-cell"[^>]*>)/, '\2\1')
|
107
107
|
|
108
108
|
juans = text.split(/(<juan \d+>)/)
|
109
|
-
open = false
|
110
|
-
fo = nil
|
111
109
|
juan_no = nil
|
112
|
-
fn = ''
|
113
110
|
buf = ''
|
114
111
|
# 一卷一檔
|
115
112
|
juans.each { |j|
|
@@ -122,8 +119,7 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
122
119
|
buf = ''
|
123
120
|
end
|
124
121
|
}
|
125
|
-
end
|
126
|
-
|
122
|
+
end
|
127
123
|
|
128
124
|
def convert_vol(vol)
|
129
125
|
puts "convert volumn: #{vol}"
|
@@ -414,12 +410,12 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
414
410
|
def handle_lem(e)
|
415
411
|
r = ''
|
416
412
|
content = traverse(e)
|
417
|
-
|
418
|
-
if
|
413
|
+
wit = e['wit']
|
414
|
+
if wit.include? 'CBETA' and not wit.include? @orig
|
419
415
|
n = @notes_dila[@juan].size + 1
|
420
416
|
r = "<a class='noteAnchor dila' href='#dila_note#{n}'></a>"
|
421
417
|
r += "<span class='cbeta'>%s</span>" % content
|
422
|
-
r = "<r w='#{
|
418
|
+
r = "<r w='#{wit}' l='#{@lb}'>#{r}</r>"
|
423
419
|
|
424
420
|
note = lem_note_cf(e)
|
425
421
|
note += lem_note_rdg(e)
|
@@ -625,7 +621,6 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
625
621
|
|
626
622
|
def handle_rdg(e)
|
627
623
|
r = traverse(e)
|
628
|
-
w = e['wit'].scan(/【.*?】/)
|
629
624
|
"<r w='#{e['wit']}' l='#{@lb}' w='#{@char_count}'>#{r}</r>"
|
630
625
|
end
|
631
626
|
|
@@ -788,16 +783,11 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
788
783
|
end
|
789
784
|
|
790
785
|
def linehead_exist_in_cbeta(s)
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
path = File.join(@xml_root, corpus, vol, sutra+'.xml')
|
797
|
-
return File.exist? path
|
798
|
-
else
|
799
|
-
return false
|
800
|
-
end
|
786
|
+
fn = CBETA.linehead_to_xml_file_path(s)
|
787
|
+
return false if fn.nil?
|
788
|
+
|
789
|
+
path = File.join(@xml_root, fn)
|
790
|
+
File.exist? path
|
801
791
|
end
|
802
792
|
|
803
793
|
def open_xml(fn)
|
@@ -872,10 +862,9 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
872
862
|
else
|
873
863
|
work = @sutra_no.sub(/^([A-Z]{1,2})\d{2,3}n(.*)$/, '\1\2')
|
874
864
|
end
|
875
|
-
canon = work[0]
|
876
865
|
juan = "%03d" % juan_no
|
877
866
|
folder = File.join(@out_folder, work, juan)
|
878
|
-
FileUtils.remove_dir(folder,
|
867
|
+
FileUtils.remove_dir(folder, true)
|
879
868
|
FileUtils.makedirs folder
|
880
869
|
|
881
870
|
@editions.each do |ed|
|
@@ -198,8 +198,6 @@ class CBETA::P5aToHTMLForPDF
|
|
198
198
|
end
|
199
199
|
|
200
200
|
def handle_anchor(e)
|
201
|
-
id = e['id']
|
202
|
-
|
203
201
|
if e.has_attribute?('type')
|
204
202
|
if e['type'] == 'circle'
|
205
203
|
return '◎'
|
@@ -496,7 +494,6 @@ class CBETA::P5aToHTMLForPDF
|
|
496
494
|
end
|
497
495
|
|
498
496
|
def handle_note(e)
|
499
|
-
n = e['n']
|
500
497
|
if e.has_attribute?('type')
|
501
498
|
t = e['type']
|
502
499
|
if %w(equivalent orig orig_biao orig_ke mod rest).include? t
|
@@ -638,13 +635,9 @@ class CBETA::P5aToHTMLForPDF
|
|
638
635
|
abort "未處理底本" if @orig.nil?
|
639
636
|
|
640
637
|
@vol = vol
|
641
|
-
|
642
|
-
@series = 'DA'
|
643
|
-
else
|
644
|
-
@series = vol[0]
|
645
|
-
end
|
638
|
+
@series = CBETA.get_canon_from_vol(vol)
|
646
639
|
@out_folder = File.join(@out_root, @series, vol)
|
647
|
-
FileUtils.remove_dir(@out_folder,
|
640
|
+
FileUtils.remove_dir(@out_folder, true)
|
648
641
|
FileUtils::mkdir_p @out_folder
|
649
642
|
|
650
643
|
source = File.join(@xml_root, @series, vol)
|
@@ -657,7 +650,7 @@ class CBETA::P5aToHTMLForPDF
|
|
657
650
|
|
658
651
|
def handle_vols(v1, v2)
|
659
652
|
puts "convert volumns: #{v1}..#{v2}"
|
660
|
-
@series = v1
|
653
|
+
@series = CBETA.get_canon_from_vol(v1)
|
661
654
|
folder = File.join(@xml_root, @series)
|
662
655
|
Dir.foreach(folder) { |vol|
|
663
656
|
next if vol < v1
|
@@ -119,7 +119,6 @@ class CBETA::P5aToSimpleHTML
|
|
119
119
|
gid = e['ref'][1..-1]
|
120
120
|
g = @gaijis[gid]
|
121
121
|
abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
|
122
|
-
zzs = g['zzs']
|
123
122
|
|
124
123
|
if gid.start_with?('SD') # 悉曇字
|
125
124
|
case gid
|
@@ -257,10 +256,7 @@ class CBETA::P5aToSimpleHTML
|
|
257
256
|
FileUtils.makedirs @out_sutra
|
258
257
|
|
259
258
|
juans = text.split(/(<juan \d+>)/)
|
260
|
-
open = false
|
261
|
-
fo = nil
|
262
259
|
juan_no = nil
|
263
|
-
fn = ''
|
264
260
|
buf = ''
|
265
261
|
# 一卷一檔
|
266
262
|
juans.each { |j|
|
@@ -324,9 +320,9 @@ class CBETA::P5aToSimpleHTML
|
|
324
320
|
@orig_short = @orig.sub(/^【(.*)】$/, '\1')
|
325
321
|
|
326
322
|
@vol = vol
|
327
|
-
@series = vol
|
323
|
+
@series = CBETA.get_canon_from_vol(vol)
|
328
324
|
@out_vol = File.join(@output_root, @series, vol)
|
329
|
-
FileUtils.remove_dir(@out_vol,
|
325
|
+
FileUtils.remove_dir(@out_vol, true)
|
330
326
|
FileUtils.makedirs @out_vol
|
331
327
|
|
332
328
|
source = File.join(@xml_root, @series, vol)
|
@@ -337,7 +333,7 @@ class CBETA::P5aToSimpleHTML
|
|
337
333
|
|
338
334
|
def handle_vols(v1, v2)
|
339
335
|
puts "convert volumns: #{v1}..#{v2}"
|
340
|
-
@series = v1
|
336
|
+
@series = CBETA.get_canon_from_vol(v1)
|
341
337
|
folder = File.join(@xml_root, @series)
|
342
338
|
Dir.foreach(folder) { |vol|
|
343
339
|
next if vol < v1
|
data/lib/cbeta/p5a_to_text.rb
CHANGED
@@ -70,8 +70,8 @@ class CBETA::P5aToText
|
|
70
70
|
return convert_all if target.nil?
|
71
71
|
|
72
72
|
arg = target.upcase
|
73
|
-
if arg.size
|
74
|
-
|
73
|
+
if arg.size <= 2
|
74
|
+
handle_canon(arg)
|
75
75
|
else
|
76
76
|
if arg.include? '..'
|
77
77
|
arg.match(/^([^\.]+?)\.\.([^\.]+)$/) {
|
@@ -124,7 +124,7 @@ class CBETA::P5aToText
|
|
124
124
|
# Converts every canon folder found directly under @xml_root.
#
# A directory entry counts as a canon folder when its whole name is one of
# the IDs described by CBETA::CANON, so the two-letter canons (DA, GA, GB)
# are processed as well as the single-letter ones.
def convert_all
  canon_name = /\A(?:#{CBETA::CANON})\z/
  Dir.entries(@xml_root).sort.each do |c|
    next unless canon_name =~ c
    handle_canon(c)
  end
end
|
130
130
|
|
@@ -138,7 +138,7 @@ class CBETA::P5aToText
|
|
138
138
|
r
|
139
139
|
end
|
140
140
|
|
141
|
-
def
|
141
|
+
def e_anchor(e)
|
142
142
|
if e.has_attribute?('type')
|
143
143
|
if e['type'] == 'circle'
|
144
144
|
return '◎'
|
@@ -148,53 +148,43 @@ class CBETA::P5aToText
|
|
148
148
|
''
|
149
149
|
end
|
150
150
|
|
151
|
-
def
|
151
|
+
def e_app(e)
|
152
152
|
traverse(e)
|
153
153
|
end
|
154
154
|
|
155
|
-
def
|
155
|
+
def e_byline(e)
|
156
156
|
r = traverse(e)
|
157
157
|
r += @settings[:format]=='app' ? "\t" : "\n"
|
158
158
|
r
|
159
159
|
end
|
160
160
|
|
161
|
-
def
|
161
|
+
def e_cell(e)
|
162
162
|
r = traverse(e)
|
163
163
|
r += @settings[:format]=='app' ? "\t" : "\n"
|
164
164
|
r
|
165
165
|
end
|
166
166
|
|
167
|
-
def
|
168
|
-
@series = c
|
169
|
-
puts 'handle_collection ' + c
|
170
|
-
folder = File.join(@xml_root, @series)
|
171
|
-
Dir.entries(folder).sort.each do |vol|
|
172
|
-
next if vol.start_with? '.'
|
173
|
-
handle_vol(vol)
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
def handle_corr(e)
|
167
|
+
def e_corr(e)
|
178
168
|
"<r w='【CBETA】'>%s</r>" % traverse(e)
|
179
169
|
end
|
180
170
|
|
181
|
-
def
|
171
|
+
def e_div(e)
|
182
172
|
traverse(e)
|
183
173
|
end
|
184
174
|
|
185
|
-
def
|
175
|
+
def e_docNumber(e)
|
186
176
|
r = traverse(e)
|
187
177
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
188
178
|
r
|
189
179
|
end
|
190
180
|
|
191
|
-
def
|
181
|
+
def e_figure(e)
|
192
182
|
r = traverse(e)
|
193
183
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
194
184
|
r
|
195
185
|
end
|
196
186
|
|
197
|
-
def
|
187
|
+
def e_g(e)
|
198
188
|
# if 悉曇字、蘭札體
|
199
189
|
# 使用 Unicode PUA
|
200
190
|
# else if 有 <mapping type="unicode">
|
@@ -215,7 +205,6 @@ class CBETA::P5aToText
|
|
215
205
|
|
216
206
|
g = @gaijis[gid]
|
217
207
|
abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
|
218
|
-
zzs = g['zzs']
|
219
208
|
|
220
209
|
if gid.start_with?('SD') # 悉曇字
|
221
210
|
case gid
|
@@ -240,28 +229,28 @@ class CBETA::P5aToText
|
|
240
229
|
[0xf0000 + gid[2..-1].to_i].pack 'U'
|
241
230
|
end
|
242
231
|
|
243
|
-
def
|
232
|
+
def e_graphic(e)
|
244
233
|
''
|
245
234
|
end
|
246
235
|
|
247
|
-
def
|
236
|
+
def e_head(e)
|
248
237
|
r = traverse(e)
|
249
238
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
250
239
|
r
|
251
240
|
end
|
252
241
|
|
253
|
-
def
|
242
|
+
def e_item(e)
|
254
243
|
r = traverse(e)
|
255
244
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
256
245
|
end
|
257
246
|
|
258
|
-
def
|
247
|
+
def e_juan(e)
|
259
248
|
r = traverse(e)
|
260
249
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
261
250
|
r
|
262
251
|
end
|
263
252
|
|
264
|
-
def
|
253
|
+
def e_l(e)
|
265
254
|
r = traverse(e)
|
266
255
|
if @settings[:format] == 'app'
|
267
256
|
r += "\t"
|
@@ -271,7 +260,7 @@ class CBETA::P5aToText
|
|
271
260
|
r
|
272
261
|
end
|
273
262
|
|
274
|
-
def
|
263
|
+
def e_lb(e)
|
275
264
|
r = ''
|
276
265
|
if @settings[:format] == 'app'
|
277
266
|
r += "\n#{e['n']}║"
|
@@ -283,7 +272,7 @@ class CBETA::P5aToText
|
|
283
272
|
r
|
284
273
|
end
|
285
274
|
|
286
|
-
def
|
275
|
+
def e_lem(e)
|
287
276
|
# 沒有 rdg 的版本,用字同 lem
|
288
277
|
editions = Set.new @editions
|
289
278
|
e.xpath('./following-sibling::rdg').each do |rdg|
|
@@ -296,17 +285,17 @@ class CBETA::P5aToText
|
|
296
285
|
"<r w='#{w}'>%s</r>" % traverse(e)
|
297
286
|
end
|
298
287
|
|
299
|
-
def
|
288
|
+
def e_lg(e)
|
300
289
|
traverse(e)
|
301
290
|
end
|
302
291
|
|
303
|
-
def
|
292
|
+
def e_list(e)
|
304
293
|
r = ''
|
305
294
|
r += "\n" unless @settings[:format] == 'app'
|
306
295
|
r + traverse(e)
|
307
296
|
end
|
308
297
|
|
309
|
-
def
|
298
|
+
def e_milestone(e)
|
310
299
|
r = ''
|
311
300
|
if e['unit'] == 'juan'
|
312
301
|
@juan = e['n'].to_i
|
@@ -315,55 +304,11 @@ class CBETA::P5aToText
|
|
315
304
|
r
|
316
305
|
end
|
317
306
|
|
318
|
-
def
|
307
|
+
def e_mulu(e)
|
319
308
|
''
|
320
309
|
end
|
321
310
|
|
322
|
-
def
|
323
|
-
return '' if e.comment?
|
324
|
-
return handle_text(e) if e.text?
|
325
|
-
return '' if PASS.include?(e.name)
|
326
|
-
r = case e.name
|
327
|
-
when 'anchor' then handle_anchor(e)
|
328
|
-
when 'app' then handle_app(e)
|
329
|
-
when 'back' then ''
|
330
|
-
when 'byline' then handle_byline(e)
|
331
|
-
when 'cell' then handle_cell(e)
|
332
|
-
when 'corr' then handle_corr(e)
|
333
|
-
when 'div' then handle_div(e)
|
334
|
-
when 'docNumber' then handle_docNumber(e)
|
335
|
-
when 'figure' then handle_figure(e)
|
336
|
-
when 'foreign' then ''
|
337
|
-
when 'g' then handle_g(e)
|
338
|
-
when 'graphic' then handle_graphic(e)
|
339
|
-
when 'head' then handle_head(e)
|
340
|
-
when 'item' then handle_item(e)
|
341
|
-
when 'juan' then handle_juan(e)
|
342
|
-
when 'l' then handle_l(e)
|
343
|
-
when 'lb' then handle_lb(e)
|
344
|
-
when 'lem' then handle_lem(e)
|
345
|
-
when 'lg' then handle_lg(e)
|
346
|
-
when 'list' then handle_list(e)
|
347
|
-
when 'mulu' then handle_mulu(e)
|
348
|
-
when 'note' then handle_note(e)
|
349
|
-
when 'milestone' then handle_milestone(e)
|
350
|
-
when 'p' then handle_p(e)
|
351
|
-
when 'rdg' then handle_rdg(e)
|
352
|
-
when 'reg' then ''
|
353
|
-
when 'row' then handle_row(e)
|
354
|
-
when 'sic' then handle_sic(e)
|
355
|
-
when 'sg' then handle_sg(e)
|
356
|
-
when 'tt' then handle_tt(e)
|
357
|
-
when 't' then handle_t(e)
|
358
|
-
when 'table' then handle_table(e)
|
359
|
-
when 'teiHeader' then ''
|
360
|
-
when 'unclear' then '▆'
|
361
|
-
else traverse(e)
|
362
|
-
end
|
363
|
-
r
|
364
|
-
end
|
365
|
-
|
366
|
-
def handle_note(e)
|
311
|
+
def e_note(e)
|
367
312
|
if e.has_attribute?('place') && e['place']=='inline'
|
368
313
|
r = traverse(e)
|
369
314
|
return "(#{r})"
|
@@ -371,28 +316,108 @@ class CBETA::P5aToText
|
|
371
316
|
''
|
372
317
|
end
|
373
318
|
|
374
|
-
def
|
319
|
+
def e_p(e)
|
375
320
|
r = traverse(e)
|
376
321
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
377
322
|
r
|
378
323
|
end
|
379
324
|
|
380
|
-
def
|
325
|
+
def e_rdg(e)
|
381
326
|
"<r w='#{e['wit']}'>%s</r>" % traverse(e)
|
382
327
|
end
|
383
328
|
|
384
|
-
def
|
329
|
+
def e_row(e)
|
385
330
|
traverse(e)
|
386
331
|
end
|
387
332
|
|
388
|
-
def
|
333
|
+
def e_sg(e)
|
389
334
|
'(' + traverse(e) + ')'
|
390
335
|
end
|
391
336
|
|
392
|
-
def
|
337
|
+
def e_sic(e)
|
393
338
|
"<r w='#{@orig}'>" + traverse(e) + "</r>"
|
394
339
|
end
|
395
340
|
|
341
|
+
def e_t(e)
|
342
|
+
if e.has_attribute? 'place'
|
343
|
+
return '' if e['place'].include? 'foot'
|
344
|
+
end
|
345
|
+
r = traverse(e)
|
346
|
+
|
347
|
+
# 不是雙行對照
|
348
|
+
return r if @tt_type == 'app'
|
349
|
+
|
350
|
+
# 處理雙行對照
|
351
|
+
i = e.xpath('../t').index(e)
|
352
|
+
case i
|
353
|
+
when 0
|
354
|
+
return r + ' '
|
355
|
+
when 1
|
356
|
+
@next_line_buf += r + ' '
|
357
|
+
return ''
|
358
|
+
else
|
359
|
+
return r
|
360
|
+
end
|
361
|
+
end
|
362
|
+
|
363
|
+
def e_table(e)
|
364
|
+
traverse(e)
|
365
|
+
end
|
366
|
+
|
367
|
+
# Converts all volumes of one canon.
#
# Stores the canon ID in @canon, prints a progress line, then passes every
# entry of the canon folder under @xml_root (sorted, entries starting with
# "." skipped) to handle_vol.
#
# @param c [String] canon ID; also the folder name under @xml_root
def handle_canon(c)
  @canon = c
  puts 'handle_canon ' + c
  canon_dir = File.join(@xml_root, @canon)
  volumes = Dir.entries(canon_dir).sort.reject { |name| name.start_with? '.' }
  volumes.each { |vol| handle_vol(vol) }
end
|
376
|
+
|
377
|
+
# Converts one XML node to text by dispatching on the node type and name.
#
# Comments yield '', text nodes go to handle_text, names listed in PASS yield
# ''. Element names with a dedicated e_<name> method are sent there; any other
# element is handled by traversing its children.
#
# @param e the XML node to convert
# @return [String] the text produced for the node
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  tag = e.name
  return '' if PASS.include?(tag)

  case tag
  when 'back', 'foreign', 'reg', 'teiHeader'
    # content of these elements is dropped
    ''
  when 'unclear'
    '▆'
  when *%w[anchor app byline cell corr div docNumber figure g graphic head
           item juan l lb lem lg list mulu note milestone p rdg row sic sg
           tt t table]
    # each of these names has a matching e_<name> handler
    send("e_#{tag}", e)
  else
    traverse(e)
  end
end
|
420
|
+
|
396
421
|
def handle_sutra(xml_fn)
|
397
422
|
puts "convert sutra #{xml_fn}"
|
398
423
|
@dila_note = 0
|
@@ -418,10 +443,7 @@ class CBETA::P5aToText
|
|
418
443
|
FileUtils.makedirs @out_sutra
|
419
444
|
|
420
445
|
juans = text.split(/(<juan \d+>)/)
|
421
|
-
open = false
|
422
|
-
fo = nil
|
423
446
|
juan_no = nil
|
424
|
-
fn = ''
|
425
447
|
buf = ''
|
426
448
|
# 一卷一檔
|
427
449
|
juans.each { |j|
|
@@ -438,32 +460,6 @@ class CBETA::P5aToText
|
|
438
460
|
}
|
439
461
|
end
|
440
462
|
|
441
|
-
def handle_t(e)
|
442
|
-
if e.has_attribute? 'place'
|
443
|
-
return '' if e['place'].include? 'foot'
|
444
|
-
end
|
445
|
-
r = traverse(e)
|
446
|
-
|
447
|
-
# 不是雙行對照
|
448
|
-
return r if @tt_type == 'app'
|
449
|
-
|
450
|
-
# 處理雙行對照
|
451
|
-
i = e.xpath('../t').index(e)
|
452
|
-
case i
|
453
|
-
when 0
|
454
|
-
return r + ' '
|
455
|
-
when 1
|
456
|
-
@next_line_buf += r + ' '
|
457
|
-
return ''
|
458
|
-
else
|
459
|
-
return r
|
460
|
-
end
|
461
|
-
end
|
462
|
-
|
463
|
-
def handle_table(e)
|
464
|
-
traverse(e)
|
465
|
-
end
|
466
|
-
|
467
463
|
def handle_text(e)
|
468
464
|
s = e.content().chomp
|
469
465
|
return '' if s.empty?
|
@@ -476,7 +472,7 @@ class CBETA::P5aToText
|
|
476
472
|
CGI.escapeHTML(r)
|
477
473
|
end
|
478
474
|
|
479
|
-
def
|
475
|
+
def e_tt(e)
|
480
476
|
@tt_type = e['type']
|
481
477
|
traverse(e)
|
482
478
|
end
|
@@ -488,12 +484,12 @@ class CBETA::P5aToText
|
|
488
484
|
abort "未處理底本" if @orig.nil?
|
489
485
|
|
490
486
|
@vol = vol
|
491
|
-
@
|
492
|
-
@out_vol = File.join(@output_root, @
|
493
|
-
FileUtils.remove_dir(@out_vol,
|
487
|
+
@canon = CBETA.get_canon_from_vol(vol)
|
488
|
+
@out_vol = File.join(@output_root, @canon, vol)
|
489
|
+
FileUtils.remove_dir(@out_vol, true)
|
494
490
|
FileUtils.makedirs @out_vol
|
495
491
|
|
496
|
-
source = File.join(@xml_root, @
|
492
|
+
source = File.join(@xml_root, @canon, vol)
|
497
493
|
Dir.entries(source).sort.each { |f|
|
498
494
|
next if f.start_with? '.'
|
499
495
|
fn = File.join(source, f)
|
@@ -503,8 +499,8 @@ class CBETA::P5aToText
|
|
503
499
|
|
504
500
|
def handle_vols(v1, v2)
|
505
501
|
puts "convert volumns: #{v1}..#{v2}"
|
506
|
-
@
|
507
|
-
folder = File.join(@xml_root, @
|
502
|
+
# get_canon_from_vol is a class method of CBETA, so it needs the explicit receiver
@canon = CBETA.get_canon_from_vol(v1)
|
503
|
+
folder = File.join(@xml_root, @canon)
|
508
504
|
Dir.entries(folder).sort.each do |vol|
|
509
505
|
next if vol < v1
|
510
506
|
next if vol > v2
|
data/lib/cbeta/p5a_validator.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.
|
4
|
+
version: 2.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|