cbeta 2.2.6 → 2.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +38 -5
- data/lib/cbeta/gaiji.rb +0 -1
- data/lib/cbeta/html_to_text.rb +1 -1
- data/lib/cbeta/p5a_to_html.rb +8 -16
- data/lib/cbeta/p5a_to_html_for_every_edition.rb +10 -21
- data/lib/cbeta/p5a_to_html_for_pdf.rb +3 -10
- data/lib/cbeta/p5a_to_simple_html.rb +3 -7
- data/lib/cbeta/p5a_to_text.rb +116 -120
- data/lib/cbeta/p5a_validator.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a3f8edd88730de817c9ecec347b516f552cd148a
|
4
|
+
data.tar.gz: 3f9b2eb3e8010094ce3ce47b84e261319d8ccac5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7e05cadab483733f7a5c531966fca41fcd453e1da9ae44f64433bf04e72d3ae9b129ea7b4dc0f94c3e7f304d9d5755ec192a572cd463a6528d3088ed4bcab5f0
|
7
|
+
data.tar.gz: 15f31074085847f332c3633f7de85706167c059709ae48a889081b9ebbe26f9e32cf626a07c8492cb29c6aadd2089b11e2dae16a66406d8c44038961047a55eb
|
data/lib/cbeta.rb
CHANGED
@@ -6,8 +6,41 @@
|
|
6
6
|
require 'csv'
|
7
7
|
|
8
8
|
class CBETA
|
9
|
+
CANON = 'DA|GA|GB|[A-Z]'
|
9
10
|
DATA = File.join(File.dirname(__FILE__), 'data')
|
10
11
|
PUNCS = '.[]。,、?「」『』《》<>〈〉〔〕[]【】〖〗'
|
12
|
+
|
13
|
+
# 由 行首資訊 取得 藏經 ID
|
14
|
+
# @param linehead[String] 行首資訊, 例如 "T01n0001_p0001a01" 或 "GA009n0008_p0003a01"
|
15
|
+
# @return [String] 藏經 ID,例如 "T" 或 "GA"
|
16
|
+
def self.get_canon_id_from_linehead(linehead)
|
17
|
+
linehead.sub(/^(#{CANON}).*$/, '\1')
|
18
|
+
end
|
19
|
+
|
20
|
+
# 由 冊號 取得 藏經 ID
|
21
|
+
# @param vol[String] 冊號, 例如 "T01" 或 "GA009"
|
22
|
+
# @return [String] 藏經 ID,例如 "T" 或 "GA"
|
23
|
+
def self.get_canon_from_vol(vol)
|
24
|
+
vol.sub(/^(#{CANON}).*$/, '\1')
|
25
|
+
end
|
26
|
+
|
27
|
+
# 由 行首資訊 取得 XML檔相對路徑
|
28
|
+
# @param linehead[String] 行首資訊, 例如 "GA009n0008_p0003a01"
|
29
|
+
# @return [String] XML檔相對路徑,例如 "GA/GA009/GA009n0008.xml"
|
30
|
+
def self.linehead_to_xml_file_path(linehead)
|
31
|
+
if m = linehead.match(/^(?<work>(?<vol>(?<canon>#{CANON})\d+)n\d+[a-zA-Z]?).*$/)
|
32
|
+
File.join(m[:canon], m[:vol], m[:work]+'.xml')
|
33
|
+
else
|
34
|
+
nil
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# 由 XML檔主檔名 取得 典籍編號
|
39
|
+
# @param fn[String] 檔名, 例如 "T01n0001" 或 "GA009n0008"
|
40
|
+
# @return [String] 典籍編號,例如 "T0001" 或 "GA0008"
|
41
|
+
def self.get_work_id_from_file_basename(fn)
|
42
|
+
fn.sub(/^(#{CANON})\d{2,3}n(.*)$/, '\1\2')
|
43
|
+
end
|
11
44
|
|
12
45
|
# 將行首資訊轉為引用格式
|
13
46
|
#
|
@@ -18,7 +51,7 @@ class CBETA
|
|
18
51
|
# CBETA.linehead_to_s('T85n2838_p1291a03')
|
19
52
|
# # return "T85, no. 2838, p. 1291, a03"
|
20
53
|
def self.linehead_to_s(linehead)
|
21
|
-
linehead.match(/^(
|
54
|
+
linehead.match(/^((?:#{CANON})\d+)n(.*)_p(\d+)([a-z]\d+)$/) {
|
22
55
|
return "#{$1}, no. #{$2}, p. #{$3}, #{$4}"
|
23
56
|
}
|
24
57
|
nil
|
@@ -68,12 +101,12 @@ class CBETA
|
|
68
101
|
s = File.read(fn)
|
69
102
|
@categories = JSON.parse(s)
|
70
103
|
end
|
71
|
-
|
104
|
+
|
72
105
|
# @param id [String] 藏經 ID, 例如大正藏的 ID 是 "T"
|
73
106
|
# @return [String] 藏經短名,例如 "大正藏"
|
74
|
-
|
75
|
-
|
76
|
-
|
107
|
+
def get_canon_nickname(id)
|
108
|
+
return nil unless @canon_nickname.key? id
|
109
|
+
@canon_nickname[id]
|
77
110
|
end
|
78
111
|
|
79
112
|
# 取得藏經略符
|
data/lib/cbeta/gaiji.rb
CHANGED
data/lib/cbeta/html_to_text.rb
CHANGED
data/lib/cbeta/p5a_to_html.rb
CHANGED
@@ -540,10 +540,7 @@ class CBETA::P5aToHTML
|
|
540
540
|
text.gsub!(/(<a class='noteAnchor'[^>]*><\/a>)(<div class="lg-cell"[^>]*>)/, '\2\1')
|
541
541
|
|
542
542
|
juans = text.split(/(<juan \d+>)/)
|
543
|
-
open = false
|
544
|
-
fo = nil
|
545
543
|
juan_no = nil
|
546
|
-
fn = ''
|
547
544
|
buf = ''
|
548
545
|
# 一卷一檔
|
549
546
|
juans.each { |j|
|
@@ -618,9 +615,9 @@ class CBETA::P5aToHTML
|
|
618
615
|
abort "未處理底本" if @orig.nil?
|
619
616
|
|
620
617
|
@vol = vol
|
621
|
-
@series = vol
|
618
|
+
@series = CBETA.get_canon_from_vol(vol)
|
622
619
|
@out_folder = File.join(@out_root, @series, vol)
|
623
|
-
FileUtils.remove_dir(@out_folder,
|
620
|
+
FileUtils.remove_dir(@out_folder, true)
|
624
621
|
FileUtils::mkdir_p @out_folder
|
625
622
|
|
626
623
|
source = File.join(@xml_root, @series, vol)
|
@@ -631,7 +628,7 @@ class CBETA::P5aToHTML
|
|
631
628
|
|
632
629
|
def handle_vols(v1, v2)
|
633
630
|
puts "convert volumns: #{v1}..#{v2}"
|
634
|
-
@series = v1
|
631
|
+
@series = CBETA.get_canon_from_vol(v1)
|
635
632
|
folder = File.join(@xml_root, @series)
|
636
633
|
Dir.foreach(folder) { |vol|
|
637
634
|
next if vol < v1
|
@@ -677,16 +674,11 @@ class CBETA::P5aToHTML
|
|
677
674
|
end
|
678
675
|
|
679
676
|
def linehead_exist_in_cbeta(s)
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
path = File.join(@xml_root, corpus, vol, sutra+'.xml')
|
686
|
-
return File.exist? path
|
687
|
-
else
|
688
|
-
return false
|
689
|
-
end
|
677
|
+
fn = CBETA.linehead_to_xml_file_path(s)
|
678
|
+
return false if fn.nil?
|
679
|
+
|
680
|
+
path = File.join(@xml_root, fn)
|
681
|
+
File.exist? path
|
690
682
|
end
|
691
683
|
|
692
684
|
def open_xml(fn)
|
@@ -106,10 +106,7 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
106
106
|
text.gsub!(/(<a class='noteAnchor'[^>]*><\/a>)(<div class="lg-cell"[^>]*>)/, '\2\1')
|
107
107
|
|
108
108
|
juans = text.split(/(<juan \d+>)/)
|
109
|
-
open = false
|
110
|
-
fo = nil
|
111
109
|
juan_no = nil
|
112
|
-
fn = ''
|
113
110
|
buf = ''
|
114
111
|
# 一卷一檔
|
115
112
|
juans.each { |j|
|
@@ -122,8 +119,7 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
122
119
|
buf = ''
|
123
120
|
end
|
124
121
|
}
|
125
|
-
end
|
126
|
-
|
122
|
+
end
|
127
123
|
|
128
124
|
def convert_vol(vol)
|
129
125
|
puts "convert volumn: #{vol}"
|
@@ -414,12 +410,12 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
414
410
|
def handle_lem(e)
|
415
411
|
r = ''
|
416
412
|
content = traverse(e)
|
417
|
-
|
418
|
-
if
|
413
|
+
wit = e['wit']
|
414
|
+
if wit.include? 'CBETA' and not wit.include? @orig
|
419
415
|
n = @notes_dila[@juan].size + 1
|
420
416
|
r = "<a class='noteAnchor dila' href='#dila_note#{n}'></a>"
|
421
417
|
r += "<span class='cbeta'>%s</span>" % content
|
422
|
-
r = "<r w='#{
|
418
|
+
r = "<r w='#{wit}' l='#{@lb}'>#{r}</r>"
|
423
419
|
|
424
420
|
note = lem_note_cf(e)
|
425
421
|
note += lem_note_rdg(e)
|
@@ -625,7 +621,6 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
625
621
|
|
626
622
|
def handle_rdg(e)
|
627
623
|
r = traverse(e)
|
628
|
-
w = e['wit'].scan(/【.*?】/)
|
629
624
|
"<r w='#{e['wit']}' l='#{@lb}' w='#{@char_count}'>#{r}</r>"
|
630
625
|
end
|
631
626
|
|
@@ -788,16 +783,11 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
788
783
|
end
|
789
784
|
|
790
785
|
def linehead_exist_in_cbeta(s)
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
path = File.join(@xml_root, corpus, vol, sutra+'.xml')
|
797
|
-
return File.exist? path
|
798
|
-
else
|
799
|
-
return false
|
800
|
-
end
|
786
|
+
fn = CBETA.linehead_to_xml_file_path(s)
|
787
|
+
return false if fn.nil?
|
788
|
+
|
789
|
+
path = File.join(@xml_root, fn)
|
790
|
+
File.exist? path
|
801
791
|
end
|
802
792
|
|
803
793
|
def open_xml(fn)
|
@@ -872,10 +862,9 @@ class CBETA::P5aToHTMLForEveryEdition
|
|
872
862
|
else
|
873
863
|
work = @sutra_no.sub(/^([A-Z]{1,2})\d{2,3}n(.*)$/, '\1\2')
|
874
864
|
end
|
875
|
-
canon = work[0]
|
876
865
|
juan = "%03d" % juan_no
|
877
866
|
folder = File.join(@out_folder, work, juan)
|
878
|
-
FileUtils.remove_dir(folder,
|
867
|
+
FileUtils.remove_dir(folder, true)
|
879
868
|
FileUtils.makedirs folder
|
880
869
|
|
881
870
|
@editions.each do |ed|
|
@@ -198,8 +198,6 @@ class CBETA::P5aToHTMLForPDF
|
|
198
198
|
end
|
199
199
|
|
200
200
|
def handle_anchor(e)
|
201
|
-
id = e['id']
|
202
|
-
|
203
201
|
if e.has_attribute?('type')
|
204
202
|
if e['type'] == 'circle'
|
205
203
|
return '◎'
|
@@ -496,7 +494,6 @@ class CBETA::P5aToHTMLForPDF
|
|
496
494
|
end
|
497
495
|
|
498
496
|
def handle_note(e)
|
499
|
-
n = e['n']
|
500
497
|
if e.has_attribute?('type')
|
501
498
|
t = e['type']
|
502
499
|
if %w(equivalent orig orig_biao orig_ke mod rest).include? t
|
@@ -638,13 +635,9 @@ class CBETA::P5aToHTMLForPDF
|
|
638
635
|
abort "未處理底本" if @orig.nil?
|
639
636
|
|
640
637
|
@vol = vol
|
641
|
-
|
642
|
-
@series = 'DA'
|
643
|
-
else
|
644
|
-
@series = vol[0]
|
645
|
-
end
|
638
|
+
@series = CBETA.get_canon_from_vol(vol)
|
646
639
|
@out_folder = File.join(@out_root, @series, vol)
|
647
|
-
FileUtils.remove_dir(@out_folder,
|
640
|
+
FileUtils.remove_dir(@out_folder, true)
|
648
641
|
FileUtils::mkdir_p @out_folder
|
649
642
|
|
650
643
|
source = File.join(@xml_root, @series, vol)
|
@@ -657,7 +650,7 @@ class CBETA::P5aToHTMLForPDF
|
|
657
650
|
|
658
651
|
def handle_vols(v1, v2)
|
659
652
|
puts "convert volumns: #{v1}..#{v2}"
|
660
|
-
@series = v1
|
653
|
+
@series = CBETA.get_canon_from_vol(v1)
|
661
654
|
folder = File.join(@xml_root, @series)
|
662
655
|
Dir.foreach(folder) { |vol|
|
663
656
|
next if vol < v1
|
@@ -119,7 +119,6 @@ class CBETA::P5aToSimpleHTML
|
|
119
119
|
gid = e['ref'][1..-1]
|
120
120
|
g = @gaijis[gid]
|
121
121
|
abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
|
122
|
-
zzs = g['zzs']
|
123
122
|
|
124
123
|
if gid.start_with?('SD') # 悉曇字
|
125
124
|
case gid
|
@@ -257,10 +256,7 @@ class CBETA::P5aToSimpleHTML
|
|
257
256
|
FileUtils.makedirs @out_sutra
|
258
257
|
|
259
258
|
juans = text.split(/(<juan \d+>)/)
|
260
|
-
open = false
|
261
|
-
fo = nil
|
262
259
|
juan_no = nil
|
263
|
-
fn = ''
|
264
260
|
buf = ''
|
265
261
|
# 一卷一檔
|
266
262
|
juans.each { |j|
|
@@ -324,9 +320,9 @@ class CBETA::P5aToSimpleHTML
|
|
324
320
|
@orig_short = @orig.sub(/^【(.*)】$/, '\1')
|
325
321
|
|
326
322
|
@vol = vol
|
327
|
-
@series = vol
|
323
|
+
@series = CBETA.get_canon_from_vol(vol)
|
328
324
|
@out_vol = File.join(@output_root, @series, vol)
|
329
|
-
FileUtils.remove_dir(@out_vol,
|
325
|
+
FileUtils.remove_dir(@out_vol, true)
|
330
326
|
FileUtils.makedirs @out_vol
|
331
327
|
|
332
328
|
source = File.join(@xml_root, @series, vol)
|
@@ -337,7 +333,7 @@ class CBETA::P5aToSimpleHTML
|
|
337
333
|
|
338
334
|
def handle_vols(v1, v2)
|
339
335
|
puts "convert volumns: #{v1}..#{v2}"
|
340
|
-
@series = v1
|
336
|
+
@series = CBETA.get_canon_from_vol(v1)
|
341
337
|
folder = File.join(@xml_root, @series)
|
342
338
|
Dir.foreach(folder) { |vol|
|
343
339
|
next if vol < v1
|
data/lib/cbeta/p5a_to_text.rb
CHANGED
@@ -70,8 +70,8 @@ class CBETA::P5aToText
|
|
70
70
|
return convert_all if target.nil?
|
71
71
|
|
72
72
|
arg = target.upcase
|
73
|
-
if arg.size
|
74
|
-
|
73
|
+
if arg.size <= 2
|
74
|
+
handle_canon(arg)
|
75
75
|
else
|
76
76
|
if arg.include? '..'
|
77
77
|
arg.match(/^([^\.]+?)\.\.([^\.]+)$/) {
|
@@ -124,7 +124,7 @@ class CBETA::P5aToText
|
|
124
124
|
def convert_all
|
125
125
|
Dir.entries(@xml_root).sort.each do |c|
|
126
126
|
next unless c.match(/^[A-Z]$/)
|
127
|
-
|
127
|
+
handle_canon(c)
|
128
128
|
end
|
129
129
|
end
|
130
130
|
|
@@ -138,7 +138,7 @@ class CBETA::P5aToText
|
|
138
138
|
r
|
139
139
|
end
|
140
140
|
|
141
|
-
def
|
141
|
+
def e_anchor(e)
|
142
142
|
if e.has_attribute?('type')
|
143
143
|
if e['type'] == 'circle'
|
144
144
|
return '◎'
|
@@ -148,53 +148,43 @@ class CBETA::P5aToText
|
|
148
148
|
''
|
149
149
|
end
|
150
150
|
|
151
|
-
def
|
151
|
+
def e_app(e)
|
152
152
|
traverse(e)
|
153
153
|
end
|
154
154
|
|
155
|
-
def
|
155
|
+
def e_byline(e)
|
156
156
|
r = traverse(e)
|
157
157
|
r += @settings[:format]=='app' ? "\t" : "\n"
|
158
158
|
r
|
159
159
|
end
|
160
160
|
|
161
|
-
def
|
161
|
+
def e_cell(e)
|
162
162
|
r = traverse(e)
|
163
163
|
r += @settings[:format]=='app' ? "\t" : "\n"
|
164
164
|
r
|
165
165
|
end
|
166
166
|
|
167
|
-
def
|
168
|
-
@series = c
|
169
|
-
puts 'handle_collection ' + c
|
170
|
-
folder = File.join(@xml_root, @series)
|
171
|
-
Dir.entries(folder).sort.each do |vol|
|
172
|
-
next if vol.start_with? '.'
|
173
|
-
handle_vol(vol)
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
def handle_corr(e)
|
167
|
+
def e_corr(e)
|
178
168
|
"<r w='【CBETA】'>%s</r>" % traverse(e)
|
179
169
|
end
|
180
170
|
|
181
|
-
def
|
171
|
+
def e_div(e)
|
182
172
|
traverse(e)
|
183
173
|
end
|
184
174
|
|
185
|
-
def
|
175
|
+
def e_docNumber(e)
|
186
176
|
r = traverse(e)
|
187
177
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
188
178
|
r
|
189
179
|
end
|
190
180
|
|
191
|
-
def
|
181
|
+
def e_figure(e)
|
192
182
|
r = traverse(e)
|
193
183
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
194
184
|
r
|
195
185
|
end
|
196
186
|
|
197
|
-
def
|
187
|
+
def e_g(e)
|
198
188
|
# if 悉曇字、蘭札體
|
199
189
|
# 使用 Unicode PUA
|
200
190
|
# else if 有 <mapping type="unicode">
|
@@ -215,7 +205,6 @@ class CBETA::P5aToText
|
|
215
205
|
|
216
206
|
g = @gaijis[gid]
|
217
207
|
abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
|
218
|
-
zzs = g['zzs']
|
219
208
|
|
220
209
|
if gid.start_with?('SD') # 悉曇字
|
221
210
|
case gid
|
@@ -240,28 +229,28 @@ class CBETA::P5aToText
|
|
240
229
|
[0xf0000 + gid[2..-1].to_i].pack 'U'
|
241
230
|
end
|
242
231
|
|
243
|
-
def
|
232
|
+
def e_graphic(e)
|
244
233
|
''
|
245
234
|
end
|
246
235
|
|
247
|
-
def
|
236
|
+
def e_head(e)
|
248
237
|
r = traverse(e)
|
249
238
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
250
239
|
r
|
251
240
|
end
|
252
241
|
|
253
|
-
def
|
242
|
+
def e_item(e)
|
254
243
|
r = traverse(e)
|
255
244
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
256
245
|
end
|
257
246
|
|
258
|
-
def
|
247
|
+
def e_juan(e)
|
259
248
|
r = traverse(e)
|
260
249
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
261
250
|
r
|
262
251
|
end
|
263
252
|
|
264
|
-
def
|
253
|
+
def e_l(e)
|
265
254
|
r = traverse(e)
|
266
255
|
if @settings[:format] == 'app'
|
267
256
|
r += "\t"
|
@@ -271,7 +260,7 @@ class CBETA::P5aToText
|
|
271
260
|
r
|
272
261
|
end
|
273
262
|
|
274
|
-
def
|
263
|
+
def e_lb(e)
|
275
264
|
r = ''
|
276
265
|
if @settings[:format] == 'app'
|
277
266
|
r += "\n#{e['n']}║"
|
@@ -283,7 +272,7 @@ class CBETA::P5aToText
|
|
283
272
|
r
|
284
273
|
end
|
285
274
|
|
286
|
-
def
|
275
|
+
def e_lem(e)
|
287
276
|
# 沒有 rdg 的版本,用字同 lem
|
288
277
|
editions = Set.new @editions
|
289
278
|
e.xpath('./following-sibling::rdg').each do |rdg|
|
@@ -296,17 +285,17 @@ class CBETA::P5aToText
|
|
296
285
|
"<r w='#{w}'>%s</r>" % traverse(e)
|
297
286
|
end
|
298
287
|
|
299
|
-
def
|
288
|
+
def e_lg(e)
|
300
289
|
traverse(e)
|
301
290
|
end
|
302
291
|
|
303
|
-
def
|
292
|
+
def e_list(e)
|
304
293
|
r = ''
|
305
294
|
r += "\n" unless @settings[:format] == 'app'
|
306
295
|
r + traverse(e)
|
307
296
|
end
|
308
297
|
|
309
|
-
def
|
298
|
+
def e_milestone(e)
|
310
299
|
r = ''
|
311
300
|
if e['unit'] == 'juan'
|
312
301
|
@juan = e['n'].to_i
|
@@ -315,55 +304,11 @@ class CBETA::P5aToText
|
|
315
304
|
r
|
316
305
|
end
|
317
306
|
|
318
|
-
def
|
307
|
+
def e_mulu(e)
|
319
308
|
''
|
320
309
|
end
|
321
310
|
|
322
|
-
def
|
323
|
-
return '' if e.comment?
|
324
|
-
return handle_text(e) if e.text?
|
325
|
-
return '' if PASS.include?(e.name)
|
326
|
-
r = case e.name
|
327
|
-
when 'anchor' then handle_anchor(e)
|
328
|
-
when 'app' then handle_app(e)
|
329
|
-
when 'back' then ''
|
330
|
-
when 'byline' then handle_byline(e)
|
331
|
-
when 'cell' then handle_cell(e)
|
332
|
-
when 'corr' then handle_corr(e)
|
333
|
-
when 'div' then handle_div(e)
|
334
|
-
when 'docNumber' then handle_docNumber(e)
|
335
|
-
when 'figure' then handle_figure(e)
|
336
|
-
when 'foreign' then ''
|
337
|
-
when 'g' then handle_g(e)
|
338
|
-
when 'graphic' then handle_graphic(e)
|
339
|
-
when 'head' then handle_head(e)
|
340
|
-
when 'item' then handle_item(e)
|
341
|
-
when 'juan' then handle_juan(e)
|
342
|
-
when 'l' then handle_l(e)
|
343
|
-
when 'lb' then handle_lb(e)
|
344
|
-
when 'lem' then handle_lem(e)
|
345
|
-
when 'lg' then handle_lg(e)
|
346
|
-
when 'list' then handle_list(e)
|
347
|
-
when 'mulu' then handle_mulu(e)
|
348
|
-
when 'note' then handle_note(e)
|
349
|
-
when 'milestone' then handle_milestone(e)
|
350
|
-
when 'p' then handle_p(e)
|
351
|
-
when 'rdg' then handle_rdg(e)
|
352
|
-
when 'reg' then ''
|
353
|
-
when 'row' then handle_row(e)
|
354
|
-
when 'sic' then handle_sic(e)
|
355
|
-
when 'sg' then handle_sg(e)
|
356
|
-
when 'tt' then handle_tt(e)
|
357
|
-
when 't' then handle_t(e)
|
358
|
-
when 'table' then handle_table(e)
|
359
|
-
when 'teiHeader' then ''
|
360
|
-
when 'unclear' then '▆'
|
361
|
-
else traverse(e)
|
362
|
-
end
|
363
|
-
r
|
364
|
-
end
|
365
|
-
|
366
|
-
def handle_note(e)
|
311
|
+
def e_note(e)
|
367
312
|
if e.has_attribute?('place') && e['place']=='inline'
|
368
313
|
r = traverse(e)
|
369
314
|
return "(#{r})"
|
@@ -371,28 +316,108 @@ class CBETA::P5aToText
|
|
371
316
|
''
|
372
317
|
end
|
373
318
|
|
374
|
-
def
|
319
|
+
def e_p(e)
|
375
320
|
r = traverse(e)
|
376
321
|
r += @settings[:format] == 'app' ? "\t" : "\n"
|
377
322
|
r
|
378
323
|
end
|
379
324
|
|
380
|
-
def
|
325
|
+
def e_rdg(e)
|
381
326
|
"<r w='#{e['wit']}'>%s</r>" % traverse(e)
|
382
327
|
end
|
383
328
|
|
384
|
-
def
|
329
|
+
def e_row(e)
|
385
330
|
traverse(e)
|
386
331
|
end
|
387
332
|
|
388
|
-
def
|
333
|
+
def e_sg(e)
|
389
334
|
'(' + traverse(e) + ')'
|
390
335
|
end
|
391
336
|
|
392
|
-
def
|
337
|
+
def e_sic(e)
|
393
338
|
"<r w='#{@orig}'>" + traverse(e) + "</r>"
|
394
339
|
end
|
395
340
|
|
341
|
+
def e_t(e)
|
342
|
+
if e.has_attribute? 'place'
|
343
|
+
return '' if e['place'].include? 'foot'
|
344
|
+
end
|
345
|
+
r = traverse(e)
|
346
|
+
|
347
|
+
# 不是雙行對照
|
348
|
+
return r if @tt_type == 'app'
|
349
|
+
|
350
|
+
# 處理雙行對照
|
351
|
+
i = e.xpath('../t').index(e)
|
352
|
+
case i
|
353
|
+
when 0
|
354
|
+
return r + ' '
|
355
|
+
when 1
|
356
|
+
@next_line_buf += r + ' '
|
357
|
+
return ''
|
358
|
+
else
|
359
|
+
return r
|
360
|
+
end
|
361
|
+
end
|
362
|
+
|
363
|
+
def e_table(e)
|
364
|
+
traverse(e)
|
365
|
+
end
|
366
|
+
|
367
|
+
def handle_canon(c)
|
368
|
+
@canon = c
|
369
|
+
puts 'handle_canon ' + c
|
370
|
+
folder = File.join(@xml_root, @canon)
|
371
|
+
Dir.entries(folder).sort.each do |vol|
|
372
|
+
next if vol.start_with? '.'
|
373
|
+
handle_vol(vol)
|
374
|
+
end
|
375
|
+
end
|
376
|
+
|
377
|
+
def handle_node(e)
|
378
|
+
return '' if e.comment?
|
379
|
+
return handle_text(e) if e.text?
|
380
|
+
return '' if PASS.include?(e.name)
|
381
|
+
r = case e.name
|
382
|
+
when 'anchor' then e_anchor(e)
|
383
|
+
when 'app' then e_app(e)
|
384
|
+
when 'back' then ''
|
385
|
+
when 'byline' then e_byline(e)
|
386
|
+
when 'cell' then e_cell(e)
|
387
|
+
when 'corr' then e_corr(e)
|
388
|
+
when 'div' then e_div(e)
|
389
|
+
when 'docNumber' then e_docNumber(e)
|
390
|
+
when 'figure' then e_figure(e)
|
391
|
+
when 'foreign' then ''
|
392
|
+
when 'g' then e_g(e)
|
393
|
+
when 'graphic' then e_graphic(e)
|
394
|
+
when 'head' then e_head(e)
|
395
|
+
when 'item' then e_item(e)
|
396
|
+
when 'juan' then e_juan(e)
|
397
|
+
when 'l' then e_l(e)
|
398
|
+
when 'lb' then e_lb(e)
|
399
|
+
when 'lem' then e_lem(e)
|
400
|
+
when 'lg' then e_lg(e)
|
401
|
+
when 'list' then e_list(e)
|
402
|
+
when 'mulu' then e_mulu(e)
|
403
|
+
when 'note' then e_note(e)
|
404
|
+
when 'milestone' then e_milestone(e)
|
405
|
+
when 'p' then e_p(e)
|
406
|
+
when 'rdg' then e_rdg(e)
|
407
|
+
when 'reg' then ''
|
408
|
+
when 'row' then e_row(e)
|
409
|
+
when 'sic' then e_sic(e)
|
410
|
+
when 'sg' then e_sg(e)
|
411
|
+
when 'tt' then e_tt(e)
|
412
|
+
when 't' then e_t(e)
|
413
|
+
when 'table' then e_table(e)
|
414
|
+
when 'teiHeader' then ''
|
415
|
+
when 'unclear' then '▆'
|
416
|
+
else traverse(e)
|
417
|
+
end
|
418
|
+
r
|
419
|
+
end
|
420
|
+
|
396
421
|
def handle_sutra(xml_fn)
|
397
422
|
puts "convert sutra #{xml_fn}"
|
398
423
|
@dila_note = 0
|
@@ -418,10 +443,7 @@ class CBETA::P5aToText
|
|
418
443
|
FileUtils.makedirs @out_sutra
|
419
444
|
|
420
445
|
juans = text.split(/(<juan \d+>)/)
|
421
|
-
open = false
|
422
|
-
fo = nil
|
423
446
|
juan_no = nil
|
424
|
-
fn = ''
|
425
447
|
buf = ''
|
426
448
|
# 一卷一檔
|
427
449
|
juans.each { |j|
|
@@ -438,32 +460,6 @@ class CBETA::P5aToText
|
|
438
460
|
}
|
439
461
|
end
|
440
462
|
|
441
|
-
def handle_t(e)
|
442
|
-
if e.has_attribute? 'place'
|
443
|
-
return '' if e['place'].include? 'foot'
|
444
|
-
end
|
445
|
-
r = traverse(e)
|
446
|
-
|
447
|
-
# 不是雙行對照
|
448
|
-
return r if @tt_type == 'app'
|
449
|
-
|
450
|
-
# 處理雙行對照
|
451
|
-
i = e.xpath('../t').index(e)
|
452
|
-
case i
|
453
|
-
when 0
|
454
|
-
return r + ' '
|
455
|
-
when 1
|
456
|
-
@next_line_buf += r + ' '
|
457
|
-
return ''
|
458
|
-
else
|
459
|
-
return r
|
460
|
-
end
|
461
|
-
end
|
462
|
-
|
463
|
-
def handle_table(e)
|
464
|
-
traverse(e)
|
465
|
-
end
|
466
|
-
|
467
463
|
def handle_text(e)
|
468
464
|
s = e.content().chomp
|
469
465
|
return '' if s.empty?
|
@@ -476,7 +472,7 @@ class CBETA::P5aToText
|
|
476
472
|
CGI.escapeHTML(r)
|
477
473
|
end
|
478
474
|
|
479
|
-
def
|
475
|
+
def e_tt(e)
|
480
476
|
@tt_type = e['type']
|
481
477
|
traverse(e)
|
482
478
|
end
|
@@ -488,12 +484,12 @@ class CBETA::P5aToText
|
|
488
484
|
abort "未處理底本" if @orig.nil?
|
489
485
|
|
490
486
|
@vol = vol
|
491
|
-
@
|
492
|
-
@out_vol = File.join(@output_root, @
|
493
|
-
FileUtils.remove_dir(@out_vol,
|
487
|
+
@canon = CBETA.get_canon_from_vol(vol)
|
488
|
+
@out_vol = File.join(@output_root, @canon, vol)
|
489
|
+
FileUtils.remove_dir(@out_vol, true)
|
494
490
|
FileUtils.makedirs @out_vol
|
495
491
|
|
496
|
-
source = File.join(@xml_root, @
|
492
|
+
source = File.join(@xml_root, @canon, vol)
|
497
493
|
Dir.entries(source).sort.each { |f|
|
498
494
|
next if f.start_with? '.'
|
499
495
|
fn = File.join(source, f)
|
@@ -503,8 +499,8 @@ class CBETA::P5aToText
|
|
503
499
|
|
504
500
|
def handle_vols(v1, v2)
|
505
501
|
puts "convert volumns: #{v1}..#{v2}"
|
506
|
-
@
|
507
|
-
folder = File.join(@xml_root, @
|
502
|
+
@canon = get_canon_from_vol(v1)
|
503
|
+
folder = File.join(@xml_root, @canon)
|
508
504
|
Dir.entries(folder).sort.each do |vol|
|
509
505
|
next if vol < v1
|
510
506
|
next if vol > v2
|
data/lib/cbeta/p5a_validator.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.
|
4
|
+
version: 2.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|