cbeta 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +16 -0
- data/lib/cbeta/p5a_to_epub.rb +51 -18
- data/lib/cbeta/p5a_to_html_for_every_edition.rb +756 -0
- data/lib/data/categories.json +4095 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74ddab76ff4ed1e2d344968065e14c1a8927c3ad
|
4
|
+
data.tar.gz: 63d97041bacf9b3e45db780f1338a3ccd98b0540
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de3374f578bc6025e02f88b232c00ad7414eb9d0d23209b7661236a7d9f203b326e8e6d79dee96c853abac2765a7eafd4dad5d14f840edac480f01c26112e8d7
|
7
|
+
data.tar.gz: 1aacbf6fa0137843b85af1e9cfb21b1d0af44230edf66720c92b696350683feb7c3c8f8c855c0e1c1fd12c9f254cfd8d03926d53a2b749060603ddef7efd1649
|
data/lib/cbeta.rb
CHANGED
@@ -43,6 +43,10 @@ class CBETA
|
|
43
43
|
next if row['abbreviation'].empty?
|
44
44
|
@canon_abbr[row['id']] = row['abbreviation']
|
45
45
|
end
|
46
|
+
|
47
|
+
fn = File.join(File.dirname(__FILE__), 'data/categories.json')
|
48
|
+
s = File.read(fn)
|
49
|
+
@categories = JSON.parse(s)
|
46
50
|
end
|
47
51
|
|
48
52
|
# 取得藏經略符
|
@@ -71,12 +75,24 @@ class CBETA
|
|
71
75
|
return nil if r.nil?
|
72
76
|
r.sub(/^【(.*?)】$/, '\1')
|
73
77
|
end
|
78
|
+
|
79
|
+
# 傳入經號,取得部類
|
80
|
+
# @param book_id [String] Book ID (經號), ex. "T0220"
|
81
|
+
# @return [String] 部類名稱,例如 "阿含部類"
|
82
|
+
#
|
83
|
+
# @example
|
84
|
+
# cbeta = CBETA.new
|
85
|
+
# cbeta.get_category('T0220') # return '般若部類'
|
86
|
+
# Looks up the category (部類) recorded for a work in the table loaded
# into @categories.
#
# @param book_id [String] work ID, e.g. "T0220"
# @return [String, nil] the category name, or nil when the ID is not listed
def get_category(book_id)
  @categories.fetch(book_id, nil)
end
|
74
89
|
end
|
75
90
|
|
76
91
|
require 'cbeta/gaiji'
|
77
92
|
require 'cbeta/bm_to_text'
|
78
93
|
require 'cbeta/p5a_to_epub'
|
79
94
|
require 'cbeta/p5a_to_html'
|
95
|
+
require 'cbeta/p5a_to_html_for_every_edition'
|
80
96
|
require 'cbeta/p5a_to_simple_html'
|
81
97
|
require 'cbeta/p5a_to_text'
|
82
98
|
require 'cbeta/p5a_validator'
|
data/lib/cbeta/p5a_to_epub.rb
CHANGED
@@ -27,6 +27,10 @@ class CBETA::P5aToEPUB
|
|
27
27
|
# @param temp_folder [String] 供 EPUB 暫存工作檔案的路徑
|
28
28
|
# @option opts [Integer] :epub_version (3) EPUB 版本
|
29
29
|
# @option opts [String] :graphic_base 圖檔路徑
|
30
|
+
# * graphic_base/covers: 封面圖檔位置
|
31
|
+
# * graphic_base/figures: 插圖圖檔位置
|
32
|
+
# * graphic_base/sd-gif: 悉曇字圖檔位置
|
33
|
+
# * graphic_base/rj-gif: 蘭札體圖檔位置
|
30
34
|
# @option opts [String] :front_page 內文前可以加一份 HTML 檔,例如「編輯說明」
|
31
35
|
# @option opts [String] :front_page_title 加在目錄的 front_page 標題
|
32
36
|
# @option opts [String] :back_page 內文後可以加一份 HTML 檔,例如「版權聲明」
|
@@ -75,8 +79,8 @@ class CBETA::P5aToEPUB
|
|
75
79
|
create_epub(output_path)
|
76
80
|
end
|
77
81
|
|
78
|
-
#
|
79
|
-
#
|
82
|
+
# 將某個資料夾下的每部作品都轉為一個對應的 EPUB。
|
83
|
+
# 跨冊的作品也會合成一個 EPUB。
|
80
84
|
#
|
81
85
|
# @example
|
82
86
|
# require 'cbeta'
|
@@ -87,19 +91,10 @@ class CBETA::P5aToEPUB
|
|
87
91
|
# c = CBETA::P5aToEPUB.new(TEMP, IMG)
|
88
92
|
# c.convert_folder('/Users/ray/Documents/Projects/D道安/xml-p5a/DA', '/temp/cbeta-epub/DA')
|
89
93
|
def convert_folder(input_folder, output_folder)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
p1 = File.join(input_folder, f)
|
95
|
-
if File.file?(p1)
|
96
|
-
f.sub!(/.xml$/, '.epub')
|
97
|
-
p2 = File.join(output_folder, f)
|
98
|
-
convert_file(p1, p2)
|
99
|
-
else
|
100
|
-
p2 = File.join(output_folder, f)
|
101
|
-
convert_folder(p1, p2)
|
102
|
-
end
|
94
|
+
@todo = {}
|
95
|
+
prepare_todo_list(input_folder, output_folder)
|
96
|
+
@todo.each_pair do |k, v|
|
97
|
+
convert_sutra(k, v[:xml_files], v[:epub])
|
103
98
|
end
|
104
99
|
end
|
105
100
|
|
@@ -129,13 +124,17 @@ class CBETA::P5aToEPUB
|
|
129
124
|
# ]
|
130
125
|
#
|
131
126
|
# c = CBETA::P5aToEPUB.new(TEMP)
|
132
|
-
# c.convert_sutra('T0220',
|
133
|
-
def convert_sutra(book_id,
|
127
|
+
# c.convert_sutra('T0220', xml_files, '/temp/cbeta-epub/T0220.epub')
|
128
|
+
def convert_sutra(book_id, xml_files, out)
|
134
129
|
@book_id = book_id
|
135
130
|
sutra_init
|
136
131
|
xml_files.each { |f| handle_file(f) }
|
137
132
|
|
138
|
-
|
133
|
+
if xml_files.size > 1
|
134
|
+
@title.sub!(/^(.*)\(.*?\)$/, '\1')
|
135
|
+
@title.sub!(/^(.*?)((.*?))+$/, '\1')
|
136
|
+
puts @title
|
137
|
+
end
|
139
138
|
create_epub(out)
|
140
139
|
end
|
141
140
|
|
@@ -199,6 +198,15 @@ class CBETA::P5aToEPUB
|
|
199
198
|
}
|
200
199
|
}
|
201
200
|
builder.book.version = @settings[:epub_version]
|
201
|
+
|
202
|
+
canon = book_id.sub(/^([A-Z]{1,2}).*$/, '\1')
|
203
|
+
cover = File.join(settings[:graphic_base], 'covers', canon, "#{book_id}.jpg")
|
204
|
+
if File.exist? cover
|
205
|
+
File.open(cover) do |io|
|
206
|
+
builder.book.add_item(cover, io).cover_image
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
202
210
|
builder.generate_epub(output_path)
|
203
211
|
puts "output: #{output_path}"
|
204
212
|
end
|
@@ -791,6 +799,31 @@ eos
|
|
791
799
|
text
|
792
800
|
end
|
793
801
|
|
802
|
+
# Recursively walks +input_folder+ and records in @todo, keyed by work ID,
# the XML files that make up each work and the EPUB path to write it to.
#
# The work ID is the file name with the volume number and ".xml" removed
# (e.g. "T01n0001.xml" -> "T0001"); every ID starting with "T0220" is
# folded into the single work "T0220".
#
# @param input_folder [String] folder containing XML files and/or sub-folders
# @param output_folder [String] folder mirroring +input_folder+; when its last
#   component looks like a volume folder (e.g. "T05") the EPUB is placed one
#   level above it
def prepare_todo_list(input_folder, output_folder)
  # Dir.foreach yields entries in filesystem order; sorting keeps the volumes
  # of a multi-volume work in ascending order in :xml_files.
  Dir.entries(input_folder).sort.each do |f|
    next if f.start_with? '.'
    p1 = File.join(input_folder, f)

    unless File.file?(p1)
      prepare_todo_list(p1, File.join(output_folder, f))
      next
    end

    work = f.sub(/^([A-Z]{1,2})\d{2,3}n(.*)\.xml$/, '\1\2')
    work = 'T0220' if work.start_with? 'T0220'
    entry = (@todo[work] ||= { xml_files: [] })
    entry[:xml_files] << p1

    # File.basename/dirname cope with empty, relative and root paths, where
    # splitting on '/' would yield an empty component list.
    folder = output_folder
    folder = File.dirname(folder) if File.basename(folder) =~ /^[A-Z]{1,2}\d{2,3}$/
    folder = '.' if folder.empty?
    FileUtils::mkdir_p folder
    entry[:epub] = File.join(folder, "#{work}.epub")
  end
end
|
826
|
+
|
794
827
|
def remove_empty_nav(node_list)
|
795
828
|
node_list.each do |n|
|
796
829
|
if n[:nav].empty?
|
@@ -0,0 +1,756 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
require 'date'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'json'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
# Convert CBETA XML P5a to HTML for every edition
|
9
|
+
#
|
10
|
+
# CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
|
11
|
+
#
|
12
|
+
# 轉檔規則請參考: http://wiki.ddbc.edu.tw/pages/CBETA_XML_P5a_轉_HTML
|
13
|
+
class CBETA::P5aToHTMLForEveryEdition
|
14
|
+
# 內容不輸出的元素
|
15
|
+
PASS=['back', 'teiHeader']
|
16
|
+
|
17
|
+
# 某版用字缺的符號
|
18
|
+
MISSING = '-'
|
19
|
+
|
20
|
+
private_constant :PASS, :MISSING
|
21
|
+
|
22
|
+
# @param xml_root [String] 來源 CBETA XML P5a 路徑
|
23
|
+
# @param out_root [String] 輸出 HTML 路徑
|
24
|
+
# Remembers the source and output roots and creates the CBETA and Gaiji
# helper objects used during conversion.
#
# @param xml_root [String] path of the CBETA XML P5a source tree
# @param out_root [String] path under which HTML output is written
def initialize(xml_root, out_root)
  @xml_root, @out_root = xml_root, out_root
  @cbeta = CBETA.new
  @gaijis = CBETA::Gaiji.new
end
|
30
|
+
|
31
|
+
# Converts CBETA XML P5a to HTML.
#
# @example Convert volume 1 of the Taisho canon:
#
#   x2h = CBETA::P5aToHTMLForEveryEdition.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T01')
#
# @example Convert the whole Taisho canon:
#
#   x2h = CBETA::P5aToHTMLForEveryEdition.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T')
#
# @example Convert Taisho volumes 5 through 7:
#
#   x2h = CBETA::P5aToHTMLForEveryEdition.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
#   x2h.convert('T05..T07')
#
# T is the ID of the Taisho canon; for the CBETA canon ID system see
# http://www.cbeta.org/format/id.php
#
# @param target [String, nil] nil for everything, a one-letter collection ID,
#   a volume ID, or a "FIRST..LAST" volume range (case-insensitive)
def convert(target=nil)
  return convert_all if target.nil?

  arg = target.upcase
  return handle_collection(arg) if arg.size == 1
  return handle_vol(arg) unless arg.include? '..'

  # A malformed range (anything other than "A..B") is ignored, as before.
  range = arg.match(/^([^\.]+?)\.\.([^\.]+)$/)
  handle_vols(range[1], range[2]) if range
end
|
65
|
+
|
66
|
+
private
|
67
|
+
|
68
|
+
# Converts every collection directly under @xml_root. Only entries whose
# name is a single capital letter are passed to handle_collection.
def convert_all
  Dir.foreach(@xml_root) do |entry|
    handle_collection(entry) if entry.match(/^[A-Z]$/)
  end
end
|
74
|
+
|
75
|
+
# Renders an <anchor> element.
#
# * id starting with "nkr_note_orig": the note registered under that id in
#   @notes is appended to the back matter of the current juan and a note
#   anchor link is returned.
# * id starting with "fx": a star marker is returned.
# * type "circle": the "◎" symbol is returned.
# * anything else: an empty string.
#
# @param e [#[]] the anchor element
# @return [String] HTML fragment
def handle_anchor(e)
  id = e['id']
  if id
    if id.start_with?('nkr_note_orig')
      # @notes is not populated anywhere in this class, so look the note up
      # defensively instead of raising on nil; an anchor without a registered
      # note falls through to the checks below.
      note = @notes && @notes[id]
      unless note.nil?
        note_text = traverse(note)
        n = id[/^nkr_note_orig_(.*)$/, 1]
        @back[@juan] += "<span class='footnote' id='n#{n}'>#{note_text}</span>\n"
        return "<a class='noteAnchor' href='#n#{n}'></a>"
      end
    elsif id.start_with? 'fx'
      return "<span class='star'>[*]</span>"
    end
  end

  return '◎' if e['type'] == 'circle'

  ''
end
|
97
|
+
|
98
|
+
# Renders an <app> element: its converted children, preceded by a star note
# anchor when the element has type "star" (the target is the corresp
# attribute without its first character).
#
# @param e [#[]] the app element
# @return [String] HTML fragment
def handle_app(e)
  prefix =
    if e['type'] == 'star'
      target = e['corresp'][1..-1]
      "<a class='noteAnchor star' href='#n#{target}'></a>"
    else
      ''
    end
  prefix + traverse(e)
end
|
106
|
+
|
107
|
+
# Renders a <byline> element as a paragraph of class "byline" that starts
# with a lineInfo span holding the line number current before the children
# are converted.
#
# @param e the byline element
# @return [String] HTML fragment
def handle_byline(e)
  # Capture @lb first: converting the children may advance it.
  line_info = "<span class='lineInfo'>#{@lb}</span>"
  '<p class="byline">' + line_info + traverse(e) + '</p>'
end
|
113
|
+
|
114
|
+
# Renders a table <cell> as a div of class "bip-table-cell", copying the
# source "rows"/"cols" attributes to "rowspan"/"colspan" when present.
#
# @param e the cell element
# @return [String] serialized HTML fragment
def handle_cell(e)
  cell = Nokogiri::XML::Document.new.create_element('div')
  cell['class'] = 'bip-table-cell'
  { 'rows' => 'rowspan', 'cols' => 'colspan' }.each do |source_attr, html_attr|
    cell[html_attr] = e[source_attr] if e.key? source_attr
  end
  cell.inner_html = traverse(e)
  to_html(cell)
end
|
123
|
+
|
124
|
+
# Converts every volume folder of one collection.
#
# @param c [String] collection ID; also stored in @series
def handle_collection(c)
  @series = c
  puts "handle_collection #{c}"
  skipped = ['.', '..', '.DS_Store']
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) unless skipped.include? vol
  end
end
|
133
|
+
|
134
|
+
# Wraps a <corr> element in an <r> marker attributed to the 【CBETA】
# edition, tagged with the line number and character offset current before
# the children are converted.
#
# NOTE(review): the marker carries the attribute name `w` twice (edition and
# character offset); which value a parser keeps for a duplicated attribute is
# parser-dependent — confirm against the consumer of these markers.
#
# @param e the corr element
# @return [String] marker fragment
def handle_corr(e)
  open_tag = "<r w='【CBETA】' l='#{@lb}' w='#{@char_count}'>"
  "#{open_tag}#{traverse(e)}</r>"
end
|
137
|
+
|
138
|
+
# Renders a <div>. Every div increments @div_count; only divs with a "type"
# attribute produce an HTML div (class "div-TYPE") and are tracked in
# @open_divs while their children are converted.
#
# @param e the div element
# @return [String] HTML fragment
def handle_div(e)
  @div_count += 1
  return traverse(e) unless e.has_attribute? 'type'

  seq = @div_count
  @open_divs << e
  inner = traverse(e)
  @open_divs.pop
  "<!-- begin div#{seq}--><div class='div-#{e['type']}'>#{inner}</div><!-- end of div#{seq} -->"
end
|
150
|
+
|
151
|
+
# Renders a <figure> element as a paragraph of class "figure".
#
# @param e the figure element
# @return [String] HTML fragment
def handle_figure(e)
  "<p class='figure'>#{traverse(e)}</p>"
end
|
154
|
+
|
155
|
+
# Renders a <g> (gaiji / non-standard character) reference.
#
# Selection rules for the displayed form:
#   if the character has a unicode mapping
#     if it is outside the Unicode Extension C, D, E range
#       use it directly
#     else
#       display the unicode character, but keep the gaiji info for a popup
#   else if it has a normal_unicode mapping
#     display normal_unicode, keeping the gaiji info for a popup
#   else if it has a normalized form
#     display the normalized form, keeping the gaiji info for a popup
#   else
#     display the composition expression (zzs), keeping the gaiji info
#
# In 'txt' mode the romanization (Siddham, ids starting with "SD") or the
# composition expression is returned instead. Aborts the process when the
# gaiji data, or the composition expression needed in 'txt' mode, is missing.
#
# @param e [#[]] the g element; its "ref" attribute minus the first
#   character is the gaiji id
# @param mode [String] conversion mode, 'txt' or anything else
# @return [String] text or HTML fragment
def handle_g(e, mode)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
  zzs = g['zzs']

  if mode == 'txt'
    return g['roman'] if gid.start_with?('SD')
    abort "缺組字式:#{g}" if zzs.nil?
    return zzs
  end

  @char_count += 1

  if gid.start_with?('SD')
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return "<span class='siddam' roman='#{g['roman']}' code='#{gid}' char='#{g['sd-char']}'/>"
  end

  if gid.start_with?('RJ')
    return "<span class='ranja' roman='#{g['roman']}' code='#{gid}' char='#{g['rj-char']}'/>"
  end

  default = ''
  if g.has_key?('unicode')
    # Outside ext-C/D/E the unicode character is used directly.
    return g['unicode-char'] unless (0x2A700..0x2CEAF).include? g['unicode'].hex
    default = g['unicode-char']
  end

  nor = ''
  if g.has_key?('normal_unicode')
    nor = g['normal_unicode']
    default = nor if default.empty?
  end

  if g.has_key?('normal')
    nor += ', ' unless nor == ''
    nor += g['normal']
    default = g['normal'] if default.empty?
  end

  default = zzs if default.empty?

  href = "http://dict.cbeta.org/dict_word/gaiji-cb/#{gid[2, 2]}/#{gid}.gif"
  # Register the gaiji info once per juan in the back matter.
  unless @back[@juan].include?(href)
    @back[@juan] += "<span id='#{gid}' class='gaijiInfo' figure_url='#{href}' zzs='#{zzs}' nor='#{nor}'>#{default}</span>\n"
  end
  "<a class='gaijiAnchor' href='##{gid}'>#{default}</a>"
end
|
229
|
+
|
230
|
+
# Renders a <graphic> element as an empty span of class "graphic" whose
# imgsrc attribute is the base name of the element's url.
#
# @param e [#[]] the graphic element
# @return [String] HTML fragment
def handle_graphic(e)
  "<span imgsrc='#{File.basename(e['url'])}' class='graphic'></span>"
end
|
234
|
+
|
235
|
+
# Renders a <head> element as a paragraph of class "head" whose
# data-head-level is the number of currently open typed divs. Heads of type
# "added" produce no output.
#
# @param e [#[]] the head element
# @return [String] HTML fragment
def handle_head(e)
  return '' if e['type'] == 'added'

  level = @open_divs.size
  "<p class='head' data-head-level='#{level}'>#{traverse(e)}</p>"
end
|
243
|
+
|
244
|
+
# Renders a list <item> as an <li> followed by a newline.
#
# @param e the item element
# @return [String] HTML fragment
def handle_item(e)
  "<li>#{traverse(e)}</li>\n"
end
|
247
|
+
|
248
|
+
# Renders a <juan> element as a paragraph of class "juan".
#
# @param e the juan element
# @return [String] HTML fragment
def handle_juan(e)
  "<p class='juan'>#{traverse(e)}</p>"
end
|
251
|
+
|
252
|
+
# Renders a verse line <l> as a div of class "lg-cell".
#
# Inside an "abnormal" lg only the converted children are returned. The
# first <l> of an lg takes the text-indent declaration from the parent's
# "rend" attribute as its style. When no lg-row is open, an opening lg-row
# div is put in front of the cell. @in_l is true while the line is built.
#
# @param e the l element
# @return [String] HTML fragment
def handle_l(e)
  return traverse(e) if @lg_type == 'abnormal'

  @in_l = true

  cell = Nokogiri::XML::Document.new.create_element('div')
  cell['class'] = 'lg-cell'
  cell.inner_html = traverse(e)

  if @first_l
    lg = e.parent
    if lg.has_attribute?('rend')
      indent = lg['rend'][/text-indent:[^:]*/]
      cell['style'] = indent if indent
    end
    @first_l = false
  end

  html = to_html(cell)
  unless @lg_row_open
    html = "\n<div class='lg-row'>" + html
    @lg_row_open = true
  end
  @in_l = false
  html
end
|
283
|
+
|
284
|
+
# Renders a line break <lb>.
#
# Only lb elements whose "ed" equals the current series are handled (the
# Xuzangjing has both X and R lb elements; only X is processed). Handling
# resets the character counter, records the line number in @lb, closes an
# open lg-row unless the lb sits inside an <l>, emits the line-head span and
# flushes text buffered for the next line.
#
# @param e [#[]] the lb element
# @return [String] HTML fragment, '' for another edition's lb
def handle_lb(e)
  return '' if e['ed'] != @series

  @char_count = 1
  @lb = e['n']
  line_head = @sutra_no + '_p' + e['n']

  parts = []
  if @lg_row_open && !@in_l
    # Each verse line lives in its own lg-row.
    # T46n1937, p. 914a01: an <l> wraps a double-line note spanning lines.
    # T20n1092, 337c16: an lb in the middle of an <l> must not end the row.
    parts << "</div><!-- end of lg-row -->"
    @lg_row_open = false
  end
  parts << "<span class='lb' id='#{line_head}'>#{line_head}</span>"
  unless @next_line_buf.empty?
    parts << @next_line_buf
    @next_line_buf = ''
  end
  parts.join
end
|
307
|
+
|
308
|
+
# Wraps a <lem> in an <r> marker listing its witnesses (the 【…】 sigla
# found in "wit", joined by spaces) and registers them in @editions. Line
# number and character offset are those current after the children have
# been converted.
#
# NOTE(review): the marker carries the attribute name `w` twice (witnesses
# and character offset) — confirm how the consumer resolves the duplicate.
#
# @param e [#[]] the lem element
# @return [String] marker fragment
def handle_lem(e)
  witnesses = e['wit'].scan(/【.*?】/)
  @editions.merge witnesses
  body = traverse(e)
  "<r w='#{witnesses.join(' ')}' l='#{@lb}' w='#{@char_count}'>#{body}</r>"
end
|
316
|
+
|
317
|
+
# Renders a verse group <lg>.
#
# An lg of type "abnormal" becomes a paragraph of class "lg-abnormal".
# Otherwise it becomes a div of class "lg" whose style is the "rend"
# attribute with any text-indent declaration removed; an lg-row still open
# after the children have been converted is closed.
#
# @param e the lg element
# @return [String] HTML fragment
def handle_lg(e)
  @lg_type = e['type']
  return "<p class='lg-abnormal'>" + traverse(e) + "</p>" if @lg_type == 'abnormal'

  @first_l = true
  node = Nokogiri::XML::Document.new.create_element('div')
  node['class'] = 'lg'
  node['style'] = e['rend'].gsub(/text-indent:[^:]*/, '') if e.has_attribute?('rend')

  @lg_row_open = false
  node.inner_html = traverse(e)
  if @lg_row_open
    node.inner_html += '</div><!-- end of lg -->'
    @lg_row_open = false
  end
  "\n" + to_html(node)
end
|
341
|
+
|
342
|
+
# Renders a <list> element as an unordered list.
#
# @param e the list element
# @return [String] HTML fragment
def handle_list(e)
  "<ul>#{traverse(e)}</ul>"
end
|
345
|
+
|
346
|
+
# Handles a <milestone>. Only unit "juan" has an effect: all open divs are
# closed, the current juan number is updated, the new juan's back matter
# starts as @back[0], a "<juan N>" separator is emitted and the open divs
# are re-opened.
#
# @param e [#[]] the milestone element
# @return [String] HTML fragment, '' for other units
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  # A div may span juans and has to be closed first,
  # e.g. T55n2154, p. 680a29, spanning juans 19 and 20.
  closing = "</div>" * @open_divs.size
  @juan = e['n'].to_i
  @back[@juan] = @back[0]
  reopening = @open_divs.map { |d| "<div class='div-#{d['type']}'>" }.join
  "#{closing}<juan #{@juan}>#{reopening}"
end
|
360
|
+
|
361
|
+
# Renders a table-of-contents <mulu> entry. Only entries of type "品"
# produce output: an empty <mulu class='pin'> element whose s attribute is
# the entry's text, converted in 'txt' mode with @pass pushed to false.
#
# @param e [#[]] the mulu element
# @return [String] HTML fragment
def handle_mulu(e)
  return '' unless e['type'] == '品'

  @pass << false
  title = traverse(e, 'txt')
  @pass.pop
  "<mulu class='pin' s='#{title}'/>"
end
|
370
|
+
|
371
|
+
# Dispatches one XML node to its handler.
#
# Comments and elements listed in PASS produce no output, text nodes go to
# handle_text, <foreign> and <reg> are dropped, and elements without a
# dedicated handler are traversed transparently.
#
# @param e the node to convert
# @param mode [String] conversion mode, forwarded to handle_text, handle_g
#   and the fallback traversal
# @return [String] converted fragment
def handle_node(e, mode)
  return '' if e.comment?
  return handle_text(e, mode) if e.text?
  return '' if PASS.include?(e.name)

  case e.name
  when 'anchor'    then handle_anchor(e)
  when 'app'       then handle_app(e)
  when 'byline'    then handle_byline(e)
  when 'cell'      then handle_cell(e)
  when 'corr'      then handle_corr(e)
  when 'div'       then handle_div(e)
  when 'figure'    then handle_figure(e)
  when 'foreign'   then ''
  when 'g'         then handle_g(e, mode)
  when 'graphic'   then handle_graphic(e)
  when 'head'      then handle_head(e)
  when 'item'      then handle_item(e)
  when 'juan'      then handle_juan(e)
  when 'l'         then handle_l(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'lg'        then handle_lg(e)
  when 'list'      then handle_list(e)
  when 'mulu'      then handle_mulu(e)
  when 'note'      then handle_note(e)
  when 'milestone' then handle_milestone(e)
  when 'p'         then handle_p(e)
  when 'rdg'       then handle_rdg(e)
  when 'reg'       then ''
  when 'row'       then handle_row(e)
  when 'sic'       then handle_sic(e)
  when 'sg'        then handle_sg(e)
  when 't'         then handle_t(e)
  when 'tt'        then handle_tt(e)
  when 'table'     then handle_table(e)
  else
    # Keep the caller's mode for transparent wrappers; dropping it would
    # reset the subtree to the default 'html' mode.
    traverse(e, mode)
  end
end
|
410
|
+
|
411
|
+
# Renders a <note> element according to its type:
#
# * "equivalent", "rest" and types starting with "cf" produce nothing
# * "orig", "orig_biao", "orig_ke" are delegated to handle_note_orig
# * "mod" stores the note text in the current juan's back matter and returns
#   an anchor link
#
# Any other note is dropped when its "resp" starts with "CBETA", wrapped in
# a span when place="inline", and otherwise rendered as its converted
# children.
#
# @param e the note element
# @return [String] HTML fragment
def handle_note(e)
  n = e['n']
  if e.has_attribute?('type')
    t = e['type']
    case t
    when 'equivalent', 'rest' then return ''
    when 'orig'      then return handle_note_orig(e)
    when 'orig_biao' then return handle_note_orig(e, 'biao')
    when 'orig_ke'   then return handle_note_orig(e, 'ke')
    when 'mod'
      @pass << false
      body = traverse(e)
      @pass.pop
      @back[@juan] += "<span class='footnote_cb' id='n#{n}'>#{body}</span>\n"
      return "<a class='noteAnchor' href='#n#{n}'></a>"
    else
      return '' if t.start_with?('cf')
    end
  end

  return '' if e.has_attribute?('resp') && e['resp'].start_with?('CBETA')

  inner = traverse(e)
  if e.has_attribute?('place') && e['place'] == 'inline'
    "<span class='doube-line-note'>#{inner}</span>"
  else
    inner
  end
end
|
448
|
+
|
449
|
+
# Stores an original-edition note in the current juan's back matter and
# returns its anchor link. No anchor is returned when a "mod" note with the
# same number exists (per @mod_notes).
#
# @param e [#[]] the note element
# @param anchor_type [String, nil] 'biao' or 'ke' adds a data-label made of
#   "標"/"科" plus the last two characters of the note number
# @return [String] HTML fragment
def handle_note_orig(e, anchor_type=nil)
  n = e['n']
  @pass << false
  body = traverse(e)
  @pass.pop
  @back[@juan] += "<span class='footnote_orig' id='n#{n}'>#{body}</span>\n"

  return '' if @mod_notes.include? n

  prefix = { 'biao' => '標', 'ke' => '科' }[anchor_type]
  label = prefix ? " data-label='#{prefix}#{n[-2..-1]}'" : ''
  "<a class='noteAnchor' href='#n#{n}'#{label}></a>"
end
|
467
|
+
|
468
|
+
# Renders a <p> element as a paragraph that starts with a lineInfo span
# holding the line number current before the children are converted.
#
# @param e the p element
# @return [String] HTML fragment
def handle_p(e)
  # Capture @lb first: converting the children may advance it.
  line_info = "<span class='lineInfo'>#{@lb}</span>"
  '<p>' + line_info + traverse(e) + '</p>'
end
|
474
|
+
|
475
|
+
# Wraps a variant reading <rdg> in an <r> marker carrying its raw "wit"
# value, and registers every 【…】 siglum of that value in @editions. Line
# number and character offset are those current after the children have
# been converted.
#
# NOTE(review): the marker carries the attribute name `w` twice (witnesses
# and character offset) — confirm how the consumer resolves the duplicate.
#
# @param e [#[]] the rdg element
# @return [String] marker fragment
def handle_rdg(e)
  body = traverse(e)
  wit = e['wit']
  @editions.merge wit.scan(/【.*?】/)
  "<r w='#{wit}' l='#{@lb}' w='#{@char_count}'>#{body}</r>"
end
|
481
|
+
|
482
|
+
# Renders a table <row> as a div of class "bip-table-row".
#
# @param e the row element
# @return [String] HTML fragment
def handle_row(e)
  "<div class='bip-table-row'>#{traverse(e)}</div>"
end
|
485
|
+
|
486
|
+
# Renders an <sg> element as its converted children wrapped in parentheses.
#
# @param e the sg element
# @return [String] text fragment
def handle_sg(e)
  "(#{traverse(e)})"
end
|
489
|
+
|
490
|
+
# Wraps a <sic> element in an <r> marker attributed to the base edition
# (@orig), tagged with the line number and character offset current before
# the children are converted.
#
# NOTE(review): the marker carries the attribute name `w` twice (edition and
# character offset) — confirm how the consumer resolves the duplicate.
#
# @param e the sic element
# @return [String] marker fragment
def handle_sic(e)
  open_tag = "<r w='#{@orig}' l='#{@lb}' w='#{@char_count}'>"
  "#{open_tag}#{traverse(e)}</r>"
end
|
493
|
+
|
494
|
+
# Converts one XML file: resets the per-file state, converts the document to
# one HTML string and writes one output per juan.
#
# The converted text is cut at the "<juan N>" separators; any text before
# the first separator is prepended to the first juan.
#
# NOTE(review): a file that yields no "<juan N>" separator writes nothing —
# confirm whether such files exist in the corpus.
#
# @param xml_fn [String] path of the source XML file; its base name without
#   ".xml" becomes @sutra_no
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"
  @editions = Set.new ["【CBETA】"]
  @back = { 0 => '' }
  @char_count = 1
  @dila_note = 0
  @div_count = 0
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Move note anchors inside the following lg-cell; otherwise rendering an
  # lg as a table breaks.
  text.gsub!(/(<a class='noteAnchor'[^>]*><\/a>)(<div class="lg-cell"[^>]*>)/, '\2\1')

  juan_no = nil
  buf = ''
  # One output per juan.
  text.split(/(<juan \d+>)/).each do |piece|
    separator = piece.match(/<juan (\d+)>$/)
    if separator
      juan_no = separator[1].to_i
    elsif juan_no.nil?
      buf = piece
    else
      write_juan(juan_no, buf + piece)
      buf = ''
    end
  end
end
|
532
|
+
|
533
|
+
# Renders a <t> element.
#
# A <t> whose "place" contains "foot" is dropped. Inside a tt of type "app"
# (not a Siddham/Chinese two-line parallel) the converted children are
# returned as is. Otherwise the position among the sibling <t> elements
# decides: the first is emitted followed by a full-width space, the second
# is buffered for the next line, any later one is emitted unchanged.
#
# @param e the t element
# @return [String] HTML fragment
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  r = traverse(e)
  return r if @tt_type == 'app'

  # Two-line parallel layout.
  case e.xpath('../t').index(e)
  when 0
    r + ' '
  when 1
    @next_line_buf += r + ' '
    ''
  else
    r
  end
end
|
554
|
+
|
555
|
+
# Renders a <tt> element: records its type in @tt_type (read by handle_t)
# and returns the converted children.
#
# @param e [#[]] the tt element
# @return [String] HTML fragment
def handle_tt(e)
  kind = e['type']
  @tt_type = kind
  traverse(e)
end
|
559
|
+
|
560
|
+
# Renders a <table> element as a div of class "bip-table".
#
# @param e the table element
# @return [String] HTML fragment
def handle_table(e)
  "<div class='bip-table'>#{traverse(e)}</div>"
end
|
563
|
+
|
564
|
+
# Renders a text node.
#
# Text that is empty after chomp, and text directly inside <app>, produces
# nothing. Line breaks are removed and HTML special characters escaped. In
# the body (@pass.last true) and 'html' mode the text is wrapped in a span
# of class "t" carrying the line number and character offset, and the
# offset advances by the unescaped text length.
#
# @param e the text node
# @param mode [String] conversion mode
# @return [String] HTML fragment
def handle_text(e, mode)
  raw = e.content.chomp
  return '' if raw.empty? || e.parent.name == 'app'

  # CBETA XML has superfluous line breaks between characters.
  stripped = raw.delete("\n\r")

  # Escape HTML special characters, e.g. & becomes &amp;
  escaped = CGI.escapeHTML(stripped)
  return escaped unless @pass.last && mode == 'html'

  # Body text is wrapped in a span.
  wrapped = "<span class='t' l='#{@lb}' w='#{@char_count}'>#{escaped}</span>"
  @char_count += stripped.size
  wrapped
end
|
584
|
+
|
585
|
+
# Converts every file of one volume.
#
# The first character of +vol+ is the canon ID: it selects the base-edition
# siglum (@orig), becomes @series and names the output sub-folder, which is
# created if needed. Aborts the process when the canon has no siglum.
#
# @param vol [String] volume ID, e.g. "T01"
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  canon = vol[0]
  @orig = @cbeta.get_canon_symbol(canon)
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @series = canon
  @out_folder = File.join(@out_root, @series)
  FileUtils::mkdir_p @out_folder

  source = File.join(@xml_root, @series, vol)
  Dir["#{source}/*"].each { |f| handle_sutra(f) }
end
|
601
|
+
|
602
|
+
# Converts every volume of a series whose folder name lies between +v1+ and
# +v2+ inclusive (string comparison).
#
# @param v1 [String] first volume ID, e.g. "T05"; its first character is the series
# @param v2 [String] last volume ID, e.g. "T07"
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = v1[0]
  Dir.foreach(File.join(@xml_root, @series)) do |vol|
    handle_vol(vol) if vol.between?(v1, v2)
  end
end
|
612
|
+
|
613
|
+
# Builds the "basis for revision" sentence from the cf-type <note> children
# of an element, e.g. T32n1670A.xml, p. 703a16:
#   <note type="cf1">K30n1002_p0257a01-a23</note>
# A reference that resolves to an existing file (per
# linehead_exist_in_cbeta) is wrapped in a span of class "note_cf".
#
# @param e element whose direct <note> children are inspected
# @return [String] the sentence, or '' when there is no cf note
def lem_note_cf(e)
  cf_notes = e.xpath('./note').select do |n|
    n.key?('type') && n['type'].start_with?('cf')
  end
  refs = cf_notes.map do |n|
    ref = n.content
    linehead_exist_in_cbeta(ref) ? "<span class='note_cf'>#{ref}</span>" : ref
  end
  return '' if refs.empty?

  '修訂依據:' + refs.join(';') + '。'
end
|
632
|
+
|
633
|
+
# Describes the base-edition readings that accompany a <lem>: for every
# sibling <rdg> whose "wit" includes the base siglum (@orig), the siglum and
# the reading (MISSING when empty) are appended; a non-empty result ends
# with "。". Readings are converted in 'back' mode with @pass pushed to false.
#
# @param lem the lem element; its parent's rdg children are inspected
# @return [String] description, possibly ''
def lem_note_rdg(lem)
  app = lem.parent
  @pass << false
  out = ''
  app.xpath('rdg').each do |rdg|
    next unless rdg['wit'].include? @orig

    reading = traverse(rdg, 'back')
    reading = MISSING if reading.empty?
    out += @orig + reading
  end
  @pass.pop
  out.empty? ? out : out + '。'
end
|
648
|
+
|
649
|
+
# Tells whether a line-head reference such as "K30n1002_p0257a01-a23" points
# at a file present under @xml_root.
#
# The reference must start with a volume ID (capital letter + digits), "n",
# the work number and an optional letter; the file looked up is
# <xml_root>/<canon letter>/<volume>/<volume>n<work>.xml.
#
# @param s [String] the reference text
# @return [Boolean] true when the file exists, false when it does not or the
#   reference has an unexpected shape
def linehead_exist_in_cbeta(s)
  ref = s.match(/^(([A-Z]\d+)n\d+[a-zA-Z]?).*$/)
  return false if ref.nil?

  sutra = ref[1]
  vol = ref[2]
  # Take the canon letter from the matched ID so that all path components
  # come from the same match.
  File.exist? File.join(@xml_root, vol[0], vol, sutra + '.xml')
end
|
661
|
+
|
662
|
+
# Reads an XML file, applies two textual fix-ups and parses it.
#
# @param fn [String] path of the XML file
# @return the parsed document with namespaces removed
def open_xml(fn)
  xml = File.read(fn)

  if fn.include? 'T16n0657'
    # Here a double-line note spans two verse lines: move the lb before the
    # end of the note so that the lg-row ends first, then the note.
    xml.sub!(/(<\/note>)(\n<lb n="0206b29" ed="T"\/>)/, '\2\1')
  end

  # The lb (and pb) elements right before a juan milestone belong to the new
  # juan, so the milestone is moved in front of them.
  xml.gsub!(%r{((?:<pb [^>]+>\n?)?(?:<lb [^>]+>\n?)+)(<milestone [^>]*unit="juan"[^/>]*/>)}, '\2\1')

  Nokogiri::XML(xml).tap { |doc| doc.remove_namespaces!() }
end
|
679
|
+
|
680
|
+
# Collects the "n" attribute of every <note type="mod"> in the document
# into @mod_notes.
#
# @param doc the parsed document
def read_mod_notes(doc)
  doc.xpath("//note[@type='mod']").each do |note|
    @mod_notes << note['n']
  end
end
|
685
|
+
|
686
|
+
# Parses one XML file and converts its body.
#
# @title is set to the last whitespace-separated word of the titleStmt
# title (converted in 'txt' mode), mod notes are collected, and the body is
# converted with @pass reset to [true].
#
# @param xml_fn [String] path of the XML file
# @return [String] the converted body
def parse_xml(xml_fn)
  @pass = [false]

  doc = open_xml(xml_fn)

  title_node = doc.xpath("//titleStmt/title")[0]
  @title = traverse(title_node, 'txt').split()[-1]

  read_mod_notes(doc)

  body = doc.root().xpath("text/body")[0]
  @pass = [true]

  traverse(body)
end
|
704
|
+
|
705
|
+
# Serializes a node as UTF-8 XML using the AS_XML save option.
#
# @param e the node to serialize
# @return [String] serialized markup
def to_html(e)
  as_xml = Nokogiri::XML::Node::SaveOptions::AS_XML
  e.to_xml(encoding: 'UTF-8', save_with: as_xml)
end
|
708
|
+
|
709
|
+
# Converts all children of a node and concatenates the results.
#
# @param e node whose children are converted
# @param mode [String] conversion mode passed to handle_node
# @return [String] concatenated fragments
def traverse(e, mode='html')
  e.children.inject('') { |out, child| out + handle_node(child, mode) }
end
|
717
|
+
|
718
|
+
# Writes one juan as one HTML file per edition found in @editions.
#
# The output folder is <out_folder>/<work>/<3-digit juan>; it is removed and
# recreated on every call. For each edition, every <r> marker whose
# witnesses include that edition is replaced by its content and every other
# marker is dropped. The file is named after the edition siglum without its
# brackets.
#
# @param juan_no [Integer] juan number
# @param html [String] converted HTML of the juan, still containing <r> markers
def write_juan(juan_no, html)
  work =
    if @sutra_no.match(/^(T05|T06|T07)n0220/)
      "T0220"
    else
      @sutra_no.sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
    end
  juan = "%03d" % juan_no
  folder = File.join(@out_folder, work, juan)
  FileUtils.remove_dir(folder, true)
  FileUtils.makedirs folder

  @editions.each do |ed|
    frag = Nokogiri::HTML.fragment("<div id='body'>#{html}</div>")
    frag.search("r").each do |node|
      # A marker may list several witnesses (e.g. "【宋】【元】"), so test for
      # membership: strict equality would drop such readings everywhere.
      node.add_previous_sibling node.inner_html if node['w'].to_s.include?(ed)
      node.remove
    end
    text = frag.to_html

    fn = ed.sub(/^【(.*)】$/, '\1') + '.htm'
    output_path = File.join(folder, fn)
    text = <<eos
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="filename" content="#{fn}" />
<title>#{@title}</title>
</head>
<body>
#{text}
</body></html>
eos
    File.write(output_path, text)
  end
end
|
755
|
+
|
756
|
+
end
|