cbeta 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +0 -1
- data/lib/cbeta/p5a_to_text.rb +90 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f10484530e1c47830c305dfd75004aeae5fa24e5
|
4
|
+
data.tar.gz: e634ef5b83ce81564c1c9e916370a1b024a8a179
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a498957ee9d982b5bde85ddd44a1671297c18acad54ffe2c1b9c48eb196f70a1916d2a7d769ec9c0b7a8a8b65c77f43d60e888df3dbb35fe5236af21eb748574
|
7
|
+
data.tar.gz: 173d4d21a6d36088ff57448e2da79b7a9f1bd5a6d5813cd23e06aabacfc39c83637cdb3248bc20b3aa15147b65fe726bff011f58a6a109b8ca2edcd1b360e939
|
data/lib/cbeta.rb
CHANGED
data/lib/cbeta/p5a_to_text.rb
CHANGED
@@ -8,13 +8,21 @@ require 'set'
|
|
8
8
|
# Convert CBETA XML P5a to Text
|
9
9
|
#
|
10
10
|
# CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
|
11
|
+
#
|
12
|
+
# @example for convert 大正藏第一冊 in app format:
|
13
|
+
#
|
14
|
+
# c = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', 'app')
|
15
|
+
# c.convert('T01')
|
16
|
+
#
|
11
17
|
class CBETA::P5aToText
|
12
18
|
|
13
19
|
# @param xml_root [String] 來源 CBETA XML P5a 路徑
|
14
20
|
# @param output_root [String] 輸出 Text 路徑
|
15
|
-
|
21
|
+
# @param format [String] 輸出格式,例:'app'
|
22
|
+
def initialize(xml_root, output_root, format=nil)
|
16
23
|
@xml_root = xml_root
|
17
24
|
@output_root = output_root
|
25
|
+
@format = format
|
18
26
|
@cbeta = CBETA.new
|
19
27
|
@gaijis = CBETA::Gaiji.new
|
20
28
|
|
@@ -61,6 +69,42 @@ class CBETA::P5aToText
|
|
61
69
|
|
62
70
|
private
|
63
71
|
|
72
|
+
# 跨行字詞移到下一行
|
73
|
+
def appify(text)
|
74
|
+
r = ''
|
75
|
+
i = 0
|
76
|
+
app = ''
|
77
|
+
text.each_line do |line|
|
78
|
+
line.chomp!
|
79
|
+
if line.match(/^(.*)║(.*)$/)
|
80
|
+
r += $1
|
81
|
+
t = $2
|
82
|
+
r += "(%02d)" % i
|
83
|
+
r += "║#{app}"
|
84
|
+
app = ''
|
85
|
+
i = 0
|
86
|
+
chars = t.chars
|
87
|
+
until chars.empty?
|
88
|
+
c = chars.pop
|
89
|
+
if c == "\t"
|
90
|
+
break
|
91
|
+
elsif ' :》」』、;,!?。'.include? c
|
92
|
+
chars << c
|
93
|
+
break
|
94
|
+
elsif '《「『'.include? c # 這些標點移到下一行
|
95
|
+
app = c + app
|
96
|
+
break
|
97
|
+
else
|
98
|
+
app = c + app
|
99
|
+
end
|
100
|
+
end
|
101
|
+
r += chars.join.gsub(/\t/, '') + "\n"
|
102
|
+
i = app.size
|
103
|
+
end
|
104
|
+
end
|
105
|
+
r
|
106
|
+
end
|
107
|
+
|
64
108
|
def convert_all
|
65
109
|
Dir.foreach(@xml_root) { |c|
|
66
110
|
next unless c.match(/^[A-Z]$/)
|
@@ -83,11 +127,15 @@ class CBETA::P5aToText
|
|
83
127
|
end
|
84
128
|
|
85
129
|
def handle_byline(e)
|
86
|
-
traverse(e)
|
130
|
+
r = traverse(e)
|
131
|
+
r += @format=='app' ? "\t" : "\n"
|
132
|
+
r
|
87
133
|
end
|
88
134
|
|
89
135
|
def handle_cell(e)
|
90
|
-
traverse(e)
|
136
|
+
r = traverse(e)
|
137
|
+
r += @format=='app' ? "\t" : "\n"
|
138
|
+
r
|
91
139
|
end
|
92
140
|
|
93
141
|
def handle_collection(c)
|
@@ -109,11 +157,15 @@ class CBETA::P5aToText
|
|
109
157
|
end
|
110
158
|
|
111
159
|
def handle_docNumber(e)
|
112
|
-
traverse(e)
|
160
|
+
r = traverse(e)
|
161
|
+
r += @format == 'app' ? "\t" : "\n"
|
162
|
+
r
|
113
163
|
end
|
114
164
|
|
115
165
|
def handle_figure(e)
|
116
|
-
traverse(e)
|
166
|
+
r = traverse(e)
|
167
|
+
r += @format == 'app' ? "\t" : "\n"
|
168
|
+
r
|
117
169
|
end
|
118
170
|
|
119
171
|
def handle_g(e)
|
@@ -155,27 +207,37 @@ class CBETA::P5aToText
|
|
155
207
|
end
|
156
208
|
|
157
209
|
def handle_head(e)
|
158
|
-
traverse(e)
|
210
|
+
r = traverse(e)
|
211
|
+
r += @format == 'app' ? "\t" : "\n"
|
212
|
+
r
|
159
213
|
end
|
160
214
|
|
161
215
|
def handle_item(e)
|
162
|
-
traverse(e)
|
216
|
+
r = traverse(e)
|
217
|
+
r += @format == 'app' ? "\t" : "\n"
|
163
218
|
end
|
164
219
|
|
165
220
|
def handle_juan(e)
|
166
|
-
traverse(e)
|
221
|
+
r = traverse(e)
|
222
|
+
r += @format == 'app' ? "\t" : "\n"
|
223
|
+
r
|
167
224
|
end
|
168
225
|
|
169
226
|
def handle_l(e)
|
170
227
|
r = traverse(e)
|
171
|
-
|
172
|
-
r += "\n"
|
228
|
+
if @format == 'app'
|
229
|
+
r += "\t"
|
230
|
+
else
|
231
|
+
r += "\n" unless @lg_type == 'abnormal'
|
173
232
|
end
|
174
233
|
r
|
175
234
|
end
|
176
235
|
|
177
236
|
def handle_lb(e)
|
178
237
|
r = ''
|
238
|
+
if @format == 'app'
|
239
|
+
r += "\n#{e['n']}║"
|
240
|
+
end
|
179
241
|
unless @next_line_buf.empty?
|
180
242
|
r += @next_line_buf + "\n"
|
181
243
|
@next_line_buf = ''
|
@@ -197,7 +259,9 @@ class CBETA::P5aToText
|
|
197
259
|
end
|
198
260
|
|
199
261
|
def handle_list(e)
|
200
|
-
|
262
|
+
r = ''
|
263
|
+
r += "\n" unless @format == 'app'
|
264
|
+
r + traverse(e)
|
201
265
|
end
|
202
266
|
|
203
267
|
def handle_milestone(e)
|
@@ -247,6 +311,7 @@ class CBETA::P5aToText
|
|
247
311
|
when 'row' then handle_row(e)
|
248
312
|
when 'sic' then handle_sic(e)
|
249
313
|
when 'sg' then handle_sg(e)
|
314
|
+
when 'tt' then handle_tt(e)
|
250
315
|
when 't' then handle_t(e)
|
251
316
|
when 'table' then handle_table(e)
|
252
317
|
when 'teiHeader' then ''
|
@@ -264,7 +329,9 @@ class CBETA::P5aToText
|
|
264
329
|
end
|
265
330
|
|
266
331
|
def handle_p(e)
|
267
|
-
traverse(e)
|
332
|
+
r = traverse(e)
|
333
|
+
r += @format == 'app' ? "\t" : "\n"
|
334
|
+
r
|
268
335
|
end
|
269
336
|
|
270
337
|
def handle_rdg(e)
|
@@ -337,6 +404,9 @@ class CBETA::P5aToText
|
|
337
404
|
end
|
338
405
|
r = traverse(e)
|
339
406
|
|
407
|
+
# 不是雙行對照
|
408
|
+
return r if @tt_type == 'app'
|
409
|
+
|
340
410
|
# 處理雙行對照
|
341
411
|
i = e.xpath('../t').index(e)
|
342
412
|
case i
|
@@ -366,6 +436,11 @@ class CBETA::P5aToText
|
|
366
436
|
CGI.escapeHTML(r)
|
367
437
|
end
|
368
438
|
|
439
|
+
def handle_tt(e)
|
440
|
+
@tt_type = e['type']
|
441
|
+
traverse(e)
|
442
|
+
end
|
443
|
+
|
369
444
|
def handle_vol(vol)
|
370
445
|
puts "convert volumn: #{vol}"
|
371
446
|
|
@@ -427,13 +502,15 @@ class CBETA::P5aToText
|
|
427
502
|
node.remove
|
428
503
|
end
|
429
504
|
end
|
505
|
+
text = frag.content
|
506
|
+
text = appify(text) if @format == 'app'
|
430
507
|
|
431
508
|
folder = File.join(@out_sutra, ed)
|
432
509
|
FileUtils.makedirs(folder)
|
433
510
|
|
434
511
|
fn = "#{@sutra_no}_%03d.txt" % juan_no
|
435
512
|
output_path = File.join(folder, fn)
|
436
|
-
File.write(output_path, frag.content)
|
513
|
+
File.write(output_path, text)
|
437
514
|
end
|
438
515
|
end
|
439
516
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-07-
|
11
|
+
date: 2015-07-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|