cbeta 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +0 -1
- data/lib/cbeta/p5a_to_text.rb +90 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f10484530e1c47830c305dfd75004aeae5fa24e5
|
4
|
+
data.tar.gz: e634ef5b83ce81564c1c9e916370a1b024a8a179
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a498957ee9d982b5bde85ddd44a1671297c18acad54ffe2c1b9c48eb196f70a1916d2a7d769ec9c0b7a8a8b65c77f43d60e888df3dbb35fe5236af21eb748574
|
7
|
+
data.tar.gz: 173d4d21a6d36088ff57448e2da79b7a9f1bd5a6d5813cd23e06aabacfc39c83637cdb3248bc20b3aa15147b65fe726bff011f58a6a109b8ca2edcd1b360e939
|
data/lib/cbeta.rb
CHANGED
data/lib/cbeta/p5a_to_text.rb
CHANGED
@@ -8,13 +8,21 @@ require 'set'
|
|
8
8
|
# Convert CBETA XML P5a to Text
|
9
9
|
#
|
10
10
|
# CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
|
11
|
+
#
|
12
|
+
# @example for convert 大正藏第一冊 in app format:
|
13
|
+
#
|
14
|
+
# c = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', 'app')
|
15
|
+
# c.convert('T01')
|
16
|
+
#
|
11
17
|
class CBETA::P5aToText
|
12
18
|
|
13
19
|
# @param xml_root [String] 來源 CBETA XML P5a 路徑
|
14
20
|
# @param output_root [String] 輸出 Text 路徑
|
15
|
-
|
21
|
+
# @param format [String] 輸出格式,例:'app'
|
22
|
+
def initialize(xml_root, output_root, format=nil)
|
16
23
|
@xml_root = xml_root
|
17
24
|
@output_root = output_root
|
25
|
+
@format = format
|
18
26
|
@cbeta = CBETA.new
|
19
27
|
@gaijis = CBETA::Gaiji.new
|
20
28
|
|
@@ -61,6 +69,42 @@ class CBETA::P5aToText
|
|
61
69
|
|
62
70
|
private
|
63
71
|
|
72
|
+
# 跨行字詞移到下一行
|
73
|
+
def appify(text)
|
74
|
+
r = ''
|
75
|
+
i = 0
|
76
|
+
app = ''
|
77
|
+
text.each_line do |line|
|
78
|
+
line.chomp!
|
79
|
+
if line.match(/^(.*)║(.*)$/)
|
80
|
+
r += $1
|
81
|
+
t = $2
|
82
|
+
r += "(%02d)" % i
|
83
|
+
r += "║#{app}"
|
84
|
+
app = ''
|
85
|
+
i = 0
|
86
|
+
chars = t.chars
|
87
|
+
until chars.empty?
|
88
|
+
c = chars.pop
|
89
|
+
if c == "\t"
|
90
|
+
break
|
91
|
+
elsif ' :》」』、;,!?。'.include? c
|
92
|
+
chars << c
|
93
|
+
break
|
94
|
+
elsif '《「『'.include? c # 這些標點移到下一行
|
95
|
+
app = c + app
|
96
|
+
break
|
97
|
+
else
|
98
|
+
app = c + app
|
99
|
+
end
|
100
|
+
end
|
101
|
+
r += chars.join.gsub(/\t/, '') + "\n"
|
102
|
+
i = app.size
|
103
|
+
end
|
104
|
+
end
|
105
|
+
r
|
106
|
+
end
|
107
|
+
|
64
108
|
def convert_all
|
65
109
|
Dir.foreach(@xml_root) { |c|
|
66
110
|
next unless c.match(/^[A-Z]$/)
|
@@ -83,11 +127,15 @@ class CBETA::P5aToText
|
|
83
127
|
end
|
84
128
|
|
85
129
|
def handle_byline(e)
|
86
|
-
traverse(e)
|
130
|
+
r = traverse(e)
|
131
|
+
r += @format=='app' ? "\t" : "\n"
|
132
|
+
r
|
87
133
|
end
|
88
134
|
|
89
135
|
def handle_cell(e)
|
90
|
-
traverse(e)
|
136
|
+
r = traverse(e)
|
137
|
+
r += @format=='app' ? "\t" : "\n"
|
138
|
+
r
|
91
139
|
end
|
92
140
|
|
93
141
|
def handle_collection(c)
|
@@ -109,11 +157,15 @@ class CBETA::P5aToText
|
|
109
157
|
end
|
110
158
|
|
111
159
|
def handle_docNumber(e)
|
112
|
-
traverse(e)
|
160
|
+
r = traverse(e)
|
161
|
+
r += @format == 'app' ? "\t" : "\n"
|
162
|
+
r
|
113
163
|
end
|
114
164
|
|
115
165
|
def handle_figure(e)
|
116
|
-
traverse(e)
|
166
|
+
r = traverse(e)
|
167
|
+
r += @format == 'app' ? "\t" : "\n"
|
168
|
+
r
|
117
169
|
end
|
118
170
|
|
119
171
|
def handle_g(e)
|
@@ -155,27 +207,37 @@ class CBETA::P5aToText
|
|
155
207
|
end
|
156
208
|
|
157
209
|
def handle_head(e)
|
158
|
-
traverse(e)
|
210
|
+
r = traverse(e)
|
211
|
+
r += @format == 'app' ? "\t" : "\n"
|
212
|
+
r
|
159
213
|
end
|
160
214
|
|
161
215
|
def handle_item(e)
|
162
|
-
traverse(e)
|
216
|
+
r = traverse(e)
|
217
|
+
r += @format == 'app' ? "\t" : "\n"
|
163
218
|
end
|
164
219
|
|
165
220
|
def handle_juan(e)
|
166
|
-
traverse(e)
|
221
|
+
r = traverse(e)
|
222
|
+
r += @format == 'app' ? "\t" : "\n"
|
223
|
+
r
|
167
224
|
end
|
168
225
|
|
169
226
|
def handle_l(e)
|
170
227
|
r = traverse(e)
|
171
|
-
|
172
|
-
r += "\
|
228
|
+
if @format == 'app'
|
229
|
+
r += "\t"
|
230
|
+
else
|
231
|
+
r += "\n" unless @lg_type == 'abnormal'
|
173
232
|
end
|
174
233
|
r
|
175
234
|
end
|
176
235
|
|
177
236
|
def handle_lb(e)
|
178
237
|
r = ''
|
238
|
+
if @format == 'app'
|
239
|
+
r += "\n#{e['n']}║"
|
240
|
+
end
|
179
241
|
unless @next_line_buf.empty?
|
180
242
|
r += @next_line_buf + "\n"
|
181
243
|
@next_line_buf = ''
|
@@ -197,7 +259,9 @@ class CBETA::P5aToText
|
|
197
259
|
end
|
198
260
|
|
199
261
|
def handle_list(e)
|
200
|
-
|
262
|
+
r = ''
|
263
|
+
r += "\n" unless @format == 'app'
|
264
|
+
r + traverse(e)
|
201
265
|
end
|
202
266
|
|
203
267
|
def handle_milestone(e)
|
@@ -247,6 +311,7 @@ class CBETA::P5aToText
|
|
247
311
|
when 'row' then handle_row(e)
|
248
312
|
when 'sic' then handle_sic(e)
|
249
313
|
when 'sg' then handle_sg(e)
|
314
|
+
when 'tt' then handle_tt(e)
|
250
315
|
when 't' then handle_t(e)
|
251
316
|
when 'table' then handle_table(e)
|
252
317
|
when 'teiHeader' then ''
|
@@ -264,7 +329,9 @@ class CBETA::P5aToText
|
|
264
329
|
end
|
265
330
|
|
266
331
|
def handle_p(e)
|
267
|
-
traverse(e)
|
332
|
+
r = traverse(e)
|
333
|
+
r += @format == 'app' ? "\t" : "\n"
|
334
|
+
r
|
268
335
|
end
|
269
336
|
|
270
337
|
def handle_rdg(e)
|
@@ -337,6 +404,9 @@ class CBETA::P5aToText
|
|
337
404
|
end
|
338
405
|
r = traverse(e)
|
339
406
|
|
407
|
+
# 不是雙行對照
|
408
|
+
return r if @tt_type == 'app'
|
409
|
+
|
340
410
|
# 處理雙行對照
|
341
411
|
i = e.xpath('../t').index(e)
|
342
412
|
case i
|
@@ -366,6 +436,11 @@ class CBETA::P5aToText
|
|
366
436
|
CGI.escapeHTML(r)
|
367
437
|
end
|
368
438
|
|
439
|
+
def handle_tt(e)
|
440
|
+
@tt_type = e['type']
|
441
|
+
traverse(e)
|
442
|
+
end
|
443
|
+
|
369
444
|
def handle_vol(vol)
|
370
445
|
puts "convert volumn: #{vol}"
|
371
446
|
|
@@ -427,13 +502,15 @@ class CBETA::P5aToText
|
|
427
502
|
node.remove
|
428
503
|
end
|
429
504
|
end
|
505
|
+
text = frag.content
|
506
|
+
text = appify(text) if @format == 'app'
|
430
507
|
|
431
508
|
folder = File.join(@out_sutra, ed)
|
432
509
|
FileUtils.makedirs(folder)
|
433
510
|
|
434
511
|
fn = "#{@sutra_no}_%03d.txt" % juan_no
|
435
512
|
output_path = File.join(folder, fn)
|
436
|
-
File.write(output_path,
|
513
|
+
File.write(output_path, text)
|
437
514
|
end
|
438
515
|
end
|
439
516
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-07-
|
11
|
+
date: 2015-07-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|