cbeta 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/cbeta.rb +0 -1
  3. data/lib/cbeta/p5a_to_text.rb +90 -13
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 885182f24b433d1627754e78f5cd2cc21729122e
4
- data.tar.gz: e02e81586409357be195a85e195e5f3db289a9d4
3
+ metadata.gz: f10484530e1c47830c305dfd75004aeae5fa24e5
4
+ data.tar.gz: e634ef5b83ce81564c1c9e916370a1b024a8a179
5
5
  SHA512:
6
- metadata.gz: 1b589734bd657cce12e4f20cf1a66b8650cc09a788b2bed3dcef623ffb4a170c82c7e96b2c0211ce0bea543025583cb212fb1dcd779a3087488080ca7f9c4874
7
- data.tar.gz: a839f8c28d5615832edebe4c7a87dd2ce064d0b9fd0ebdf5e8ca0f6211630697b2f4d0f0a2d18dc1305393edc2b531ef303ab6585c3541ed89c6e8c2f92b903f
6
+ metadata.gz: a498957ee9d982b5bde85ddd44a1671297c18acad54ffe2c1b9c48eb196f70a1916d2a7d769ec9c0b7a8a8b65c77f43d60e888df3dbb35fe5236af21eb748574
7
+ data.tar.gz: 173d4d21a6d36088ff57448e2da79b7a9f1bd5a6d5813cd23e06aabacfc39c83637cdb3248bc20b3aa15147b65fe726bff011f58a6a109b8ca2edcd1b360e939
data/lib/cbeta.rb CHANGED
@@ -6,7 +6,6 @@
6
6
  require 'csv'
7
7
 
8
8
  class CBETA
9
-
10
9
  # 將行首資訊轉為引用格式
11
10
  #
12
11
  # @param linehead [String] 行首資訊, 例如:T85n2838_p1291a03
data/lib/cbeta/p5a_to_text.rb CHANGED
@@ -8,13 +8,21 @@ require 'set'
8
8
  # Convert CBETA XML P5a to Text
9
9
  #
10
10
  # CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
11
+ #
12
+ # @example Convert 大正藏第一冊 (Taishō Tripiṭaka, volume 1) to the app format:
13
+ #
14
+ # c = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', 'app')
15
+ # c.convert('T01')
16
+ #
11
17
  class CBETA::P5aToText
12
18
 
13
19
  # @param xml_root [String] 來源 CBETA XML P5a 路徑
14
20
  # @param output_root [String] 輸出 Text 路徑
15
- def initialize(xml_root, output_root)
21
+ # @param format [String] 輸出格式,例:'app'
22
+ def initialize(xml_root, output_root, format=nil)
16
23
  @xml_root = xml_root
17
24
  @output_root = output_root
25
+ @format = format
18
26
  @cbeta = CBETA.new
19
27
  @gaijis = CBETA::Gaiji.new
20
28
 
@@ -61,6 +69,42 @@ class CBETA::P5aToText
61
69
 
62
70
  private
63
71
 
72
+ # 跨行字詞移到下一行
73
+ def appify(text)
74
+ r = ''
75
+ i = 0
76
+ app = ''
77
+ text.each_line do |line|
78
+ line.chomp!
79
+ if line.match(/^(.*)║(.*)$/)
80
+ r += $1
81
+ t = $2
82
+ r += "(%02d)" % i
83
+ r += "║#{app}"
84
+ app = ''
85
+ i = 0
86
+ chars = t.chars
87
+ until chars.empty?
88
+ c = chars.pop
89
+ if c == "\t"
90
+ break
91
+ elsif '  :》」』、;,!?。'.include? c
92
+ chars << c
93
+ break
94
+ elsif '《「『'.include? c # 這些標點移到下一行
95
+ app = c + app
96
+ break
97
+ else
98
+ app = c + app
99
+ end
100
+ end
101
+ r += chars.join.gsub(/\t/, '') + "\n"
102
+ i = app.size
103
+ end
104
+ end
105
+ r
106
+ end
107
+
64
108
  def convert_all
65
109
  Dir.foreach(@xml_root) { |c|
66
110
  next unless c.match(/^[A-Z]$/)
@@ -83,11 +127,15 @@ class CBETA::P5aToText
83
127
  end
84
128
 
85
129
  def handle_byline(e)
86
- traverse(e) + "\n"
130
+ r = traverse(e)
131
+ r += @format=='app' ? "\t" : "\n"
132
+ r
87
133
  end
88
134
 
89
135
  def handle_cell(e)
90
- traverse(e) + "\n"
136
+ r = traverse(e)
137
+ r += @format=='app' ? "\t" : "\n"
138
+ r
91
139
  end
92
140
 
93
141
  def handle_collection(c)
@@ -109,11 +157,15 @@ class CBETA::P5aToText
109
157
  end
110
158
 
111
159
  def handle_docNumber(e)
112
- traverse(e) + "\n"
160
+ r = traverse(e)
161
+ r += @format == 'app' ? "\t" : "\n"
162
+ r
113
163
  end
114
164
 
115
165
  def handle_figure(e)
116
- traverse(e) + "\n"
166
+ r = traverse(e)
167
+ r += @format == 'app' ? "\t" : "\n"
168
+ r
117
169
  end
118
170
 
119
171
  def handle_g(e)
@@ -155,27 +207,37 @@ class CBETA::P5aToText
155
207
  end
156
208
 
157
209
  def handle_head(e)
158
- traverse(e) + "\n"
210
+ r = traverse(e)
211
+ r += @format == 'app' ? "\t" : "\n"
212
+ r
159
213
  end
160
214
 
161
215
  def handle_item(e)
162
- traverse(e) + "\n"
216
+ r = traverse(e)
217
+ r += @format == 'app' ? "\t" : "\n"
163
218
  end
164
219
 
165
220
  def handle_juan(e)
166
- traverse(e) + "\n"
221
+ r = traverse(e)
222
+ r += @format == 'app' ? "\t" : "\n"
223
+ r
167
224
  end
168
225
 
169
226
  def handle_l(e)
170
227
  r = traverse(e)
171
- unless @lg_type == 'abnormal'
172
- r += "\n"
228
+ if @format == 'app'
229
+ r += "\t"
230
+ else
231
+ r += "\n" unless @lg_type == 'abnormal'
173
232
  end
174
233
  r
175
234
  end
176
235
 
177
236
  def handle_lb(e)
178
237
  r = ''
238
+ if @format == 'app'
239
+ r += "\n#{e['n']}║"
240
+ end
179
241
  unless @next_line_buf.empty?
180
242
  r += @next_line_buf + "\n"
181
243
  @next_line_buf = ''
@@ -197,7 +259,9 @@ class CBETA::P5aToText
197
259
  end
198
260
 
199
261
  def handle_list(e)
200
- "\n" + traverse(e)
262
+ r = ''
263
+ r += "\n" unless @format == 'app'
264
+ r + traverse(e)
201
265
  end
202
266
 
203
267
  def handle_milestone(e)
@@ -247,6 +311,7 @@ class CBETA::P5aToText
247
311
  when 'row' then handle_row(e)
248
312
  when 'sic' then handle_sic(e)
249
313
  when 'sg' then handle_sg(e)
314
+ when 'tt' then handle_tt(e)
250
315
  when 't' then handle_t(e)
251
316
  when 'table' then handle_table(e)
252
317
  when 'teiHeader' then ''
@@ -264,7 +329,9 @@ class CBETA::P5aToText
264
329
  end
265
330
 
266
331
  def handle_p(e)
267
- traverse(e) + "\n"
332
+ r = traverse(e)
333
+ r += @format == 'app' ? "\t" : "\n"
334
+ r
268
335
  end
269
336
 
270
337
  def handle_rdg(e)
@@ -337,6 +404,9 @@ class CBETA::P5aToText
337
404
  end
338
405
  r = traverse(e)
339
406
 
407
+ # 不是雙行對照
408
+ return r if @tt_type == 'app'
409
+
340
410
  # 處理雙行對照
341
411
  i = e.xpath('../t').index(e)
342
412
  case i
@@ -366,6 +436,11 @@ class CBETA::P5aToText
366
436
  CGI.escapeHTML(r)
367
437
  end
368
438
 
439
+ def handle_tt(e)
440
+ @tt_type = e['type']
441
+ traverse(e)
442
+ end
443
+
369
444
  def handle_vol(vol)
370
445
  puts "convert volumn: #{vol}"
371
446
 
@@ -427,13 +502,15 @@ class CBETA::P5aToText
427
502
  node.remove
428
503
  end
429
504
  end
505
+ text = frag.content
506
+ text = appify(text) if @format == 'app'
430
507
 
431
508
  folder = File.join(@out_sutra, ed)
432
509
  FileUtils.makedirs(folder)
433
510
 
434
511
  fn = "#{@sutra_no}_%03d.txt" % juan_no
435
512
  output_path = File.join(folder, fn)
436
- File.write(output_path, frag.content)
513
+ File.write(output_path, text)
437
514
  end
438
515
  end
439
516
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-07-13 00:00:00.000000000 Z
11
+ date: 2015-07-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
14
14
  email: zhoubx@gmail.com