cbeta 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/cbeta.rb +0 -1
  3. data/lib/cbeta/p5a_to_text.rb +90 -13
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 885182f24b433d1627754e78f5cd2cc21729122e
4
- data.tar.gz: e02e81586409357be195a85e195e5f3db289a9d4
3
+ metadata.gz: f10484530e1c47830c305dfd75004aeae5fa24e5
4
+ data.tar.gz: e634ef5b83ce81564c1c9e916370a1b024a8a179
5
5
  SHA512:
6
- metadata.gz: 1b589734bd657cce12e4f20cf1a66b8650cc09a788b2bed3dcef623ffb4a170c82c7e96b2c0211ce0bea543025583cb212fb1dcd779a3087488080ca7f9c4874
7
- data.tar.gz: a839f8c28d5615832edebe4c7a87dd2ce064d0b9fd0ebdf5e8ca0f6211630697b2f4d0f0a2d18dc1305393edc2b531ef303ab6585c3541ed89c6e8c2f92b903f
6
+ metadata.gz: a498957ee9d982b5bde85ddd44a1671297c18acad54ffe2c1b9c48eb196f70a1916d2a7d769ec9c0b7a8a8b65c77f43d60e888df3dbb35fe5236af21eb748574
7
+ data.tar.gz: 173d4d21a6d36088ff57448e2da79b7a9f1bd5a6d5813cd23e06aabacfc39c83637cdb3248bc20b3aa15147b65fe726bff011f58a6a109b8ca2edcd1b360e939
data/lib/cbeta.rb CHANGED
@@ -6,7 +6,6 @@
6
6
  require 'csv'
7
7
 
8
8
  class CBETA
9
-
10
9
  # 將行首資訊轉為引用格式
11
10
  #
12
11
  # @param linehead [String] 行首資訊, 例如:T85n2838_p1291a03
data/lib/cbeta/p5a_to_text.rb CHANGED
@@ -8,13 +8,21 @@ require 'set'
8
8
  # Convert CBETA XML P5a to Text
9
9
  #
10
10
  # CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
11
+ #
12
+ # @example for convert 大正藏第一冊 in app format:
13
+ #
14
+ # c = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', 'app')
15
+ # c.convert('T01')
16
+ #
11
17
  class CBETA::P5aToText
12
18
 
13
19
  # @param xml_root [String] 來源 CBETA XML P5a 路徑
14
20
  # @param output_root [String] 輸出 Text 路徑
15
- def initialize(xml_root, output_root)
21
+ # @param format [String] 輸出格式,例:'app'
22
+ def initialize(xml_root, output_root, format=nil)
16
23
  @xml_root = xml_root
17
24
  @output_root = output_root
25
+ @format = format
18
26
  @cbeta = CBETA.new
19
27
  @gaijis = CBETA::Gaiji.new
20
28
 
@@ -61,6 +69,42 @@ class CBETA::P5aToText
61
69
 
62
70
  private
63
71
 
72
+ # 跨行字詞移到下一行
73
+ def appify(text)
74
+ r = ''
75
+ i = 0
76
+ app = ''
77
+ text.each_line do |line|
78
+ line.chomp!
79
+ if line.match(/^(.*)║(.*)$/)
80
+ r += $1
81
+ t = $2
82
+ r += "(%02d)" % i
83
+ r += "║#{app}"
84
+ app = ''
85
+ i = 0
86
+ chars = t.chars
87
+ until chars.empty?
88
+ c = chars.pop
89
+ if c == "\t"
90
+ break
91
+ elsif '  :》」』、;,!?。'.include? c
92
+ chars << c
93
+ break
94
+ elsif '《「『'.include? c # 這些標點移到下一行
95
+ app = c + app
96
+ break
97
+ else
98
+ app = c + app
99
+ end
100
+ end
101
+ r += chars.join.gsub(/\t/, '') + "\n"
102
+ i = app.size
103
+ end
104
+ end
105
+ r
106
+ end
107
+
64
108
  def convert_all
65
109
  Dir.foreach(@xml_root) { |c|
66
110
  next unless c.match(/^[A-Z]$/)
@@ -83,11 +127,15 @@ class CBETA::P5aToText
83
127
  end
84
128
 
85
129
  def handle_byline(e)
86
- traverse(e) + "\n"
130
+ r = traverse(e)
131
+ r += @format=='app' ? "\t" : "\n"
132
+ r
87
133
  end
88
134
 
89
135
  def handle_cell(e)
90
- traverse(e) + "\n"
136
+ r = traverse(e)
137
+ r += @format=='app' ? "\t" : "\n"
138
+ r
91
139
  end
92
140
 
93
141
  def handle_collection(c)
@@ -109,11 +157,15 @@ class CBETA::P5aToText
109
157
  end
110
158
 
111
159
  def handle_docNumber(e)
112
- traverse(e) + "\n"
160
+ r = traverse(e)
161
+ r += @format == 'app' ? "\t" : "\n"
162
+ r
113
163
  end
114
164
 
115
165
  def handle_figure(e)
116
- traverse(e) + "\n"
166
+ r = traverse(e)
167
+ r += @format == 'app' ? "\t" : "\n"
168
+ r
117
169
  end
118
170
 
119
171
  def handle_g(e)
@@ -155,27 +207,37 @@ class CBETA::P5aToText
155
207
  end
156
208
 
157
209
  def handle_head(e)
158
- traverse(e) + "\n"
210
+ r = traverse(e)
211
+ r += @format == 'app' ? "\t" : "\n"
212
+ r
159
213
  end
160
214
 
161
215
  def handle_item(e)
162
- traverse(e) + "\n"
216
+ r = traverse(e)
217
+ r += @format == 'app' ? "\t" : "\n"
163
218
  end
164
219
 
165
220
  def handle_juan(e)
166
- traverse(e) + "\n"
221
+ r = traverse(e)
222
+ r += @format == 'app' ? "\t" : "\n"
223
+ r
167
224
  end
168
225
 
169
226
  def handle_l(e)
170
227
  r = traverse(e)
171
- unless @lg_type == 'abnormal'
172
- r += "\n"
228
+ if @format == 'app'
229
+ r += "\t"
230
+ else
231
+ r += "\n" unless @lg_type == 'abnormal'
173
232
  end
174
233
  r
175
234
  end
176
235
 
177
236
  def handle_lb(e)
178
237
  r = ''
238
+ if @format == 'app'
239
+ r += "\n#{e['n']}║"
240
+ end
179
241
  unless @next_line_buf.empty?
180
242
  r += @next_line_buf + "\n"
181
243
  @next_line_buf = ''
@@ -197,7 +259,9 @@ class CBETA::P5aToText
197
259
  end
198
260
 
199
261
  def handle_list(e)
200
- "\n" + traverse(e)
262
+ r = ''
263
+ r += "\n" unless @format == 'app'
264
+ r + traverse(e)
201
265
  end
202
266
 
203
267
  def handle_milestone(e)
@@ -247,6 +311,7 @@ class CBETA::P5aToText
247
311
  when 'row' then handle_row(e)
248
312
  when 'sic' then handle_sic(e)
249
313
  when 'sg' then handle_sg(e)
314
+ when 'tt' then handle_tt(e)
250
315
  when 't' then handle_t(e)
251
316
  when 'table' then handle_table(e)
252
317
  when 'teiHeader' then ''
@@ -264,7 +329,9 @@ class CBETA::P5aToText
264
329
  end
265
330
 
266
331
  def handle_p(e)
267
- traverse(e) + "\n"
332
+ r = traverse(e)
333
+ r += @format == 'app' ? "\t" : "\n"
334
+ r
268
335
  end
269
336
 
270
337
  def handle_rdg(e)
@@ -337,6 +404,9 @@ class CBETA::P5aToText
337
404
  end
338
405
  r = traverse(e)
339
406
 
407
+ # 不是雙行對照
408
+ return r if @tt_type == 'app'
409
+
340
410
  # 處理雙行對照
341
411
  i = e.xpath('../t').index(e)
342
412
  case i
@@ -366,6 +436,11 @@ class CBETA::P5aToText
366
436
  CGI.escapeHTML(r)
367
437
  end
368
438
 
439
+ def handle_tt(e)
440
+ @tt_type = e['type']
441
+ traverse(e)
442
+ end
443
+
369
444
  def handle_vol(vol)
370
445
  puts "convert volumn: #{vol}"
371
446
 
@@ -427,13 +502,15 @@ class CBETA::P5aToText
427
502
  node.remove
428
503
  end
429
504
  end
505
+ text = frag.content
506
+ text = appify(text) if @format == 'app'
430
507
 
431
508
  folder = File.join(@out_sutra, ed)
432
509
  FileUtils.makedirs(folder)
433
510
 
434
511
  fn = "#{@sutra_no}_%03d.txt" % juan_no
435
512
  output_path = File.join(folder, fn)
436
- File.write(output_path, frag.content)
513
+ File.write(output_path, text)
437
514
  end
438
515
  end
439
516
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-07-13 00:00:00.000000000 Z
11
+ date: 2015-07-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
14
14
  email: zhoubx@gmail.com