cbeta 3.1.3 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta/gaiji.rb +2 -0
- data/lib/cbeta/xml_document.rb +300 -0
- data/lib/cbeta.rb +1 -0
- data/lib/data/cbeta_gaiji.json +12 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 982e4d10689bc1fa6f71d8c27e880cbfdcf248c5cbdb8994ad65bf6d2c5e5259
|
4
|
+
data.tar.gz: 7c6cfa28c5f48f1f84bb4ad2921e45ac1db1f2039626999a1fd620bbe91c9564
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b1ac00d3090b9a19df3e3d25d4cf1c2dc4f50a37e6de3cbbdf4b09a9cd6cb68b1fa7700e825cad8352628f7a9bab8ed34a916abbd4d8380a68d5783e6aef8513
|
7
|
+
data.tar.gz: b1a6f2ebb218737938ae7bc35eb5bc648cc544c8ac964647054335bf15f218e17db7a337e8b529060eccd69094872df6725ebed18b0e7cb0681253f3eccdf23d
|
data/lib/cbeta/gaiji.rb
CHANGED
data/lib/cbeta/xml_document.rb
ADDED
@@ -0,0 +1,300 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
class CBETA::XMLDocument
|
4
|
+
PASS = %w(back graphic mulu rdg sic teiHeader)
|
5
|
+
|
6
|
+
def initialize(string_or_io)
|
7
|
+
@doc = Nokogiri::XML(string_or_io)
|
8
|
+
@doc.remove_namespaces!
|
9
|
+
@gaiji = CBETA::Gaiji.new
|
10
|
+
end
|
11
|
+
|
12
|
+
def to_text
|
13
|
+
@format = 'text'
|
14
|
+
@gaiji_norm = [true]
|
15
|
+
@next_line_buf = ''
|
16
|
+
traverse(@doc.root)
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def e_anchor(e)
|
22
|
+
if e.has_attribute?('type')
|
23
|
+
if e['type'] == 'circle'
|
24
|
+
return '◎'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
''
|
29
|
+
end
|
30
|
+
|
31
|
+
def e_app(e)
|
32
|
+
traverse(e)
|
33
|
+
end
|
34
|
+
|
35
|
+
def e_body(e)
|
36
|
+
traverse(e)
|
37
|
+
end
|
38
|
+
|
39
|
+
def e_byline(e)
|
40
|
+
traverse(e) + "\n"
|
41
|
+
end
|
42
|
+
|
43
|
+
def e_caesura(e)
|
44
|
+
' '
|
45
|
+
end
|
46
|
+
|
47
|
+
def e_cell(e)
|
48
|
+
traverse(e) + "\n"
|
49
|
+
end
|
50
|
+
|
51
|
+
def e_corr(e)
|
52
|
+
traverse(e)
|
53
|
+
end
|
54
|
+
|
55
|
+
def e_date(e)
|
56
|
+
traverse(e)
|
57
|
+
end
|
58
|
+
|
59
|
+
def e_dialog(e)
|
60
|
+
traverse(e)
|
61
|
+
end
|
62
|
+
|
63
|
+
def e_div(e)
|
64
|
+
traverse(e)
|
65
|
+
end
|
66
|
+
|
67
|
+
def e_docNumber(e)
|
68
|
+
traverse(e) + "\n"
|
69
|
+
end
|
70
|
+
|
71
|
+
def e_event(e)
|
72
|
+
traverse(e) + "\n"
|
73
|
+
end
|
74
|
+
|
75
|
+
def e_figure(e)
|
76
|
+
traverse(e) + "\n"
|
77
|
+
end
|
78
|
+
|
79
|
+
def e_foreign(e)
|
80
|
+
return '' if e.key?('place') and e['place'].include?('foot')
|
81
|
+
traverse(e)
|
82
|
+
end
|
83
|
+
|
84
|
+
def e_g(e)
|
85
|
+
if @gaiji_norm.last
|
86
|
+
cb_priority = %w(uni_char norm_uni_char norm_big5_char composition)
|
87
|
+
else
|
88
|
+
cb_priority = %w(uni_char composition)
|
89
|
+
end
|
90
|
+
|
91
|
+
gid = e['ref'][1..-1]
|
92
|
+
r = @gaiji.to_s(gid, cb_priority:)
|
93
|
+
abort "Line:#{__LINE__} 缺字處理失敗:#{gid}" if r.nil?
|
94
|
+
r
|
95
|
+
end
|
96
|
+
|
97
|
+
def e_head(e)
|
98
|
+
traverse(e) + "\n"
|
99
|
+
end
|
100
|
+
|
101
|
+
def e_hi(e)
|
102
|
+
traverse(e)
|
103
|
+
end
|
104
|
+
|
105
|
+
def e_item(e)
|
106
|
+
r = "\n"
|
107
|
+
|
108
|
+
list_level = e.xpath('ancestor::list').size
|
109
|
+
r << ' ' * (list_level - 1)
|
110
|
+
r << traverse(e)
|
111
|
+
if e.key? 'n'
|
112
|
+
r = e['n'] + r
|
113
|
+
end
|
114
|
+
r
|
115
|
+
end
|
116
|
+
|
117
|
+
def e_jhead(e)
|
118
|
+
traverse(e)
|
119
|
+
end
|
120
|
+
|
121
|
+
def e_juan(e)
|
122
|
+
traverse(e) + "\n"
|
123
|
+
end
|
124
|
+
|
125
|
+
def e_l(e)
|
126
|
+
r = traverse(e)
|
127
|
+
r << "\n" unless @lg_type == 'abnormal'
|
128
|
+
r
|
129
|
+
end
|
130
|
+
|
131
|
+
def e_lb(e)
|
132
|
+
return '' if e['type']=='old'
|
133
|
+
r = ''
|
134
|
+
r << "\n" if @p_type == 'pre'
|
135
|
+
unless @next_line_buf.empty?
|
136
|
+
r << @next_line_buf + "\n"
|
137
|
+
@next_line_buf = ''
|
138
|
+
end
|
139
|
+
r
|
140
|
+
end
|
141
|
+
|
142
|
+
def e_lem(e)
|
143
|
+
traverse(e)
|
144
|
+
end
|
145
|
+
|
146
|
+
def e_lg(e)
|
147
|
+
traverse(e)
|
148
|
+
end
|
149
|
+
|
150
|
+
def e_list(e)
|
151
|
+
r = traverse(e)
|
152
|
+
r << "\n\n" unless e.parent.name == 'item'
|
153
|
+
r
|
154
|
+
end
|
155
|
+
|
156
|
+
def e_milestone(e)
|
157
|
+
''
|
158
|
+
end
|
159
|
+
|
160
|
+
def e_note(e)
|
161
|
+
if e.has_attribute?('place')
|
162
|
+
if "inline inline2 interlinear".include?(e['place'])
|
163
|
+
r = traverse(e)
|
164
|
+
return "(#{r})"
|
165
|
+
end
|
166
|
+
end
|
167
|
+
''
|
168
|
+
end
|
169
|
+
|
170
|
+
def e_p(e)
|
171
|
+
@p_type = e['type']
|
172
|
+
r = traverse(e) + "\n"
|
173
|
+
@p_type = nil
|
174
|
+
r
|
175
|
+
end
|
176
|
+
|
177
|
+
def e_pb(e)
|
178
|
+
''
|
179
|
+
end
|
180
|
+
|
181
|
+
def e_reg(e)
|
182
|
+
r = ''
|
183
|
+
choice = e.at_xpath('ancestor::choice')
|
184
|
+
r = traverse(e) if choice.nil?
|
185
|
+
r
|
186
|
+
end
|
187
|
+
|
188
|
+
def e_row(e)
|
189
|
+
traverse(e) + "\n"
|
190
|
+
end
|
191
|
+
|
192
|
+
def e_sg(e)
|
193
|
+
'(' + traverse(e) + ')'
|
194
|
+
end
|
195
|
+
|
196
|
+
# speech
|
197
|
+
def e_sp(e)
|
198
|
+
traverse(e)
|
199
|
+
end
|
200
|
+
|
201
|
+
def e_space(e)
|
202
|
+
return '' if e['quantity']=='0'
|
203
|
+
' ' * e['quantity'].to_i
|
204
|
+
end
|
205
|
+
|
206
|
+
def e_t(e)
|
207
|
+
if e.has_attribute? 'place'
|
208
|
+
return '' if e['place'].include? 'foot'
|
209
|
+
end
|
210
|
+
r = traverse(e)
|
211
|
+
|
212
|
+
# 如果不是雙行對照
|
213
|
+
tt = e.at_xpath('ancestor::tt')
|
214
|
+
unless tt.nil?
|
215
|
+
return r if %w(app single-line).include? tt['type']
|
216
|
+
return r if tt['place'] == 'inline'
|
217
|
+
return r if tt['rend'] == 'normal'
|
218
|
+
end
|
219
|
+
|
220
|
+
# 處理雙行對照
|
221
|
+
i = e.xpath('../t').index(e)
|
222
|
+
case i
|
223
|
+
when 0
|
224
|
+
return r + ' '
|
225
|
+
when 1
|
226
|
+
@next_line_buf << r + ' '
|
227
|
+
return ''
|
228
|
+
else
|
229
|
+
return r
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
def e_table(e)
|
234
|
+
traverse(e) + "\n"
|
235
|
+
end
|
236
|
+
|
237
|
+
def e_term(e)
|
238
|
+
norm = true
|
239
|
+
if e['behaviour'] == "no-norm"
|
240
|
+
norm = false
|
241
|
+
end
|
242
|
+
@gaiji_norm.push norm
|
243
|
+
r = traverse(e)
|
244
|
+
@gaiji_norm.pop
|
245
|
+
r
|
246
|
+
end
|
247
|
+
|
248
|
+
def e_text(e)
|
249
|
+
norm = true
|
250
|
+
if e['behaviour'] == "no-norm"
|
251
|
+
norm = false
|
252
|
+
end
|
253
|
+
@gaiji_norm.push norm
|
254
|
+
r = traverse(e)
|
255
|
+
@gaiji_norm.pop
|
256
|
+
r
|
257
|
+
end
|
258
|
+
|
259
|
+
def e_tt(e)
|
260
|
+
traverse(e)
|
261
|
+
end
|
262
|
+
|
263
|
+
def e_unclear(e)
|
264
|
+
r = traverse(e)
|
265
|
+
r = '▆' if r.empty?
|
266
|
+
r
|
267
|
+
end
|
268
|
+
|
269
|
+
|
270
|
+
def handle_node(e)
|
271
|
+
return '' if e.comment?
|
272
|
+
return handle_text(e) if e.text?
|
273
|
+
return '' if PASS.include?(e.name)
|
274
|
+
send("e_#{e.name}", e)
|
275
|
+
end
|
276
|
+
|
277
|
+
def handle_text(e)
|
278
|
+
s = e.content().chomp
|
279
|
+
return '' if s.empty?
|
280
|
+
return '' if e.parent.name == 'app'
|
281
|
+
|
282
|
+
# cbeta xml 文字之間會有多餘的換行
|
283
|
+
r = s.gsub(/[\n\r]/, '')
|
284
|
+
|
285
|
+
if @format == 'html'
|
286
|
+
r = CGI.escapeHTML(r) # 把 & 轉為 &amp;
|
287
|
+
end
|
288
|
+
|
289
|
+
r
|
290
|
+
end
|
291
|
+
|
292
|
+
def traverse(e)
|
293
|
+
r = ''
|
294
|
+
e.children.each do |c|
|
295
|
+
r << handle_node(c)
|
296
|
+
end
|
297
|
+
r
|
298
|
+
end
|
299
|
+
|
300
|
+
end
|
data/lib/cbeta.rb
CHANGED
data/lib/data/cbeta_gaiji.json
CHANGED
@@ -174568,5 +174568,17 @@
|
|
174568
174568
|
"composition": "[弓*并]",
|
174569
174569
|
"moe_variant_id": "C03427",
|
174570
174570
|
"pua": "U+F87C6"
|
174571
|
+
},
|
174572
|
+
"CB34759": {
|
174573
|
+
"composition": "[△@▲]",
|
174574
|
+
"pua": "U+F87C7"
|
174575
|
+
},
|
174576
|
+
"CB34760": {
|
174577
|
+
"unicode": "20B4F",
|
174578
|
+
"uni_char": "𠭏",
|
174579
|
+
"composition": "[山/〦/中/又]",
|
174580
|
+
"norm_big5_char": "事",
|
174581
|
+
"moe_variant_id": "A00048-003",
|
174582
|
+
"pua": "U+F87C8"
|
174571
174583
|
}
|
174572
174584
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.3
|
4
|
+
version: 3.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-03-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
|
14
14
|
email: zhoubx@gmail.com
|
@@ -31,6 +31,7 @@ files:
|
|
31
31
|
- lib/cbeta/p5a_to_simple_html.rb
|
32
32
|
- lib/cbeta/p5a_to_text.rb
|
33
33
|
- lib/cbeta/p5a_validator.rb
|
34
|
+
- lib/cbeta/xml_document.rb
|
34
35
|
- lib/data/canons.csv
|
35
36
|
- lib/data/categories.json
|
36
37
|
- lib/data/cbeta_gaiji.json
|
@@ -57,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
57
58
|
- !ruby/object:Gem::Version
|
58
59
|
version: '0'
|
59
60
|
requirements: []
|
60
|
-
rubygems_version: 3.
|
61
|
+
rubygems_version: 3.5.6
|
61
62
|
signing_key:
|
62
63
|
specification_version: 4
|
63
64
|
summary: CBETA Tools
|