cbeta 3.5.4 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta/cbeta_share.rb +7 -1
- data/lib/cbeta/p5a_checker.rb +162 -0
- data/lib/cbeta.rb +1 -0
- data/lib/data/cbeta_gaiji.json +133 -4
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d57589aae785253515b3d2e73f7e5ee0f2267eb5cfd78ffc07a31cda853e0dc4
|
4
|
+
data.tar.gz: 636713370e0c81c2b25121be8aa58f4d959eb24c0297f64d6a6300431848b805
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de9137c36aad2f633b1b682d7b22a32d99d260b7e2d5a506bd404a2a13596098755884da9292dacf6692f9751b1744c9f8eccdd5aacc8670c66cadd0ab94a287
|
7
|
+
data.tar.gz: 8885756792aeb5e473fbf4ffa638c03ba330156a10e7f800464146c93b81defb9a7b8ed5103c6b9cee010c6714b3e361eb1ae0e685dd9f94fc29eb9f2dab1fdc
|
data/lib/cbeta/cbeta_share.rb
CHANGED
(the +7 -1 hunks for this file are missing from this capture)
data/lib/cbeta/p5a_checker.rb
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
require_relative 'cbeta_share'
|
2
|
+
|
3
|
+
# Checks CBETA XML P5a source files.
#
# Walks every canon / volume / file below the XML root, parses each file with
# Nokogiri and collects the problems it finds:
#
# * [E01] duplicated line numbers (`lb`)
# * [E02] text placed directly under a `div`
# * malformed `lb` line numbers
# * `g` references with no entry in the gaiji data
# * `graphic` elements whose image file does not exist
# * `lem` / `rdg` elements without a `wit` attribute
# * files containing U+200B (zero width space)
# * files that are not well-formed XML
#
# The report is printed to stdout, or written to the log file when one is given.
class CBETA::P5aChecker
  # Supplies each_canon (used by #check).
  include CbetaShare

  # Expected shape of an lb/@n value: one letter or digit, three digits,
  # one letter, then one or more digits. Anchored on the whole string.
  LB_N_FORMAT = /\A[a-z\d]\d{3}[a-z]\d+\z/.freeze

  # @param xml_root [String] path of the CBETA XML P5a source folder
  # @param figures [String] path of the figures folder
  #   (available from https://github.com/cbeta-git/CBR2X-figures);
  #   when nil, image-file checks are skipped
  # @param log [String] log file path; when nil, errors are printed to stdout
  def initialize(xml_root: nil, figures: nil, log: nil)
    @gaijis = CBETA::Gaiji.new
    @xml_root = xml_root
    @figures = figures
    @log = log
  end

  # Runs every check over the whole XML tree and reports the result.
  #
  # @return [nil]
  def check
    @errors = +''   # unary plus keeps the buffer mutable even under frozen string literals
    @g_errors = {}  # gaiji id => Set of file basenames that reference it
    puts "xml: #{@xml_root}"
    each_canon(@xml_root) do |c|
      @canon = c
      handle_canon(File.join(@xml_root, @canon))
    end

    # Missing gaiji are reported once per id, after all files have been read.
    @g_errors.keys.sort.each do |gid|
      files = @g_errors[gid].to_a.join(',')
      @errors << "#{gid} 無缺字資料,出現於:#{files}\n"
    end

    report
  end

  private

  # Prints the final summary, or writes the collected errors to the log file.
  def report
    if @errors.empty?
      puts "檢查完成,未發現錯誤。"
    elsif @log.nil?
      puts "\n發現錯誤:"
      puts @errors
    else
      File.write(@log, @errors)
      puts "\n發現錯誤,請查看 #{@log}"
    end
  end

  # Reports non-blank text whose parent element is a div.
  def chk_text(node)
    return if node.text.strip.empty?
    return unless node.parent.name == 'div'

    error "lb: #{@lb}, text: #{node.text.inspect}", type: "[E02] 文字直接出現在 div 下"
  end

  # Records a g element whose referenced gaiji id is unknown to @gaijis.
  def e_g(e)
    ref = e['ref']
    if ref.nil?
      # Report instead of crashing on nil[1..-1].
      error "g 缺少 ref 屬性, lb: #{@lb}"
      return
    end

    gid = ref[1..-1] # drop the first character of the reference (presumably '#')
    return if @gaijis.key?(gid)

    (@g_errors[gid] ||= Set.new) << @basename
  end

  # Reports a graphic whose image file is missing from the figures folder.
  def e_graphic(e)
    return if @figures.nil? # no figures folder configured: nothing to compare against

    src = e['url']
    if src.nil?
      error "graphic 缺少 url 屬性, lb: #{@lb}"
      return
    end

    url = File.basename(src)
    fn = File.join(@figures, @canon, url)
    error "圖檔 #{url} 不存在" unless File.exist?(fn)
  end

  # Validates an lb element: format of @n and uniqueness of edition + line number.
  def e_lb(e)
    return if e['type'] == 'old'

    n = e['n']
    # to_s so that a missing attribute is reported rather than raising.
    error "lb format error: #{n}" unless n.to_s.match?(LB_N_FORMAT)

    @lb = n
    ed = e['ed']
    ed_lb = "#{ed}#{@lb}"
    if @lbs.include?(ed_lb)
      # Editions whose id starts with 'R' are exempt from the duplicate check.
      unless ed.to_s.start_with?('R')
        error "lb: #{@lb}, ed: #{ed}", type: "[E01] 行號重複"
      end
    else
      @lbs << ed_lb
    end
  end

  # Reports a lem element without a wit attribute.
  def e_lem(e)
    error "lem 缺少 wit 屬性" unless e.key?('wit')
  end

  # Reports an rdg element without a wit attribute (cbetaRemark readings are exempt).
  def e_rdg(e)
    return if e['type'] == 'cbetaRemark'

    error "rdg 缺少 wit 屬性, lb: #{@lb}" unless e.key?('wit')
  end

  # Prints one error line immediately and appends it to the collected report.
  #
  # @param msg [String] error description
  # @param type [String, nil] optional error-code prefix
  def error(msg, type: nil)
    prefix = type.nil? ? '' : "#{type}: "
    line = "#{prefix}#{@basename}, #{msg}"
    puts line
    @errors << line << "\n"
  end

  # Sorted entries of a folder, without names that start with a dot.
  def visible_entries(folder)
    Dir.entries(folder).sort.reject { |f| f.start_with?('.') }
  end

  # Checks every volume folder of one canon; progress goes to stderr.
  def handle_canon(folder)
    visible_entries(folder).each do |f|
      @vol = f
      $stderr.puts @vol + ' '
      handle_vol(File.join(folder, @vol))
    end
  end

  # Checks a single XML file.
  def handle_file(fn)
    @basename = File.basename(fn)

    s = File.read(fn)
    if s.include? "\u200B"
      @errors << "#{@basename} 含有 U+200B Zero Width Space 字元\n"
    end

    doc = Nokogiri::XML(s)
    root = doc.root
    if doc.errors.empty? && !root.nil?
      doc.remove_namespaces!
      @lbs = Set.new # edition + line-number keys seen so far in this file
      traverse(root)
    else
      # Parse errors, or no root element at all (e.g. an empty file).
      @errors << "錯誤: #{@basename} not well-formed\n"
    end
  end

  # Dispatches an element to its checker; unlisted elements are descended into.
  def handle_node(e)
    case e.name
    when 'g'       then e_g(e)
    when 'graphic' then e_graphic(e)
    when 'lb'      then e_lb(e)
    when 'lem'     then e_lem(e)
    when 'rdg'     then e_rdg(e)
    else traverse(e)
    end
  end

  # Checks every file of one volume folder.
  def handle_vol(folder)
    visible_entries(folder).each do |f|
      handle_file(File.join(folder, f))
    end
  end

  # Visits the children of e: text nodes are checked, element nodes dispatched.
  # Other node kinds (comments, processing instructions, ...) are ignored.
  def traverse(e)
    e.children.each do |c|
      if c.text?
        chk_text(c)
      elsif c.element? # test the child, not the parent
        handle_node(c)
      end
    end
  end
end
|
data/lib/cbeta.rb
CHANGED
@@ -282,6 +282,7 @@ require 'cbeta/canon'
|
|
282
282
|
require 'cbeta/char_count'
|
283
283
|
require 'cbeta/char_freq'
|
284
284
|
require 'cbeta/html_to_pdf'
|
285
|
+
require 'cbeta/p5a_checker'
|
285
286
|
require 'cbeta/p5a_to_html'
|
286
287
|
require 'cbeta/p5a_to_html_for_every_edition'
|
287
288
|
require 'cbeta/p5a_to_html_for_pdf'
|
data/lib/data/cbeta_gaiji.json
CHANGED
@@ -122910,6 +122910,7 @@
|
|
122910
122910
|
},
|
122911
122911
|
"CB22704": {
|
122912
122912
|
"composition": "[諒-口+日]",
|
122913
|
+
"moe_variant_id": "A03845-001",
|
122913
122914
|
"pua": "U+F58B0"
|
122914
122915
|
},
|
122915
122916
|
"CB22705": {
|
@@ -123668,6 +123669,7 @@
|
|
123668
123669
|
"unicode": "2087D",
|
123669
123670
|
"uni_char": "𠡽",
|
123670
123671
|
"composition": "[(京-口+日)*力]",
|
123672
|
+
"moe_variant_id": "B00255-001",
|
123671
123673
|
"pua": "U+F596A"
|
123672
123674
|
},
|
123673
123675
|
"CB22891": {
|
@@ -136899,6 +136901,7 @@
|
|
136899
136901
|
"unicode": "22C4A",
|
136900
136902
|
"uni_char": "𢱊",
|
136901
136903
|
"composition": "[掠-口+日]",
|
136904
|
+
"moe_variant_id": "A01586-001",
|
136902
136905
|
"pua": "U+F6476"
|
136903
136906
|
},
|
136904
136907
|
"CB25719": {
|
@@ -140257,6 +140260,7 @@
|
|
140257
140260
|
"unicode": "7174",
|
140258
140261
|
"uni_char": "煴",
|
140259
140262
|
"composition": "[火*(日/皿)]",
|
140263
|
+
"moe_variant_id": "B02420-003",
|
140260
140264
|
"pua": "U+F6796"
|
140261
140265
|
},
|
140262
140266
|
"CB26519": {
|
@@ -140701,7 +140705,10 @@
|
|
140701
140705
|
"pua": "U+F680A"
|
140702
140706
|
},
|
140703
140707
|
"CB26635": {
|
140708
|
+
"unicode": "2907F",
|
140709
|
+
"uni_char": "𩁿",
|
140704
140710
|
"composition": "[雨/(匚@一)]",
|
140711
|
+
"moe_variant_id": "A03605-029-1",
|
140705
140712
|
"pua": "U+F680B"
|
140706
140713
|
},
|
140707
140714
|
"CB26636": {
|
@@ -142366,10 +142373,10 @@
|
|
142366
142373
|
"pua": "U+F69AF"
|
142367
142374
|
},
|
142368
142375
|
"CB27057": {
|
142369
|
-
"unicode": "
|
142370
|
-
"uni_char": "
|
142376
|
+
"unicode": "22A8A",
|
142377
|
+
"uni_char": "𢪊",
|
142371
142378
|
"composition": "[打-丁+(敲-高)]",
|
142372
|
-
"moe_variant_id": "
|
142379
|
+
"moe_variant_id": "C04127",
|
142373
142380
|
"pua": "U+F69B1"
|
142374
142381
|
},
|
142375
142382
|
"CB27058": {
|
@@ -145071,6 +145078,7 @@
|
|
145071
145078
|
},
|
145072
145079
|
"CB27738": {
|
145073
145080
|
"composition": "[打-丁+(孝-子+丁)]",
|
145081
|
+
"moe_variant_id": "A01563-001",
|
145074
145082
|
"pua": "U+F6C5A"
|
145075
145083
|
},
|
145076
145084
|
"CB27739": {
|
@@ -155500,6 +155508,7 @@
|
|
155500
155508
|
"unicode": "29D4B",
|
155501
155509
|
"uni_char": "𩵋",
|
155502
155510
|
"composition": "[魚-(烈-列)+大]",
|
155511
|
+
"moe_variant_id": "A04691-004",
|
155503
155512
|
"pua": "U+F75A0"
|
155504
155513
|
},
|
155505
155514
|
"CB30113": {
|
@@ -164191,6 +164200,7 @@
|
|
164191
164200
|
"unicode": "5BAB",
|
164192
164201
|
"uni_char": "宫",
|
164193
164202
|
"composition": "[宋-木+(口/口)]",
|
164203
|
+
"moe_variant_id": "A01028-001",
|
164194
164204
|
"pua": "U+F7BCF"
|
164195
164205
|
},
|
164196
164206
|
"CB31696": {
|
@@ -168131,6 +168141,7 @@
|
|
168131
168141
|
},
|
168132
168142
|
"CB32550": {
|
168133
168143
|
"composition": "[((嘹-口)-小)-日+(隹/寸)]",
|
168144
|
+
"moe_variant_id": "A00874-017",
|
168134
168145
|
"pua": "U+F7F26"
|
168135
168146
|
},
|
168136
168147
|
"CB32551": {
|
@@ -178076,7 +178087,7 @@
|
|
178076
178087
|
"uni_char": "禄",
|
178077
178088
|
"composition": "[祿-(彖-豕)+(┐@一)]",
|
178078
178089
|
"norm_big5_char": "祿",
|
178079
|
-
"moe_variant_id": "
|
178090
|
+
"moe_variant_id": "A02898-024",
|
178080
178091
|
"pua": "U+F85F2"
|
178081
178092
|
},
|
178082
178093
|
"CB34291": {
|
@@ -178837,6 +178848,7 @@
|
|
178837
178848
|
"unicode": "8F3C",
|
178838
178849
|
"uni_char": "輼",
|
178839
178850
|
"composition": "[輥-比+皿]",
|
178851
|
+
"moe_variant_id": "B05038-001",
|
178840
178852
|
"pua": "U+F866F"
|
178841
178853
|
},
|
178842
178854
|
"CB34416": {
|
@@ -181649,5 +181661,122 @@
|
|
181649
181661
|
"CB34867": {
|
181650
181662
|
"composition": "[恢-火+么]",
|
181651
181663
|
"pua": "U+F8833"
|
181664
|
+
},
|
181665
|
+
"CB34868": {
|
181666
|
+
"unicode": "25B41",
|
181667
|
+
"uni_char": "𥭁",
|
181668
|
+
"composition": "[竺-二+米]",
|
181669
|
+
"pua": "U+F8834"
|
181670
|
+
},
|
181671
|
+
"CB34869": {
|
181672
|
+
"unicode": "20114",
|
181673
|
+
"uni_char": "𠄔",
|
181674
|
+
"composition": "[?予]",
|
181675
|
+
"norm_big5_char": "幻",
|
181676
|
+
"moe_variant_id": "A01197-001",
|
181677
|
+
"pua": "U+F8835"
|
181678
|
+
},
|
181679
|
+
"CB34870": {
|
181680
|
+
"composition": "[八/ㄅ/羽]",
|
181681
|
+
"pua": "U+F8836"
|
181682
|
+
},
|
181683
|
+
"CB34871": {
|
181684
|
+
"unicode": "6E0C",
|
181685
|
+
"uni_char": "渌",
|
181686
|
+
"composition": "[淥-(彖-豕)+(┐@一)]",
|
181687
|
+
"norm_big5_char": "淥",
|
181688
|
+
"moe_variant_id": "B02145-001",
|
181689
|
+
"pua": "U+F8837"
|
181690
|
+
},
|
181691
|
+
"CB34872": {
|
181692
|
+
"unicode": "21356",
|
181693
|
+
"uni_char": "𡍖",
|
181694
|
+
"composition": "[土*(彔-(彖-豕)+(┐@一))]",
|
181695
|
+
"norm_big5_char": "埭",
|
181696
|
+
"moe_variant_id": "B00542-006",
|
181697
|
+
"pua": "U+F8838"
|
181698
|
+
},
|
181699
|
+
"CB34874": {
|
181700
|
+
"unicode": "226D5",
|
181701
|
+
"uni_char": "𢛕",
|
181702
|
+
"composition": "[怡-台+囷]",
|
181703
|
+
"moe_variant_id": "C03737",
|
181704
|
+
"pua": "U+F883A"
|
181705
|
+
},
|
181706
|
+
"CB34875": {
|
181707
|
+
"unicode": "260A0",
|
181708
|
+
"uni_char": "𦂠",
|
181709
|
+
"composition": "[綡-口+日]",
|
181710
|
+
"moe_variant_id": "C10288",
|
181711
|
+
"pua": "U+F883B"
|
181712
|
+
},
|
181713
|
+
"CB34876": {
|
181714
|
+
"unicode": "2F825",
|
181715
|
+
"uni_char": "勇",
|
181716
|
+
"composition": "[勇-用+田]",
|
181717
|
+
"norm_big5_char": "勇",
|
181718
|
+
"moe_variant_id": "A00388-002",
|
181719
|
+
"pua": "U+F883C"
|
181720
|
+
},
|
181721
|
+
"CB34877": {
|
181722
|
+
"unicode": "6E07",
|
181723
|
+
"uni_char": "渇",
|
181724
|
+
"composition": "[渴-人+(乏-之)]",
|
181725
|
+
"norm_big5_char": "渴",
|
181726
|
+
"moe_variant_id": "A02267-001",
|
181727
|
+
"pua": "U+F883D"
|
181728
|
+
},
|
181729
|
+
"CB34878": {
|
181730
|
+
"unicode": "9AD9",
|
181731
|
+
"uni_char": "髙",
|
181732
|
+
"composition": "[扃-戶+(〦/(〡*(一/一)*〡))]",
|
181733
|
+
"norm_big5_char": "高",
|
181734
|
+
"moe_variant_id": "A04667-003",
|
181735
|
+
"pua": "U+F883E"
|
181736
|
+
},
|
181737
|
+
"CB34879": {
|
181738
|
+
"unicode": "4E89",
|
181739
|
+
"uni_char": "争",
|
181740
|
+
"composition": "[爭-(采-木)+(色-巴)]",
|
181741
|
+
"norm_big5_char": "爭",
|
181742
|
+
"moe_variant_id": "A02468-001",
|
181743
|
+
"pua": "U+F883F"
|
181744
|
+
},
|
181745
|
+
"CB34880": {
|
181746
|
+
"unicode": "9759",
|
181747
|
+
"uni_char": "静",
|
181748
|
+
"composition": "[靜-(采-木)+(色-巴)]",
|
181749
|
+
"norm_big5_char": "靜",
|
181750
|
+
"moe_variant_id": "A04502-001",
|
181751
|
+
"pua": "U+F8840"
|
181752
|
+
},
|
181753
|
+
"CB34881": {
|
181754
|
+
"norm_unicode": "29C7F",
|
181755
|
+
"norm_uni_char": "𩱿",
|
181756
|
+
"composition": "[鬼-(白-日)+亡]",
|
181757
|
+
"norm_big5_char": "魅",
|
181758
|
+
"pua": "U+F8841"
|
181759
|
+
},
|
181760
|
+
"CB34882": {
|
181761
|
+
"unicode": "226D4",
|
181762
|
+
"uni_char": "𢛔",
|
181763
|
+
"composition": "[怡-台+東]",
|
181764
|
+
"moe_variant_id": "C03728",
|
181765
|
+
"pua": "U+F8842"
|
181766
|
+
},
|
181767
|
+
"CB34883": {
|
181768
|
+
"unicode": "25783",
|
181769
|
+
"uni_char": "𥞃",
|
181770
|
+
"composition": "[禾*出]",
|
181771
|
+
"moe_variant_id": "C09055",
|
181772
|
+
"pua": "U+F8843"
|
181773
|
+
},
|
181774
|
+
"CB34884": {
|
181775
|
+
"unicode": "3943",
|
181776
|
+
"uni_char": "㥃",
|
181777
|
+
"composition": "[怡-台+門]",
|
181778
|
+
"norm_big5_char": "悶",
|
181779
|
+
"moe_variant_id": "A01384-001",
|
181780
|
+
"pua": "U+F8844"
|
181652
181781
|
}
|
181653
181782
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.5.4
|
4
|
+
version: 3.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unihan2
|
@@ -16,20 +16,20 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.
|
19
|
+
version: '1.2'
|
20
20
|
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: 1.
|
22
|
+
version: 1.2.0
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
27
|
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: '1.
|
29
|
+
version: '1.2'
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: 1.
|
32
|
+
version: 1.2.0
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
34
|
name: nokogiri
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
@@ -85,6 +85,7 @@ files:
|
|
85
85
|
- lib/cbeta/gaiji.rb
|
86
86
|
- lib/cbeta/html_to_pdf.rb
|
87
87
|
- lib/cbeta/html_to_text.rb
|
88
|
+
- lib/cbeta/p5a_checker.rb
|
88
89
|
- lib/cbeta/p5a_to_html.rb
|
89
90
|
- lib/cbeta/p5a_to_html_for_every_edition.rb
|
90
91
|
- lib/cbeta/p5a_to_html_for_pdf.rb
|