cbeta 3.5.4 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e9706507dcec5a3319dbaf853c7cc38c57910508c744b419825b7468d87ff35c
4
- data.tar.gz: 4eaf41cc673fe83fb43b276be541496cc184b22e1b3a3a48c428e57555f4d77f
3
+ metadata.gz: d57589aae785253515b3d2e73f7e5ee0f2267eb5cfd78ffc07a31cda853e0dc4
4
+ data.tar.gz: 636713370e0c81c2b25121be8aa58f4d959eb24c0297f64d6a6300431848b805
5
5
  SHA512:
6
- metadata.gz: a4b71cf41c24169e6045ef9db81b59f2b41834fa1e76650c4031322405558505a539ef6d0c80081c51e8834cf6e10e38980ee4ef6513cc22c1b46f689bffd7f1
7
- data.tar.gz: 5e7b0767fe9e2d0ad294567064664c82050bc53b26ded128df005c6849ed525aae3162603b6ff143e2668d96638dcf5921f09836084510f57098699860e1e0bf
6
+ metadata.gz: de9137c36aad2f633b1b682d7b22a32d99d260b7e2d5a506bd404a2a13596098755884da9292dacf6692f9751b1744c9f8eccdd5aacc8670c66cadd0ab94a287
7
+ data.tar.gz: 8885756792aeb5e473fbf4ffa638c03ba330156a10e7f800464146c93b81defb9a7b8ed5103c6b9cee010c6714b3e361eb1ae0e685dd9f94fc29eb9f2dab1fdc
@@ -1,4 +1,10 @@
1
1
  module CbetaShare
2
+ def each_canon(xml_root)
3
+ Dir.entries(xml_root).sort.each do |c|
4
+ next unless c.match(/^#{CBETA::CANON}$/)
5
+ yield(c)
6
+ end
7
+ end
2
8
 
3
9
  def to_html(e)
4
10
  e.to_xml(
@@ -8,4 +14,4 @@ module CbetaShare
8
14
  )
9
15
  end
10
16
 
11
- end
17
+ end
@@ -0,0 +1,162 @@
1
require 'set'

require_relative 'cbeta_share'
2
+
3
# Checks CBETA XML P5a source files and collects the problems found.
#
# Problems reported by the code below:
# * text placed directly under a <div> ([E02])
# * duplicated <lb> line numbers ([E01]) and malformed <lb> numbers
# * <g> references for which the gaiji table has no entry
# * <graphic> files missing from the figures folder
# * <lem> / <rdg> elements without a wit attribute
# * files containing U+200B, and files that are not well-formed XML
class CBETA::P5aChecker
  include CbetaShare

  # Expected shape of an <lb> n attribute, anchored to the whole string.
  LB_FORMAT = /\A[a-z\d]\d{3}[a-z]\d+\z/.freeze

  # @param xml_root [String] path of the CBETA XML P5a source folder
  # @param figures [String] path of the figures folder
  #   (available from https://github.com/cbeta-git/CBR2X-figures)
  # @param log [String] log file path
  def initialize(xml_root: nil, figures: nil, log: nil)
    @gaijis = CBETA::Gaiji.new
    @xml_root = xml_root
    @figures = figures
    @log = log
  end

  # Walks every canon folder under the XML root, checks each file, then
  # prints the collected errors or writes them to the log file when one was
  # given.
  def check
    @errors = +''
    @g_errors = {}
    puts "xml: #{@xml_root}"
    each_canon(@xml_root) do |c|
      @canon = c
      handle_canon(File.join(@xml_root, @canon))
    end

    # Missing-gaiji errors are grouped per gaiji id and listed once, after
    # all files have been read.
    @g_errors.keys.sort.each do |gid|
      files = @g_errors[gid].to_a.join(',')
      @errors << "#{gid} 無缺字資料,出現於:#{files}\n"
    end

    if @errors.empty?
      puts "檢查完成,未發現錯誤。"
    elsif @log.nil?
      puts "\n發現錯誤:"
      puts @errors
    else
      File.write(@log, @errors)
      puts "\n發現錯誤,請查看 #{@log}"
    end
  end

  private

  # Reports non-blank text whose parent element is a <div>.
  def chk_text(node)
    return if node.text.strip.empty?
    return unless node.parent.name == 'div'

    error "lb: #{@lb}, text: #{node.text.inspect}", type: "[E02] 文字直接出現在 div 下"
  end

  # Records the current file under the gaiji id referenced by a <g> element
  # when the gaiji table has no entry for that id.
  def e_g(e)
    ref = e['ref']
    if ref.nil? || ref.empty?
      # Without this guard a <g> lacking ref would abort the whole run.
      error "g 缺少 ref 屬性"
      return
    end

    gid = ref[1..-1] # drop the first character of the reference
    return if @gaijis.key?(gid)

    (@g_errors[gid] ||= Set.new) << @basename
  end

  # Reports a <graphic> whose file is absent from the figures folder.
  def e_graphic(e)
    # figures: is optional; with no folder there is nothing to compare to.
    return if @figures.nil?

    src = e['url']
    if src.nil?
      error "graphic 缺少 url 屬性"
      return
    end

    url = File.basename(src)
    fn = File.join(@figures, @canon, url)
    error "圖檔 #{url} 不存在" unless File.exist?(fn)
  end

  # Validates the number of an <lb> element and reports line numbers that
  # occur more than once for the same edition within one file.
  def e_lb(e)
    return if e['type'] == 'old'

    n = e['n']
    if n.nil?
      error "lb 缺少 n 屬性"
      return
    end
    error "lb format error: #{n}" unless n.match?(LB_FORMAT)

    @lb = n
    ed = e['ed'].to_s # a missing ed attribute is treated as an empty id
    # add? returns nil when the key was already present, i.e. a duplicate.
    return if @lbs.add?("#{ed}#{@lb}")

    # Duplicates are tolerated for editions whose id starts with 'R'.
    return if ed.start_with?('R')

    error "lb: #{@lb}, ed: #{ed}", type: "[E01] 行號重複"
  end

  # Reports a <lem> without a wit attribute.
  def e_lem(e)
    error "lem 缺少 wit 屬性" unless e.key?('wit')
  end

  # Reports a <rdg> without a wit attribute, except type="cbetaRemark".
  def e_rdg(e)
    return if e['type'] == 'cbetaRemark'

    error "rdg 缺少 wit 屬性, lb: #{@lb}" unless e.key?('wit')
  end

  # Prints one error line and appends it to the collected errors.
  #
  # @param msg [String] description of the problem
  # @param type [String, nil] optional category label put before the file name
  def error(msg, type: nil)
    label = type.nil? ? '' : "#{type}: "
    line = "#{label}#{@basename}, #{msg}"
    puts line
    @errors << line << "\n"
  end

  # Processes every volume folder inside one canon folder.
  def handle_canon(folder)
    Dir.entries(folder).sort.each do |f|
      next if f.start_with?('.')

      path = File.join(folder, f)
      # A stray plain file here would make Dir.entries raise in handle_vol.
      next unless File.directory?(path)

      @vol = f
      $stderr.puts "#{@vol} "
      handle_vol(path)
    end
  end

  # Checks a single XML file: U+200B scan, well-formedness, then a walk of
  # the element tree.
  def handle_file(fn)
    @basename = File.basename(fn)
    # Reset so messages never quote a line number left from the prior file.
    @lb = nil

    s = File.read(fn)
    if s.include? "\u200B"
      @errors << "#{@basename} 含有 U+200B Zero Width Space 字元\n"
    end

    doc = Nokogiri::XML(s)
    if doc.errors.empty?
      doc.remove_namespaces!
      @lbs = Set.new # line numbers seen so far in this file
      traverse(doc.root)
    else
      @errors << "錯誤: #{@basename} not well-formed\n"
    end
  end

  # Dispatches one element to its checker; other elements are descended into.
  #
  # NOTE(review): the children of g, graphic, lb, lem and rdg are not
  # visited, so e.g. a <g> nested inside <lem>/<rdg> is never checked.
  # Confirm whether that is intended.
  def handle_node(e)
    case e.name
    when 'g'       then e_g(e)
    when 'graphic' then e_graphic(e)
    when 'lb'      then e_lb(e)
    when 'lem'     then e_lem(e)
    when 'rdg'     then e_rdg(e)
    else traverse(e)
    end
  end

  # Processes every file inside one volume folder.
  def handle_vol(folder)
    Dir.entries(folder).sort.each do |f|
      next if f.start_with?('.')

      path = File.join(folder, f)
      # Sub-folders cannot be read as a file; skip them instead of raising.
      next unless File.file?(path)

      handle_file(path)
    end
  end

  # Visits the children of +e+: text nodes go to chk_text, element nodes to
  # handle_node; every other node kind is ignored.
  def traverse(e)
    e.children.each do |c|
      if c.text?
        chk_text(c)
      elsif c.element? # test the child itself, not the parent +e+
        handle_node(c)
      end
    end
  end
end
data/lib/cbeta.rb CHANGED
@@ -282,6 +282,7 @@ require 'cbeta/canon'
282
282
  require 'cbeta/char_count'
283
283
  require 'cbeta/char_freq'
284
284
  require 'cbeta/html_to_pdf'
285
+ require 'cbeta/p5a_checker'
285
286
  require 'cbeta/p5a_to_html'
286
287
  require 'cbeta/p5a_to_html_for_every_edition'
287
288
  require 'cbeta/p5a_to_html_for_pdf'
@@ -122910,6 +122910,7 @@
122910
122910
  },
122911
122911
  "CB22704": {
122912
122912
  "composition": "[諒-口+日]",
122913
+ "moe_variant_id": "A03845-001",
122913
122914
  "pua": "U+F58B0"
122914
122915
  },
122915
122916
  "CB22705": {
@@ -123668,6 +123669,7 @@
123668
123669
  "unicode": "2087D",
123669
123670
  "uni_char": "𠡽",
123670
123671
  "composition": "[(京-口+日)*力]",
123672
+ "moe_variant_id": "B00255-001",
123671
123673
  "pua": "U+F596A"
123672
123674
  },
123673
123675
  "CB22891": {
@@ -136899,6 +136901,7 @@
136899
136901
  "unicode": "22C4A",
136900
136902
  "uni_char": "𢱊",
136901
136903
  "composition": "[掠-口+日]",
136904
+ "moe_variant_id": "A01586-001",
136902
136905
  "pua": "U+F6476"
136903
136906
  },
136904
136907
  "CB25719": {
@@ -140257,6 +140260,7 @@
140257
140260
  "unicode": "7174",
140258
140261
  "uni_char": "煴",
140259
140262
  "composition": "[火*(日/皿)]",
140263
+ "moe_variant_id": "B02420-003",
140260
140264
  "pua": "U+F6796"
140261
140265
  },
140262
140266
  "CB26519": {
@@ -140701,7 +140705,10 @@
140701
140705
  "pua": "U+F680A"
140702
140706
  },
140703
140707
  "CB26635": {
140708
+ "unicode": "2907F",
140709
+ "uni_char": "𩁿",
140704
140710
  "composition": "[雨/(匚@一)]",
140711
+ "moe_variant_id": "A03605-029-1",
140705
140712
  "pua": "U+F680B"
140706
140713
  },
140707
140714
  "CB26636": {
@@ -142366,10 +142373,10 @@
142366
142373
  "pua": "U+F69AF"
142367
142374
  },
142368
142375
  "CB27057": {
142369
- "unicode": "233FD",
142370
- "uni_char": "𣏽",
142376
+ "unicode": "22A8A",
142377
+ "uni_char": "𢪊",
142371
142378
  "composition": "[打-丁+(敲-高)]",
142372
- "moe_variant_id": "a01880-002",
142379
+ "moe_variant_id": "C04127",
142373
142380
  "pua": "U+F69B1"
142374
142381
  },
142375
142382
  "CB27058": {
@@ -145071,6 +145078,7 @@
145071
145078
  },
145072
145079
  "CB27738": {
145073
145080
  "composition": "[打-丁+(孝-子+丁)]",
145081
+ "moe_variant_id": "A01563-001",
145074
145082
  "pua": "U+F6C5A"
145075
145083
  },
145076
145084
  "CB27739": {
@@ -155500,6 +155508,7 @@
155500
155508
  "unicode": "29D4B",
155501
155509
  "uni_char": "𩵋",
155502
155510
  "composition": "[魚-(烈-列)+大]",
155511
+ "moe_variant_id": "A04691-004",
155503
155512
  "pua": "U+F75A0"
155504
155513
  },
155505
155514
  "CB30113": {
@@ -164191,6 +164200,7 @@
164191
164200
  "unicode": "5BAB",
164192
164201
  "uni_char": "宫",
164193
164202
  "composition": "[宋-木+(口/口)]",
164203
+ "moe_variant_id": "A01028-001",
164194
164204
  "pua": "U+F7BCF"
164195
164205
  },
164196
164206
  "CB31696": {
@@ -168131,6 +168141,7 @@
168131
168141
  },
168132
168142
  "CB32550": {
168133
168143
  "composition": "[((嘹-口)-小)-日+(隹/寸)]",
168144
+ "moe_variant_id": "A00874-017",
168134
168145
  "pua": "U+F7F26"
168135
168146
  },
168136
168147
  "CB32551": {
@@ -178076,7 +178087,7 @@
178076
178087
  "uni_char": "禄",
178077
178088
  "composition": "[祿-(彖-豕)+(┐@一)]",
178078
178089
  "norm_big5_char": "祿",
178079
- "moe_variant_id": "a02898-008",
178090
+ "moe_variant_id": "A02898-024",
178080
178091
  "pua": "U+F85F2"
178081
178092
  },
178082
178093
  "CB34291": {
@@ -178837,6 +178848,7 @@
178837
178848
  "unicode": "8F3C",
178838
178849
  "uni_char": "輼",
178839
178850
  "composition": "[輥-比+皿]",
178851
+ "moe_variant_id": "B05038-001",
178840
178852
  "pua": "U+F866F"
178841
178853
  },
178842
178854
  "CB34416": {
@@ -181649,5 +181661,122 @@
181649
181661
  "CB34867": {
181650
181662
  "composition": "[恢-火+么]",
181651
181663
  "pua": "U+F8833"
181664
+ },
181665
+ "CB34868": {
181666
+ "unicode": "25B41",
181667
+ "uni_char": "𥭁",
181668
+ "composition": "[竺-二+米]",
181669
+ "pua": "U+F8834"
181670
+ },
181671
+ "CB34869": {
181672
+ "unicode": "20114",
181673
+ "uni_char": "𠄔",
181674
+ "composition": "[?予]",
181675
+ "norm_big5_char": "幻",
181676
+ "moe_variant_id": "A01197-001",
181677
+ "pua": "U+F8835"
181678
+ },
181679
+ "CB34870": {
181680
+ "composition": "[八/ㄅ/羽]",
181681
+ "pua": "U+F8836"
181682
+ },
181683
+ "CB34871": {
181684
+ "unicode": "6E0C",
181685
+ "uni_char": "渌",
181686
+ "composition": "[淥-(彖-豕)+(┐@一)]",
181687
+ "norm_big5_char": "淥",
181688
+ "moe_variant_id": "B02145-001",
181689
+ "pua": "U+F8837"
181690
+ },
181691
+ "CB34872": {
181692
+ "unicode": "21356",
181693
+ "uni_char": "𡍖",
181694
+ "composition": "[土*(彔-(彖-豕)+(┐@一))]",
181695
+ "norm_big5_char": "埭",
181696
+ "moe_variant_id": "B00542-006",
181697
+ "pua": "U+F8838"
181698
+ },
181699
+ "CB34874": {
181700
+ "unicode": "226D5",
181701
+ "uni_char": "𢛕",
181702
+ "composition": "[怡-台+囷]",
181703
+ "moe_variant_id": "C03737",
181704
+ "pua": "U+F883A"
181705
+ },
181706
+ "CB34875": {
181707
+ "unicode": "260A0",
181708
+ "uni_char": "𦂠",
181709
+ "composition": "[綡-口+日]",
181710
+ "moe_variant_id": "C10288",
181711
+ "pua": "U+F883B"
181712
+ },
181713
+ "CB34876": {
181714
+ "unicode": "2F825",
181715
+ "uni_char": "勇",
181716
+ "composition": "[勇-用+田]",
181717
+ "norm_big5_char": "勇",
181718
+ "moe_variant_id": "A00388-002",
181719
+ "pua": "U+F883C"
181720
+ },
181721
+ "CB34877": {
181722
+ "unicode": "6E07",
181723
+ "uni_char": "渇",
181724
+ "composition": "[渴-人+(乏-之)]",
181725
+ "norm_big5_char": "渴",
181726
+ "moe_variant_id": "A02267-001",
181727
+ "pua": "U+F883D"
181728
+ },
181729
+ "CB34878": {
181730
+ "unicode": "9AD9",
181731
+ "uni_char": "髙",
181732
+ "composition": "[扃-戶+(〦/(〡*(一/一)*〡))]",
181733
+ "norm_big5_char": "高",
181734
+ "moe_variant_id": "A04667-003",
181735
+ "pua": "U+F883E"
181736
+ },
181737
+ "CB34879": {
181738
+ "unicode": "4E89",
181739
+ "uni_char": "争",
181740
+ "composition": "[爭-(采-木)+(色-巴)]",
181741
+ "norm_big5_char": "爭",
181742
+ "moe_variant_id": "A02468-001",
181743
+ "pua": "U+F883F"
181744
+ },
181745
+ "CB34880": {
181746
+ "unicode": "9759",
181747
+ "uni_char": "静",
181748
+ "composition": "[靜-(采-木)+(色-巴)]",
181749
+ "norm_big5_char": "靜",
181750
+ "moe_variant_id": "A04502-001",
181751
+ "pua": "U+F8840"
181752
+ },
181753
+ "CB34881": {
181754
+ "norm_unicode": "29C7F",
181755
+ "norm_uni_char": "𩱿",
181756
+ "composition": "[鬼-(白-日)+亡]",
181757
+ "norm_big5_char": "魅",
181758
+ "pua": "U+F8841"
181759
+ },
181760
+ "CB34882": {
181761
+ "unicode": "226D4",
181762
+ "uni_char": "𢛔",
181763
+ "composition": "[怡-台+東]",
181764
+ "moe_variant_id": "C03728",
181765
+ "pua": "U+F8842"
181766
+ },
181767
+ "CB34883": {
181768
+ "unicode": "25783",
181769
+ "uni_char": "𥞃",
181770
+ "composition": "[禾*出]",
181771
+ "moe_variant_id": "C09055",
181772
+ "pua": "U+F8843"
181773
+ },
181774
+ "CB34884": {
181775
+ "unicode": "3943",
181776
+ "uni_char": "㥃",
181777
+ "composition": "[怡-台+門]",
181778
+ "norm_big5_char": "悶",
181779
+ "moe_variant_id": "A01384-001",
181780
+ "pua": "U+F8844"
181652
181781
  }
181653
181782
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.5.4
4
+ version: 3.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-03-22 00:00:00.000000000 Z
11
+ date: 2025-04-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unihan2
@@ -16,20 +16,20 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.1'
19
+ version: '1.2'
20
20
  - - ">="
21
21
  - !ruby/object:Gem::Version
22
- version: 1.1.0
22
+ version: 1.2.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
27
  - - "~>"
28
28
  - !ruby/object:Gem::Version
29
- version: '1.1'
29
+ version: '1.2'
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
- version: 1.1.0
32
+ version: 1.2.0
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: nokogiri
35
35
  requirement: !ruby/object:Gem::Requirement
@@ -85,6 +85,7 @@ files:
85
85
  - lib/cbeta/gaiji.rb
86
86
  - lib/cbeta/html_to_pdf.rb
87
87
  - lib/cbeta/html_to_text.rb
88
+ - lib/cbeta/p5a_checker.rb
88
89
  - lib/cbeta/p5a_to_html.rb
89
90
  - lib/cbeta/p5a_to_html_for_every_edition.rb
90
91
  - lib/cbeta/p5a_to_html_for_pdf.rb