metanorma-utils 1.10.3 → 1.11.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/utils/cjk.rb +110 -0
- data/lib/utils/log.rb +2 -2
- data/lib/utils/main.rb +1 -3
- data/lib/utils/version.rb +1 -1
- data/lib/utils/xml.rb +48 -5
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bb5a317fefc85b76b0442eadf4805403c76e68384b9ea22685941442db330a3f
|
4
|
+
data.tar.gz: 01cf01d1d045ab721fdba705d79757af0e18dd025f67f070731a7029f1736c1e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: be7834de49cb96791995188b265fb1cf97fa29366ff367931007ea4101233aa16317f5a31e4b4cb837b9ea6418f92c68df0848575d3ba6b3fce8e50c072e9571
|
7
|
+
data.tar.gz: 2f2b23b525347b8cb07d2cba54376685357c1ef1b2d78120504f91e91fa10eef1c0298cf5c4ef03ef1e97ee43d9ac9340b4c2657d2196980dfd548ee1b4f7a94
|
data/lib/utils/cjk.rb
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
module Metanorma
|
2
|
+
module Utils
|
3
|
+
class << self
|
4
|
+
# Basic CJK scripts
|
5
|
+
HAN = "\\p{Han}".freeze
|
6
|
+
BOPOMOFO = "\\p{Bopomofo}".freeze
|
7
|
+
HANGUL = "\\p{Hangul}".freeze
|
8
|
+
HIRAGANA = "\\p{Hiragana}".freeze
|
9
|
+
KATAKANA = "\\p{Katakana}".freeze
|
10
|
+
|
11
|
+
# Script extensions - characters shared between scripts
|
12
|
+
|
13
|
+
# CJK Symbols and Punctuation (U+3000–U+303F)
|
14
|
+
# Used across all CJK scripts
|
15
|
+
CJK_SYMBOLS = "[\\u3000-\\u303F]".freeze
|
16
|
+
|
17
|
+
# CJK Punctuation (subset of CJK Symbols commonly used)
|
18
|
+
CJK_PUNCTUATION = "[\\u3001-\\u3003\\u3008-\\u3011\\u3014-\\u301F]".freeze
|
19
|
+
|
20
|
+
# Halfwidth and Fullwidth Forms (U+FF00–U+FFEF)
|
21
|
+
# Used in all CJK contexts
|
22
|
+
CJK_HALFWIDTH_FULLWIDTH = "[\\uFF00-\\uFFEF]".freeze
|
23
|
+
|
24
|
+
# CJK Compatibility Forms (U+FE30–U+FE4F)
|
25
|
+
# Primarily used with Han but relevant for all CJK
|
26
|
+
CJK_COMPAT = "[\\uFE30-\\uFE4F]".freeze
|
27
|
+
|
28
|
+
# Vertical Forms (U+FE10–U+FE1F)
|
29
|
+
# Used in vertical text layout for all CJK
|
30
|
+
CJK_VERTICAL = "[\\uFE10-\\uFE1F]".freeze
|
31
|
+
|
32
|
+
# Small Form Variants (U+FE50–U+FE6F)
|
33
|
+
# Used in all CJK contexts
|
34
|
+
CJK_SMALL_FORMS = "[\\uFE50-\\uFE6F]".freeze
|
35
|
+
|
36
|
+
# Ideographic Description Characters (U+2FF0–U+2FFF)
|
37
|
+
# Used with Han script
|
38
|
+
HAN_IDC = "[\\u2FF0-\\u2FFF]".freeze
|
39
|
+
|
40
|
+
# Kanbun (U+3190–U+319F)
|
41
|
+
# Used with Han script for Japanese
|
42
|
+
KANBUN = "[\\u3190-\\u319F]".freeze
|
43
|
+
|
44
|
+
# CJK Compatibility (U+3300–U+33FF)
|
45
|
+
# Used with Han script
|
46
|
+
CJK_COMPAT_IDEOGRAPHS = "[\\u3300-\\u33FF]".freeze
|
47
|
+
|
48
|
+
# CJK Compatibility Ideographs (U+F900–U+FAFF)
|
49
|
+
HAN_COMPAT_IDEOGRAPHS = "[\\uF900-\\uFAFF]".freeze
|
50
|
+
|
51
|
+
# Script extensions by primary script
|
52
|
+
HAN_EXTENSIONS = [
|
53
|
+
HAN,
|
54
|
+
CJK_SYMBOLS,
|
55
|
+
CJK_PUNCTUATION,
|
56
|
+
CJK_HALFWIDTH_FULLWIDTH,
|
57
|
+
CJK_COMPAT,
|
58
|
+
CJK_VERTICAL,
|
59
|
+
CJK_SMALL_FORMS,
|
60
|
+
HAN_IDC,
|
61
|
+
KANBUN,
|
62
|
+
CJK_COMPAT_IDEOGRAPHS,
|
63
|
+
HAN_COMPAT_IDEOGRAPHS
|
64
|
+
].join("|").freeze
|
65
|
+
|
66
|
+
HANGUL_EXTENSIONS = [
|
67
|
+
HANGUL,
|
68
|
+
CJK_SYMBOLS,
|
69
|
+
CJK_PUNCTUATION,
|
70
|
+
CJK_HALFWIDTH_FULLWIDTH,
|
71
|
+
CJK_VERTICAL,
|
72
|
+
CJK_SMALL_FORMS
|
73
|
+
].join("|").freeze
|
74
|
+
|
75
|
+
HIRAGANA_EXTENSIONS = [
|
76
|
+
HIRAGANA,
|
77
|
+
CJK_SYMBOLS,
|
78
|
+
CJK_PUNCTUATION,
|
79
|
+
CJK_HALFWIDTH_FULLWIDTH,
|
80
|
+
CJK_VERTICAL,
|
81
|
+
CJK_SMALL_FORMS
|
82
|
+
].join("|").freeze
|
83
|
+
|
84
|
+
KATAKANA_EXTENSIONS = [
|
85
|
+
KATAKANA,
|
86
|
+
CJK_SYMBOLS,
|
87
|
+
CJK_PUNCTUATION,
|
88
|
+
CJK_HALFWIDTH_FULLWIDTH,
|
89
|
+
CJK_VERTICAL,
|
90
|
+
CJK_SMALL_FORMS
|
91
|
+
].join("|").freeze
|
92
|
+
|
93
|
+
BOPOMOFO_EXTENSIONS = [
|
94
|
+
BOPOMOFO,
|
95
|
+
CJK_SYMBOLS,
|
96
|
+
CJK_PUNCTUATION,
|
97
|
+
CJK_HALFWIDTH_FULLWIDTH
|
98
|
+
].join("|").freeze
|
99
|
+
|
100
|
+
# Combined CJK pattern including all script extensions
|
101
|
+
CJK = [
|
102
|
+
HAN_EXTENSIONS,
|
103
|
+
HANGUL_EXTENSIONS,
|
104
|
+
HIRAGANA_EXTENSIONS,
|
105
|
+
KATAKANA_EXTENSIONS,
|
106
|
+
BOPOMOFO_EXTENSIONS
|
107
|
+
].join("|").freeze
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
data/lib/utils/log.rb
CHANGED
@@ -86,7 +86,7 @@ module Metanorma
|
|
86
86
|
while !node.nil? && node["id"].nil? && node.respond_to?(:parent)
|
87
87
|
node = node.parent
|
88
88
|
end
|
89
|
-
node.respond_to?(:parent) ? "ID #{node['id']}" : ""
|
89
|
+
node.respond_to?(:parent) ? "ID #{node['anchor'] || node['id']}" : ""
|
90
90
|
elsif node.is_a? String then node
|
91
91
|
elsif node.respond_to?(:lineno) && !node.lineno.nil? &&
|
92
92
|
!node.lineno.empty?
|
@@ -224,7 +224,7 @@ module Metanorma
|
|
224
224
|
/^ID /.match?(loc) or return [loc, nil]
|
225
225
|
loc.sub!(/^ID /, "")
|
226
226
|
loc = @mapid[loc] while @mapid[loc]
|
227
|
-
url = "#{@htmlfilename}##{loc}"
|
227
|
+
url = "#{@htmlfilename}##{to_ncname loc}"
|
228
228
|
[loc, url]
|
229
229
|
end
|
230
230
|
|
data/lib/utils/main.rb
CHANGED
@@ -5,6 +5,7 @@ require "htmlentities"
|
|
5
5
|
require "nokogiri"
|
6
6
|
require "csv"
|
7
7
|
require_relative "../sterile/sterile"
|
8
|
+
require_relative "cjk"
|
8
9
|
|
9
10
|
module Metanorma
|
10
11
|
module Utils
|
@@ -35,9 +36,6 @@ module Metanorma
|
|
35
36
|
docfile.nil? ? "./" : "#{Pathname.new(docfile).parent}/"
|
36
37
|
end
|
37
38
|
|
38
|
-
CJK = "\\p{Han}|\\p{Bopomofo}|\\p{Hangul}|\\p{Hiragana}|\\p{Katakana}"
|
39
|
-
.freeze
|
40
|
-
|
41
39
|
# TODO needs internationalisation of quote
|
42
40
|
def smartformat(text)
|
43
41
|
ret = HTMLEntities.new.decode(
|
data/lib/utils/version.rb
CHANGED
data/lib/utils/xml.rb
CHANGED
@@ -40,6 +40,41 @@ module Metanorma
|
|
40
40
|
(ret1 || "") + ret2.gsub(%r([#{NAMECHAR}#])o, "_")
|
41
41
|
end
|
42
42
|
|
43
|
+
# Following XML requirements: https://www.w3.org/TR/REC-xml/#NT-Name
|
44
|
+
TAG_NAME_START_CODEPOINTS = "@:A-Z_a-z\u{C0}-\u{D6}\u{D8}-\u{F6}\u{F8}-\u{2FF}\u{370}-\u{37D}\u{37F}-\u{1FFF}" \
|
45
|
+
"\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}" \
|
46
|
+
"\u{FDF0}-\u{FFFD}\u{10000}-\u{EFFFF}"
|
47
|
+
INVALID_TAG_NAME_START_REGEXP = /[^#{TAG_NAME_START_CODEPOINTS}]/
|
48
|
+
TAG_NAME_FOLLOWING_CODEPOINTS = "#{TAG_NAME_START_CODEPOINTS}\\-.0-9\u{B7}\u{0300}-\u{036F}\u{203F}-\u{2040}"
|
49
|
+
INVALID_TAG_NAME_FOLLOWING_REGEXP = /[^#{TAG_NAME_FOLLOWING_CODEPOINTS}]/
|
50
|
+
SAFE_XML_TAG_NAME_REGEXP = /\A[#{TAG_NAME_START_CODEPOINTS}][#{TAG_NAME_FOLLOWING_CODEPOINTS}]*\z/
|
51
|
+
TAG_NAME_REPLACEMENT_CHAR = "_"
|
52
|
+
|
53
|
+
# from: https://github.com/rails/rails/blob/3235827585d87661942c91bc81f64f56d710f0b2/activesupport/lib/active_support/core_ext/erb/util.rb
|
54
|
+
# A utility method for escaping XML names of tags and names of attributes.
|
55
|
+
#
|
56
|
+
# xml_name_escape('1 < 2 & 3')
|
57
|
+
# # => "1___2___3"
|
58
|
+
#
|
59
|
+
# It follows the requirements of the specification: https://www.w3.org/TR/REC-xml/#NT-Name
|
60
|
+
def to_ncname(name, asciionly: true)
|
61
|
+
name = name.to_s
|
62
|
+
return "" if name.nil? || name.empty?
|
63
|
+
return name if name.match?(SAFE_XML_TAG_NAME_REGEXP)
|
64
|
+
|
65
|
+
starting_char = name[0]
|
66
|
+
starting_char.gsub!(INVALID_TAG_NAME_START_REGEXP,
|
67
|
+
TAG_NAME_REPLACEMENT_CHAR)
|
68
|
+
|
69
|
+
return starting_char if name.size == 1
|
70
|
+
|
71
|
+
following_chars = name[1..-1]
|
72
|
+
following_chars.gsub!(INVALID_TAG_NAME_FOLLOWING_REGEXP,
|
73
|
+
TAG_NAME_REPLACEMENT_CHAR)
|
74
|
+
|
75
|
+
starting_char << following_chars
|
76
|
+
end
|
77
|
+
|
43
78
|
def anchor_or_uuid(node = nil)
|
44
79
|
uuid = UUIDTools::UUID.random_create
|
45
80
|
node.nil? || node.id.nil? || node.id.empty? ? "_#{uuid}" : node.id
|
@@ -69,7 +104,7 @@ module Metanorma
|
|
69
104
|
cjk2 = /#{CJK}/o.match?(nextfirst)
|
70
105
|
text1 = /[^\p{Z}\p{C}]/.match?(last)
|
71
106
|
text2 = /[^\p{Z}\p{C}]/.match?(nextfirst)
|
72
|
-
|
107
|
+
cjk1 && (cjk2 || !text2) and next
|
73
108
|
!text1 && cjk2 and next
|
74
109
|
ret[i] += " "
|
75
110
|
end
|
@@ -125,10 +160,18 @@ module Metanorma
|
|
125
160
|
end
|
126
161
|
|
127
162
|
# all element/attribute pairs that are ID anchors in Metanorma
|
128
|
-
def anchor_attributes
|
129
|
-
[%w
|
130
|
-
|
131
|
-
|
163
|
+
def anchor_attributes(presxml: false)
|
164
|
+
ret = [%w(review from), %w(review to), %w(callout target), %w(xref to),
|
165
|
+
%w(eref bibitemid), %w(citation bibitemid), %w(xref target),
|
166
|
+
%w(label for), %w(location target), %w(index to),
|
167
|
+
%w(termsource bibitemid), %w(admonition target)]
|
168
|
+
ret1 = [%w(fn target), %w(semx source), %w(fmt-title source),
|
169
|
+
%w(fmt-xref to), %w(fmt-xref target), %w(fmt-eref bibitemid),
|
170
|
+
%w(fmt-xref-label container), %w(fmt-fn-body target),
|
171
|
+
%w(fmt-review-start source), %w(fmt-review-start end),
|
172
|
+
%w(fmt-review-start target), %w(fmt-review-end source),
|
173
|
+
%w(fmt-review-end start), %w(fmt-review-end target)]
|
174
|
+
presxml ? ret + ret1 : ret
|
132
175
|
end
|
133
176
|
|
134
177
|
# convert definition list term/value pair into Nokogiri XML attribute
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metanorma-utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.10.3
|
4
|
+
version: 1.11.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-05-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: asciidoctor
|
@@ -321,6 +321,7 @@ files:
|
|
321
321
|
- README.adoc
|
322
322
|
- lib/metanorma-utils.rb
|
323
323
|
- lib/sterile/sterile.rb
|
324
|
+
- lib/utils/cjk.rb
|
324
325
|
- lib/utils/hash_transform_keys.rb
|
325
326
|
- lib/utils/image.rb
|
326
327
|
- lib/utils/linestatus.rb
|