@shd101wyy/yo 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -6
- package/out/cjs/index.cjs +546 -535
- package/out/cjs/yo-cli.cjs +636 -623
- package/out/esm/index.mjs +478 -467
- package/out/types/src/codegen/codegen-c.d.ts +2 -0
- package/out/types/src/codegen/functions/context.d.ts +1 -0
- package/out/types/src/codegen/functions/generation.d.ts +10 -0
- package/out/types/src/codegen/utils/index.d.ts +1 -0
- package/out/types/src/env.d.ts +12 -1
- package/out/types/src/evaluator/builtins/build.d.ts +1 -0
- package/out/types/src/evaluator/context.d.ts +1 -0
- package/out/types/src/evaluator/types/enum.d.ts +1 -1
- package/out/types/src/evaluator/types/synthesizer.d.ts +6 -1
- package/out/types/src/expr.d.ts +2 -0
- package/out/types/src/target.d.ts +1 -0
- package/out/types/src/types/compatibility.d.ts +1 -1
- package/out/types/src/types/creators.d.ts +2 -1
- package/out/types/src/types/definitions.d.ts +11 -0
- package/out/types/src/types/guards.d.ts +2 -1
- package/out/types/src/types/tags.d.ts +2 -1
- package/out/types/src/value.d.ts +2 -1
- package/out/types/tsconfig.tsbuildinfo +1 -1
- package/package.json +1 -1
- package/std/build.yo +2 -1
- package/std/collections/array_list.yo +133 -1
- package/std/encoding/html.yo +283 -0
- package/std/encoding/html_char_utils.yo +36 -0
- package/std/encoding/html_entities.yo +2262 -0
- package/std/encoding/punycode.yo +366 -0
- package/std/fmt/to_string.yo +5 -4
- package/std/glob/index.yo +2 -2
- package/std/libc/wctype.yo +55 -0
- package/std/path.yo +6 -6
- package/std/prelude.yo +193 -7
- package/std/regex/parser.yo +69 -4
- package/std/regex/vm.yo +18 -31
- package/std/string/string.yo +1388 -1337
- package/std/string/unicode.yo +242 -0
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
// Unicode-aware case conversion
|
|
2
|
+
//
|
|
3
|
+
// Provides Unicode-aware lowercase/uppercase conversion using C's towlower/towupper,
|
|
4
|
+
// plus hand-coded tables for special case folding entries where one codepoint
|
|
5
|
+
// maps to multiple codepoints.
|
|
6
|
+
//
|
|
7
|
+
// Example:
|
|
8
|
+
// { unicode_to_lowercase, unicode_to_uppercase } :: import "std/string/unicode";
|
|
9
|
+
//
|
|
10
|
+
// lower := unicode_to_lowercase(`HELLO WÖRLD`); // "hello wörld"
|
|
11
|
+
// upper := unicode_to_uppercase(`hello wörld`); // "HELLO WÖRLD"
|
|
12
|
+
|
|
13
|
+
open import "../string";
|
|
14
|
+
{ ArrayList } :: import "../collections/array_list";
|
|
15
|
+
|
|
16
|
+
// Declare towlower/towupper directly with i32 to avoid wint_t cast issues.
|
|
17
|
+
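// wint_t is a 32-bit integer on glibc (unsigned int) and macOS (int), so i32 is compatible there. NOTE(review): on Windows wint_t is a 16-bit unsigned short, so codepoints above U+FFFF would be truncated - confirm if that target matters.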
|
|
18
|
+
c_include "<wctype.h>",
|
|
19
|
+
towlower :
|
|
20
|
+
fn(wc : i32) -> i32,
|
|
21
|
+
towupper :
|
|
22
|
+
fn(wc : i32) -> i32
|
|
23
|
+
;
|
|
24
|
+
|
|
25
|
+
// Outcome of decoding one UTF-8 sequence: the decoded value plus how far
// the reader has to advance to reach the next sequence.
_DecodeResult :: struct(
  // The decoded Unicode codepoint.
  codepoint : i32,
  // How many input bytes the encoded sequence occupied.
  bytes_consumed : usize
);
|
|
30
|
+
|
|
31
|
+
// Decode the UTF-8 sequence that begins at byte offset `i` of `bytes`.
//
// Returns the decoded codepoint together with the number of bytes the caller
// should advance by. `i` must be a valid index into `bytes` (the first read
// below unwraps the result of `bytes.get(i)`).
//
// The sequence length is taken from the lead byte:
//   0xxxxxxx -> 1 byte, 110xxxxx -> 2 bytes, 1110xxxx -> 3 bytes, 11110xxx -> 4 bytes.
// A lead byte matching none of these patterns (a stray continuation byte
// 0x80-0xBF, or 0xF8-0xFF), or a sequence that would extend past the end of
// `bytes`, decodes to U+FFFD REPLACEMENT CHARACTER and consumes exactly one
// byte. This guarantees forward progress and keeps every `get` index in range.
// Continuation bytes are masked with 0x3F but not otherwise validated.
_decode_utf8 :: (fn(bytes: ArrayList(u8), i: usize) -> _DecodeResult)({
  (b0 : i32) = i32(bytes.get(i).unwrap());
  (total : usize) = bytes.len();
  cond(
    // 1-byte ASCII
    ((b0 & i32(0x80)) == i32(0)) => {
      _DecodeResult(codepoint: b0, bytes_consumed: usize(1))
    },
    // 2-byte sequence, only when the continuation byte exists
    (((b0 & i32(0xE0)) == i32(0xC0)) && ((i + usize(1)) < total)) => {
      (b1 : i32) = i32(bytes.get((i + usize(1))).unwrap());
      (cp : i32) = (((b0 & i32(0x1F)) << i32(6)) | (b1 & i32(0x3F)));
      _DecodeResult(codepoint: cp, bytes_consumed: usize(2))
    },
    // 3-byte sequence, only when both continuation bytes exist
    (((b0 & i32(0xF0)) == i32(0xE0)) && ((i + usize(2)) < total)) => {
      (b1 : i32) = i32(bytes.get((i + usize(1))).unwrap());
      (b2 : i32) = i32(bytes.get((i + usize(2))).unwrap());
      (cp : i32) = ((((b0 & i32(0x0F)) << i32(12)) | ((b1 & i32(0x3F)) << i32(6))) | (b2 & i32(0x3F)));
      _DecodeResult(codepoint: cp, bytes_consumed: usize(3))
    },
    // 4-byte sequence: the lead byte must really be 11110xxx and all three
    // continuation bytes must exist
    (((b0 & i32(0xF8)) == i32(0xF0)) && ((i + usize(3)) < total)) => {
      (b1 : i32) = i32(bytes.get((i + usize(1))).unwrap());
      (b2 : i32) = i32(bytes.get((i + usize(2))).unwrap());
      (b3 : i32) = i32(bytes.get((i + usize(3))).unwrap());
      (cp : i32) = (((((b0 & i32(0x07)) << i32(18)) | ((b1 & i32(0x3F)) << i32(12))) | ((b2 & i32(0x3F)) << i32(6))) | (b3 & i32(0x3F)));
      _DecodeResult(codepoint: cp, bytes_consumed: usize(4))
    },
    // Malformed lead byte or truncated sequence: emit U+FFFD and skip one byte
    true => {
      _DecodeResult(codepoint: i32(0xFFFD), bytes_consumed: usize(1))
    }
  )
});
|
|
62
|
+
|
|
63
|
+
// Append the UTF-8 encoding of codepoint `cp` to the byte list behind `out`.
//
// The encoded length is chosen from the magnitude of `cp`:
//   below 0x80    -> 1 byte
//   below 0x800   -> 2 bytes
//   below 0x10000 -> 3 bytes
//   otherwise     -> 4 bytes
// `cp` is not validated; whatever value is passed goes through the same bit
// arithmetic.
_encode_utf8 :: (fn(cp: i32, out: *(ArrayList(u8))) -> unit)({
  // Final continuation byte (low six payload bits); shared by every
  // multi-byte form and unused on the single-byte path.
  (tail : i32) = (i32(0x80) | (cp & i32(0x3F)));
  cond(
    (cp < i32(0x80)) => {
      out.*.push(u8(cp));
    },
    (cp < i32(0x800)) => {
      (lead : i32) = (i32(0xC0) | (cp >> i32(6)));
      out.*.push(u8(lead));
      out.*.push(u8(tail));
    },
    (cp < i32(0x10000)) => {
      (lead : i32) = (i32(0xE0) | (cp >> i32(12)));
      (mid : i32) = (i32(0x80) | ((cp >> i32(6)) & i32(0x3F)));
      out.*.push(u8(lead));
      out.*.push(u8(mid));
      out.*.push(u8(tail));
    },
    true => {
      (lead : i32) = (i32(0xF0) | (cp >> i32(18)));
      (hi : i32) = (i32(0x80) | ((cp >> i32(12)) & i32(0x3F)));
      (mid : i32) = (i32(0x80) | ((cp >> i32(6)) & i32(0x3F)));
      out.*.push(u8(lead));
      out.*.push(u8(hi));
      out.*.push(u8(mid));
      out.*.push(u8(tail));
    }
  );
});
|
|
86
|
+
|
|
87
|
+
// Lowercase replacements that produce more than one codepoint.
//
// When `cp` is one of the codepoints handled here, its replacement is
// appended to `out` as UTF-8 bytes and the result is `true`. For any other
// codepoint nothing is written and the result is `false`, which tells the
// caller to apply its own single-codepoint mapping.
//
// NOTE(review): U+1E9E -> "ss" is the Unicode full case-folding result; the
// simple lowercase mapping for U+1E9E would be U+00DF. Presumably intentional
// - confirm against callers.
_special_to_lower :: (fn(cp: i32, out: *(ArrayList(u8))) -> bool)({
  cond(
    // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE -> "i" + U+0307
    (cp == i32(0x0130)) => {
      (small_i : u8) = u8(0x69);
      out.*.push(small_i);
      // U+0307 COMBINING DOT ABOVE
      _encode_utf8(i32(0x0307), out);
      true
    },
    // U+1E9E LATIN CAPITAL LETTER SHARP S -> "ss"
    (cp == i32(0x1E9E)) => {
      (small_s : u8) = u8(0x73);
      out.*.push(small_s);
      out.*.push(small_s);
      true
    },
    // Not a special case: nothing written
    true => false
  )
});
|
|
107
|
+
|
|
108
|
+
// Uppercase replacements that produce more than one codepoint.
//
// When `cp` is one of the codepoints handled here, its ASCII replacement is
// appended to `out` and the result is `true`. For any other codepoint nothing
// is written and the result is `false`, which tells the caller to apply its
// own single-codepoint mapping.
//
// Handled codepoints:
//   U+00DF -> "SS"
//   U+FB00 -> "FF",  U+FB01 -> "FI",  U+FB02 -> "FL"
//   U+FB03 -> "FFI", U+FB04 -> "FFL"
//   U+FB05 -> "ST",  U+FB06 -> "ST"
_special_to_upper :: (fn(cp: i32, out: *(ArrayList(u8))) -> bool)({
  cond(
    // U+00DF LATIN SMALL LETTER SHARP S
    (cp == i32(0x00DF)) => {
      (cap_s : u8) = u8(0x53);
      out.*.push(cap_s);
      out.*.push(cap_s);
      true
    },
    // U+FB00..U+FB04: every expansion starts with "F"; the rest depends on
    // the individual ligature.
    ((cp >= i32(0xFB00)) && (cp <= i32(0xFB04))) => {
      out.*.push(u8(0x46)); // F
      cond(
        (cp == i32(0xFB00)) => {
          out.*.push(u8(0x46)); // F
        },
        (cp == i32(0xFB01)) => {
          out.*.push(u8(0x49)); // I
        },
        (cp == i32(0xFB02)) => {
          out.*.push(u8(0x4C)); // L
        },
        (cp == i32(0xFB03)) => {
          out.*.push(u8(0x46)); // F
          out.*.push(u8(0x49)); // I
        },
        // Only U+FB04 remains in this range
        true => {
          out.*.push(u8(0x46)); // F
          out.*.push(u8(0x4C)); // L
        }
      );
      true
    },
    // U+FB05 and U+FB06 both expand to "ST"
    ((cp >= i32(0xFB05)) && (cp <= i32(0xFB06))) => {
      out.*.push(u8(0x53)); // S
      out.*.push(u8(0x54)); // T
      true
    },
    // Not a special case: nothing written
    true => false
  )
});
|
|
165
|
+
|
|
166
|
+
// Produce a lowercased copy of `input`.
//
// The input bytes are walked one UTF-8 sequence at a time:
//   - ASCII bytes are handled inline ('A'..'Z' are shifted to 'a'..'z',
//     everything else is copied through).
//   - Multi-byte sequences are decoded with `_decode_utf8`, then offered to
//     `_special_to_lower` (multi-codepoint expansions such as U+1E9E -> "ss").
//     If that declines, the codepoint is mapped with C's `towlower` and
//     re-encoded with `_encode_utf8`.
// NOTE(review): `towlower` results depend on the C library and its active
// locale - confirm coverage on the targets that matter.
unicode_to_lowercase :: (fn(input: String) -> String)({
  (src : ArrayList(u8)) = input.as_bytes();
  (total : usize) = src.len();
  (buf : ArrayList(u8)) = ArrayList(u8).with_capacity(total);
  (pos : usize) = usize(0);

  while (pos < total), {
    (lead : i32) = i32(src.get(pos).unwrap());
    // Bytes to advance by; overwritten below for multi-byte sequences.
    (step : usize) = usize(1);

    if(((lead & i32(0x80)) == i32(0)), {
      // ASCII: only 'A'..'Z' change
      (mapped : i32) = lead;
      if(((lead >= i32(0x41)) && (lead <= i32(0x5A))), {
        mapped = (lead + i32(0x20));
      });
      buf.push(u8(mapped));
    }, {
      // Non-ASCII: decode, then special table first, towlower otherwise
      (decoded : _DecodeResult) = _decode_utf8(src, pos);
      (handled : bool) = _special_to_lower(decoded.codepoint, (&buf));
      step = decoded.bytes_consumed;
      if(!(handled), {
        (mapped_cp : i32) = i32(towlower(decoded.codepoint));
        _encode_utf8(mapped_cp, (&buf));
      });
    });

    pos = (pos + step);
  };

  String.from_bytes(buf)
});
|
|
203
|
+
|
|
204
|
+
// Produce an uppercased copy of `input`.
//
// The input bytes are walked one UTF-8 sequence at a time:
//   - ASCII bytes are handled inline ('a'..'z' are shifted to 'A'..'Z',
//     everything else is copied through).
//   - Multi-byte sequences are decoded with `_decode_utf8`, then offered to
//     `_special_to_upper` (multi-character expansions such as U+00DF -> "SS"
//     and the U+FB00..U+FB06 ligatures). If that declines, the codepoint is
//     mapped with C's `towupper` and re-encoded with `_encode_utf8`.
// NOTE(review): `towupper` results depend on the C library and its active
// locale - confirm coverage on the targets that matter.
unicode_to_uppercase :: (fn(input: String) -> String)({
  (src : ArrayList(u8)) = input.as_bytes();
  (total : usize) = src.len();
  (buf : ArrayList(u8)) = ArrayList(u8).with_capacity(total);
  (pos : usize) = usize(0);

  while (pos < total), {
    (lead : i32) = i32(src.get(pos).unwrap());
    // Bytes to advance by; overwritten below for multi-byte sequences.
    (step : usize) = usize(1);

    if(((lead & i32(0x80)) == i32(0)), {
      // ASCII: only 'a'..'z' change
      (mapped : i32) = lead;
      if(((lead >= i32(0x61)) && (lead <= i32(0x7A))), {
        mapped = (lead - i32(0x20));
      });
      buf.push(u8(mapped));
    }, {
      // Non-ASCII: decode, then special table first, towupper otherwise
      (decoded : _DecodeResult) = _decode_utf8(src, pos);
      (handled : bool) = _special_to_upper(decoded.codepoint, (&buf));
      step = decoded.bytes_consumed;
      if(!(handled), {
        (mapped_cp : i32) = i32(towupper(decoded.codepoint));
        _encode_utf8(mapped_cp, (&buf));
      });
    });

    pos = (pos + step);
  };

  String.from_bytes(buf)
});
|
|
241
|
+
|
|
242
|
+
export unicode_to_lowercase, unicode_to_uppercase;
|