@shd101wyy/yo 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +7 -6
  2. package/out/cjs/index.cjs +546 -535
  3. package/out/cjs/yo-cli.cjs +636 -623
  4. package/out/esm/index.mjs +478 -467
  5. package/out/types/src/codegen/codegen-c.d.ts +2 -0
  6. package/out/types/src/codegen/functions/context.d.ts +1 -0
  7. package/out/types/src/codegen/functions/generation.d.ts +10 -0
  8. package/out/types/src/codegen/utils/index.d.ts +1 -0
  9. package/out/types/src/env.d.ts +12 -1
  10. package/out/types/src/evaluator/builtins/build.d.ts +1 -0
  11. package/out/types/src/evaluator/context.d.ts +1 -0
  12. package/out/types/src/evaluator/types/enum.d.ts +1 -1
  13. package/out/types/src/evaluator/types/synthesizer.d.ts +6 -1
  14. package/out/types/src/expr.d.ts +2 -0
  15. package/out/types/src/target.d.ts +1 -0
  16. package/out/types/src/types/compatibility.d.ts +1 -1
  17. package/out/types/src/types/creators.d.ts +2 -1
  18. package/out/types/src/types/definitions.d.ts +11 -0
  19. package/out/types/src/types/guards.d.ts +2 -1
  20. package/out/types/src/types/tags.d.ts +2 -1
  21. package/out/types/src/value.d.ts +2 -1
  22. package/out/types/tsconfig.tsbuildinfo +1 -1
  23. package/package.json +1 -1
  24. package/std/build.yo +2 -1
  25. package/std/collections/array_list.yo +133 -1
  26. package/std/encoding/html.yo +283 -0
  27. package/std/encoding/html_char_utils.yo +36 -0
  28. package/std/encoding/html_entities.yo +2262 -0
  29. package/std/encoding/punycode.yo +366 -0
  30. package/std/fmt/to_string.yo +5 -4
  31. package/std/glob/index.yo +2 -2
  32. package/std/libc/wctype.yo +55 -0
  33. package/std/path.yo +6 -6
  34. package/std/prelude.yo +193 -7
  35. package/std/regex/parser.yo +69 -4
  36. package/std/regex/vm.yo +18 -31
  37. package/std/string/string.yo +1388 -1337
  38. package/std/string/unicode.yo +242 -0
@@ -0,0 +1,242 @@
1
+ // Unicode-aware case conversion
2
+ //
3
+ // Provides Unicode-aware lowercase/uppercase conversion using C's towlower/towupper,
4
+ // plus hand-coded tables for special case folding entries where one codepoint
5
+ // maps to multiple codepoints.
6
+ //
7
+ // Example:
8
+ // { unicode_to_lowercase, unicode_to_uppercase } :: import "std/string/unicode";
9
+ //
10
+ // lower := unicode_to_lowercase(`HELLO WÖRLD`); // "hello wörld"
11
+ // upper := unicode_to_uppercase(`hello wörld`); // "HELLO WÖRLD"
12
+
13
+ open import "../string";
14
+ { ArrayList } :: import "../collections/array_list";
15
+
16
+ // Declare towlower/towupper directly with i32 parameters and results to avoid wint_t cast issues.
17
+ // NOTE(review): this assumes wint_t is ABI-compatible with a 32-bit int on every supported target — confirm.
18
+ c_include "<wctype.h>",
19
+ towlower :
20
+ fn(wc : i32) -> i32,
21
+ towupper :
22
+ fn(wc : i32) -> i32
23
+ ;
24
+
25
+ // Result of decoding one UTF-8 sequence: the decoded codepoint and the number of input bytes it occupied.
26
+ _DecodeResult :: struct(
27
+ codepoint : i32,
28
+ bytes_consumed : usize
29
+ );
30
+
31
+ // Decode the UTF-8 sequence that starts at bytes[i]. The high bits of the lead byte select the sequence length; continuation bytes are masked with 0x3F but never validated.
32
+ _decode_utf8 :: (fn(bytes: ArrayList(u8), i: usize) -> _DecodeResult)({
33
+ (b0 : i32) = i32(bytes.get(i).unwrap()); // lead byte; the result of get is unwrapped without a bounds check
34
+ cond(
35
+ // 1-byte ASCII: high bit clear (0xxxxxxx), the byte is the codepoint
36
+ ((b0 & i32(0x80)) == i32(0)) => {
37
+ _DecodeResult(codepoint: b0, bytes_consumed: usize(1))
38
+ },
39
+ // 2-byte: lead byte 110xxxxx contributes 5 bits, the continuation byte 6
40
+ ((b0 & i32(0xE0)) == i32(0xC0)) => {
41
+ (b1 : i32) = i32(bytes.get((i + usize(1))).unwrap());
42
+ (cp : i32) = (((b0 & i32(0x1F)) << i32(6)) | (b1 & i32(0x3F)));
43
+ _DecodeResult(codepoint: cp, bytes_consumed: usize(2))
44
+ },
45
+ // 3-byte: lead byte 1110xxxx contributes 4 bits, each continuation byte 6
46
+ ((b0 & i32(0xF0)) == i32(0xE0)) => {
47
+ (b1 : i32) = i32(bytes.get((i + usize(1))).unwrap());
48
+ (b2 : i32) = i32(bytes.get((i + usize(2))).unwrap());
49
+ (cp : i32) = ((((b0 & i32(0x0F)) << i32(12)) | ((b1 & i32(0x3F)) << i32(6))) | (b2 & i32(0x3F)));
50
+ _DecodeResult(codepoint: cp, bytes_consumed: usize(3))
51
+ },
52
+ // 4-byte: catch-all branch, so every remaining lead byte (including a stray continuation byte) is decoded as a 4-byte sequence
53
+ true => {
54
+ (b1 : i32) = i32(bytes.get((i + usize(1))).unwrap());
55
+ (b2 : i32) = i32(bytes.get((i + usize(2))).unwrap());
56
+ (b3 : i32) = i32(bytes.get((i + usize(3))).unwrap());
57
+ (cp : i32) = (((((b0 & i32(0x07)) << i32(18)) | ((b1 & i32(0x3F)) << i32(12))) | ((b2 & i32(0x3F)) << i32(6))) | (b3 & i32(0x3F)));
58
+ _DecodeResult(codepoint: cp, bytes_consumed: usize(4))
59
+ }
60
+ )
61
+ });
62
+
63
+ // Append the UTF-8 encoding of cp (1 to 4 bytes, chosen by magnitude) to the ArrayList behind out. cp is not range-checked.
64
+ _encode_utf8 :: (fn(cp: i32, out: *(ArrayList(u8))) -> unit)({
65
+ cond(
66
+ (cp < i32(0x80)) => { // 1 byte: 0xxxxxxx
67
+ out.*.push(u8(cp));
68
+ },
69
+ (cp < i32(0x800)) => { // 2 bytes: 110xxxxx 10xxxxxx
70
+ out.*.push(u8((i32(0xC0) | (cp >> i32(6)))));
71
+ out.*.push(u8((i32(0x80) | (cp & i32(0x3F)))));
72
+ },
73
+ (cp < i32(0x10000)) => { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
74
+ out.*.push(u8((i32(0xE0) | (cp >> i32(12)))));
75
+ out.*.push(u8((i32(0x80) | ((cp >> i32(6)) & i32(0x3F)))));
76
+ out.*.push(u8((i32(0x80) | (cp & i32(0x3F)))));
77
+ },
78
+ true => { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (every cp >= 0x10000 lands here)
79
+ out.*.push(u8((i32(0xF0) | (cp >> i32(18)))));
80
+ out.*.push(u8((i32(0x80) | ((cp >> i32(12)) & i32(0x3F)))));
81
+ out.*.push(u8((i32(0x80) | ((cp >> i32(6)) & i32(0x3F)))));
82
+ out.*.push(u8((i32(0x80) | (cp & i32(0x3F)))));
83
+ }
84
+ );
85
+ });
86
+
87
+ // Hand-coded lowercase mappings that are emitted directly instead of going through towlower.
88
+ // Appends the UTF-8 bytes of the lowercase form to out and returns true when cp is listed; otherwise returns false and leaves out untouched.
89
+ _special_to_lower :: (fn(cp: i32, out: *(ArrayList(u8))) -> bool)({
90
+ cond(
91
+ // ẞ (U+1E9E LATIN CAPITAL LETTER SHARP S) → ß (U+00DF); "ss" is its case folding, not its lowercase mapping
92
+ (cp == i32(0x1E9E)) => {
93
+ out.*.push(u8(0xC3)); // ß, first UTF-8 byte
94
+ out.*.push(u8(0x9F)); // ß, second UTF-8 byte
95
+ true
96
+ },
97
+ // İ (U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE) → i + combining dot above
98
+ (cp == i32(0x0130)) => {
99
+ out.*.push(u8(0x69)); // i
100
+ // U+0307 COMBINING DOT ABOVE
101
+ _encode_utf8(i32(0x0307), out);
102
+ true
103
+ },
104
+ true => false
105
+ )
106
+ });
107
+
108
+ // Hand-coded uppercase mappings for codepoints whose uppercase form is more than one codepoint.
109
+ // Appends the ASCII expansion to out and returns true when cp is listed; otherwise returns false and leaves out untouched.
110
+ _special_to_upper :: (fn(cp: i32, out: *(ArrayList(u8))) -> bool)({
111
+ cond(
112
+ // ß (U+00DF LATIN SMALL LETTER SHARP S) → SS
113
+ (cp == i32(0x00DF)) => {
114
+ out.*.push(u8(0x53)); // S
115
+ out.*.push(u8(0x53)); // S
116
+ true
117
+ },
118
+ // ff (U+FB00 LATIN SMALL LIGATURE FF) → FF
119
+ (cp == i32(0xFB00)) => {
120
+ out.*.push(u8(0x46)); // F
121
+ out.*.push(u8(0x46)); // F
122
+ true
123
+ },
124
+ // fi (U+FB01 LATIN SMALL LIGATURE FI) → FI
125
+ (cp == i32(0xFB01)) => {
126
+ out.*.push(u8(0x46)); // F
127
+ out.*.push(u8(0x49)); // I
128
+ true
129
+ },
130
+ // fl (U+FB02 LATIN SMALL LIGATURE FL) → FL
131
+ (cp == i32(0xFB02)) => {
132
+ out.*.push(u8(0x46)); // F
133
+ out.*.push(u8(0x4C)); // L
134
+ true
135
+ },
136
+ // ffi (U+FB03 LATIN SMALL LIGATURE FFI) → FFI
137
+ (cp == i32(0xFB03)) => {
138
+ out.*.push(u8(0x46)); // F
139
+ out.*.push(u8(0x46)); // F
140
+ out.*.push(u8(0x49)); // I
141
+ true
142
+ },
143
+ // ffl (U+FB04 LATIN SMALL LIGATURE FFL) → FFL
144
+ (cp == i32(0xFB04)) => {
145
+ out.*.push(u8(0x46)); // F
146
+ out.*.push(u8(0x46)); // F
147
+ out.*.push(u8(0x4C)); // L
148
+ true
149
+ },
150
+ // ſt (U+FB05 LATIN SMALL LIGATURE LONG S T) → ST
151
+ (cp == i32(0xFB05)) => {
152
+ out.*.push(u8(0x53)); // S
153
+ out.*.push(u8(0x54)); // T
154
+ true
155
+ },
156
+ // st (U+FB06 LATIN SMALL LIGATURE ST) → ST
157
+ (cp == i32(0xFB06)) => {
158
+ out.*.push(u8(0x53)); // S
159
+ out.*.push(u8(0x54)); // T
160
+ true
161
+ },
162
+ true => false
163
+ )
164
+ });
165
+
166
+ // Return a new String holding the lowercase form of input; input is only read through as_bytes().
167
+ // ASCII bytes are mapped inline (A-Z → a-z); every other codepoint is decoded and mapped via C's towlower,
168
+ // except those handled by _special_to_lower (e.g., İ → i + combining dot above).
169
+ unicode_to_lowercase :: (fn(input: String) -> String)({
170
+ (bytes : ArrayList(u8)) = input.as_bytes();
171
+ (out : ArrayList(u8)) = ArrayList(u8).with_capacity(bytes.len()); // initial capacity equals the input length
172
+ (i : usize) = usize(0);
173
+
174
+ while (i < bytes.len()), {
175
+ (b0 : i32) = i32(bytes.get(i).unwrap());
176
+ // Fast path for ASCII: high bit clear, so the byte is a complete codepoint
177
+ if(((b0 & i32(0x80)) == i32(0)), {
178
+ if(((b0 >= i32(0x41)) && (b0 <= i32(0x5A))), {
179
+ out.push(u8((b0 + i32(0x20))));
180
+ }, {
181
+ out.push(u8(b0));
182
+ });
183
+ i = (i + usize(1));
184
+ }, {
185
+ // Multi-byte UTF-8: decode the codepoint and learn how many bytes it spans
186
+ (result : _DecodeResult) = _decode_utf8(bytes, i);
187
+ (cp : i32) = result.codepoint;
188
+ (len : usize) = result.bytes_consumed;
189
+
190
+ // Try the hand-coded table first; it writes to out itself and returns true when it handled cp
191
+ if(!(_special_to_lower(cp, (&out))), {
192
+ // Otherwise use C towlower. NOTE(review): its mapping presumably depends on the active C locale — confirm
193
+ (lower : i32) = i32(towlower(cp));
194
+ _encode_utf8(lower, (&out));
195
+ });
196
+
197
+ i = (i + len);
198
+ });
199
+ };
200
+
201
+ String.from_bytes(out)
202
+ });
203
+
204
+ // Return a new String holding the uppercase form of input; input is only read through as_bytes().
205
+ // ASCII bytes are mapped inline (a-z → A-Z); every other codepoint is decoded and mapped via C's towupper,
206
+ // except the multi-character expansions handled by _special_to_upper (e.g., ß → SS, ligatures).
207
+ unicode_to_uppercase :: (fn(input: String) -> String)({
208
+ (bytes : ArrayList(u8)) = input.as_bytes();
209
+ (out : ArrayList(u8)) = ArrayList(u8).with_capacity(bytes.len()); // initial capacity equals the input length; expansions such as ß → SS can exceed it
210
+ (i : usize) = usize(0);
211
+
212
+ while (i < bytes.len()), {
213
+ (b0 : i32) = i32(bytes.get(i).unwrap());
214
+ // Fast path for ASCII: high bit clear, so the byte is a complete codepoint
215
+ if(((b0 & i32(0x80)) == i32(0)), {
216
+ if(((b0 >= i32(0x61)) && (b0 <= i32(0x7A))), {
217
+ out.push(u8((b0 - i32(0x20))));
218
+ }, {
219
+ out.push(u8(b0));
220
+ });
221
+ i = (i + usize(1));
222
+ }, {
223
+ // Multi-byte UTF-8: decode the codepoint and learn how many bytes it spans
224
+ (result : _DecodeResult) = _decode_utf8(bytes, i);
225
+ (cp : i32) = result.codepoint;
226
+ (len : usize) = result.bytes_consumed;
227
+
228
+ // Try the hand-coded table first; it writes to out itself and returns true when it handled cp
229
+ if(!(_special_to_upper(cp, (&out))), {
230
+ // Otherwise use C towupper. NOTE(review): its mapping presumably depends on the active C locale — confirm
231
+ (upper : i32) = i32(towupper(cp));
232
+ _encode_utf8(upper, (&out));
233
+ });
234
+
235
+ i = (i + len);
236
+ });
237
+ };
238
+
239
+ String.from_bytes(out)
240
+ });
241
+
242
+ export unicode_to_lowercase, unicode_to_uppercase;