@shd101wyy/yo 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -6
- package/out/cjs/index.cjs +691 -636
- package/out/cjs/yo-cli.cjs +710 -653
- package/out/esm/index.mjs +649 -594
- package/out/types/src/build-runner.d.ts +1 -1
- package/out/types/src/codegen/async/runtime-io-common.d.ts +2 -1
- package/out/types/src/codegen/async/runtime.d.ts +5 -1
- package/out/types/src/codegen/codegen-c.d.ts +2 -0
- package/out/types/src/codegen/functions/collection.d.ts +1 -1
- package/out/types/src/codegen/functions/context.d.ts +1 -0
- package/out/types/src/codegen/functions/generation.d.ts +10 -0
- package/out/types/src/codegen/utils/index.d.ts +4 -0
- package/out/types/src/env.d.ts +1 -0
- package/out/types/src/evaluator/builtins/build.d.ts +1 -0
- package/out/types/src/evaluator/builtins/comptime-index-fns.d.ts +17 -0
- package/out/types/src/evaluator/calls/index-trait.d.ts +17 -0
- package/out/types/src/evaluator/context.d.ts +19 -14
- package/out/types/src/evaluator/index.d.ts +3 -1
- package/out/types/src/evaluator/trait-checking.d.ts +1 -0
- package/out/types/src/evaluator/values/anonymous-module.d.ts +3 -2
- package/out/types/src/expr.d.ts +22 -1
- package/out/types/src/module-manager.d.ts +1 -0
- package/out/types/src/target.d.ts +1 -0
- package/out/types/src/value.d.ts +4 -1
- package/out/types/tsconfig.tsbuildinfo +1 -1
- package/package.json +1 -1
- package/std/build.yo +2 -1
- package/std/collections/array_list.yo +114 -26
- package/std/collections/btree_map.yo +13 -3
- package/std/collections/deque.yo +10 -0
- package/std/collections/hash_map.yo +15 -0
- package/std/collections/priority_queue.yo +5 -5
- package/std/encoding/html.yo +283 -0
- package/std/encoding/html_char_utils.yo +36 -0
- package/std/encoding/html_entities.yo +2262 -0
- package/std/encoding/punycode.yo +366 -0
- package/std/encoding/toml.yo +1 -1
- package/std/fmt/to_string.yo +5 -4
- package/std/glob/index.yo +2 -2
- package/std/libc/wctype.yo +55 -0
- package/std/path.yo +6 -6
- package/std/prelude.yo +826 -205
- package/std/process.yo +1 -1
- package/std/regex/compiler.yo +11 -11
- package/std/regex/index.yo +2 -4
- package/std/regex/parser.yo +69 -4
- package/std/regex/vm.yo +53 -46
- package/std/string/string.yo +1424 -1339
- package/std/string/unicode.yo +242 -0
- package/out/types/src/evaluator/calls/array.d.ts +0 -14
package/package.json
CHANGED
package/std/build.yo
CHANGED
|
@@ -57,6 +57,7 @@ CompilationTarget :: {
|
|
|
57
57
|
X86_64_Linux_Gnu: "x86_64-linux-gnu",
|
|
58
58
|
X86_64_Linux_Musl: "x86_64-linux-musl",
|
|
59
59
|
Aarch64_Linux_Gnu: "aarch64-linux-gnu",
|
|
60
|
+
Aarch64_Linux_Musl: "aarch64-linux-musl",
|
|
60
61
|
Aarch64_Macos: "aarch64-macos",
|
|
61
62
|
X86_64_Macos: "x86_64-macos",
|
|
62
63
|
X86_64_Windows_Msvc: "x86_64-windows-msvc",
|
|
@@ -80,7 +81,7 @@ Executable :: struct(
|
|
|
80
81
|
root : comptime_string,
|
|
81
82
|
(target : comptime_string) ?= __yo_build_target_host(),
|
|
82
83
|
(optimize : Optimize) ?= Optimize.Debug,
|
|
83
|
-
(allocator : Allocator) ?= Allocator.
|
|
84
|
+
(allocator : Allocator) ?= Allocator.Libc,
|
|
84
85
|
(sanitize : Sanitize) ?= Sanitize.None
|
|
85
86
|
);
|
|
86
87
|
export Executable;
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* - RAII for automatic cleanup
|
|
9
9
|
*/
|
|
10
10
|
{ GlobalAllocator, AllocError } :: import "../allocator.yo";
|
|
11
|
-
{ memmove } :: import "../libc/string.yo";
|
|
11
|
+
{ memmove, memcpy, memset } :: import "../libc/string.yo";
|
|
12
12
|
{ malloc, calloc, realloc, free, aligned_alloc } :: GlobalAllocator;
|
|
13
13
|
|
|
14
14
|
/**
|
|
@@ -195,26 +195,6 @@ impl(forall(T : Type), ArrayList(T),
|
|
|
195
195
|
)
|
|
196
196
|
),
|
|
197
197
|
|
|
198
|
-
/**
|
|
199
|
-
* Set an element at a specific index (bounds checked)
|
|
200
|
-
* Returns Ok(()) on success, or Error with bounds information
|
|
201
|
-
*/
|
|
202
|
-
set : (fn(self: Self, index: usize, value: T) -> Result(unit, ArrayListError))(
|
|
203
|
-
cond(
|
|
204
|
-
(index >= self._length) =>
|
|
205
|
-
.Err(.IndexOutOfBounds(index: index, length: self._length)),
|
|
206
|
-
true =>
|
|
207
|
-
match(self._ptr,
|
|
208
|
-
.None => panic("ArrayList has length but no ptr"),
|
|
209
|
-
.Some(_ptr) => {
|
|
210
|
-
target_ptr := (_ptr &+ index);
|
|
211
|
-
target_ptr.* = value;
|
|
212
|
-
.Ok(())
|
|
213
|
-
}
|
|
214
|
-
)
|
|
215
|
-
)
|
|
216
|
-
),
|
|
217
|
-
|
|
218
198
|
/**
|
|
219
199
|
* Shrink the capacity to match the current length
|
|
220
200
|
* This reduces memory usage but may cause reallocation on next push
|
|
@@ -262,11 +242,13 @@ impl(forall(T : Type), ArrayList(T),
|
|
|
262
242
|
_free_elements : (fn(self : Self) -> unit)(
|
|
263
243
|
cond(
|
|
264
244
|
Type.contains_rc_type(T) => {
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
245
|
+
if((self._length > usize(0)), {
|
|
246
|
+
i := usize(0);
|
|
247
|
+
base_ptr := self._ptr.unwrap();
|
|
248
|
+
while(i < self._length, i = (i + usize(1)), {
|
|
249
|
+
element_ptr := (base_ptr &+ i);
|
|
250
|
+
unsafe.drop(element_ptr.*);
|
|
251
|
+
});
|
|
270
252
|
});
|
|
271
253
|
},
|
|
272
254
|
true => ()
|
|
@@ -415,14 +397,120 @@ impl(forall(T : Type), ArrayList(T),
|
|
|
415
397
|
)
|
|
416
398
|
),
|
|
417
399
|
|
|
400
|
+
/**
 * Ensure the ArrayList can hold at least `min_cap` total elements
 * without further reallocation.
 *
 * - Does nothing when `min_cap` is already within the current capacity.
 * - An empty (capacity 0) list is sized to exactly `min_cap`.
 * - Otherwise the capacity is doubled until it reaches `min_cap`.
 *
 * Panics if the requested size cannot be represented in `usize` or if the
 * allocator returns no memory.
 */
ensure_total_capacity : (fn(self: Self, min_cap: usize) -> unit)({
  cond(
    (min_cap <= self._capacity) => (),
    true => {
      new_capacity := cond(
        (self._capacity == usize(0)) => min_cap,
        true => {
          cap := self._capacity;
          while (cap < min_cap), {
            doubled := (cap * usize(2));
            // If doubling wrapped around (no forward progress), settle for the
            // exact request instead of looping forever on a wrapped value.
            cap = cond(
              (doubled <= cap) => min_cap,
              true => doubled
            );
          };
          cap
        }
      );
      // new_capacity >= min_cap >= 1 here, so the division below is safe.
      byte_size := (sizeof(T) * new_capacity);
      if(((byte_size / new_capacity) != sizeof(T)), {
        // The byte count wrapped; allocating it would yield an undersized buffer.
        panic("ArrayList.ensure_total_capacity: capacity overflow");
      });
      new_some_ptr := match(self._ptr,
        .None => GlobalAllocator.malloc(byte_size),
        .Some(old_ptr) => GlobalAllocator.realloc(
          .Some((*(void))(old_ptr)),
          byte_size
        )
      );
      match(new_some_ptr,
        .Some(new_ptr) => {
          self._ptr = .Some((*(T))(new_ptr));
          self._capacity = new_capacity;
        },
        .None => panic("ArrayList.ensure_total_capacity: allocation failed")
      );
    }
  );
}),
|
|
435
|
+
|
|
436
|
+
/**
 * Bulk-append `count` elements read from `src`.
 *
 * The elements are copied bytewise with memcpy after the capacity has been
 * grown to fit them. The caller must guarantee that `src` points to at least
 * `count` valid elements. A `count` of zero is a no-op.
 *
 * NOTE(review): growing may reallocate the buffer, so `src` presumably must
 * not point into this list's own storage — confirm against callers.
 */
extend_from_ptr : (fn(self: Self, src: *(T), count: usize) -> unit)({
  if((count != usize(0)), {
    self.ensure_total_capacity((self._length + count));
    match(self._ptr,
      .None => panic("ArrayList.extend_from_ptr: no ptr after ensure_total_capacity"),
      .Some(base) => {
        // First free slot, directly after the current last element.
        tail := (*(void))((base &+ self._length));
        _ := memcpy(tail, (*(void))(src), (count * sizeof(T)));
        self._length = (self._length + count);
      }
    );
  });
}),
|
|
456
|
+
|
|
418
457
|
/**
 * Remove every element while leaving the allocated capacity untouched.
 * Elements are released through `_free_elements` before the length is
 * reset to zero.
 */
clear : (fn(self: Self) -> unit)({
  Self._free_elements(self);
  self._length = usize(0);
}),
|
|
464
|
+
|
|
465
|
+
/**
 * Overwrite the storage of all current elements with one byte value.
 *
 * Issues a single memset over `_length * sizeof(T)` bytes, which makes it a
 * convenient way to zero bool/integer arrays. A list without a buffer is
 * left untouched.
 * Only safe for types without RC (e.g., bool, u8, usize).
 */
fill_with_byte : (fn(self: Self, byte_val: int) -> unit)(
  match(self._ptr,
    .Some(base) => {
      byte_count := (self._length * sizeof(T));
      _ := memset((*(void))(base), byte_val, byte_count);
    },
    .None => ()
  )
),
|
|
478
|
+
|
|
479
|
+
/**
 * Set the length of the ArrayList to exactly `new_len`.
 *
 * - Shrinking (or keeping the length) only updates the length field; no
 *   destructors run for the elements that fall off the end.
 * - Growing reserves capacity and memsets the newly exposed slots with
 *   `byte_val`.
 *
 * Only safe for trivial types (bool, u8, usize, etc.).
 */
resize_with_byte : (fn(self: Self, new_len: usize, byte_val: int) -> unit)({
  if((new_len <= self._length), {
    self._length = new_len;
  }, {
    self.ensure_total_capacity(new_len);
    match(self._ptr,
      .None => panic("ArrayList.resize_with_byte: no ptr after ensure"),
      .Some(base) => {
        old_len := self._length;
        // Fill only the slots between the old and the new length.
        tail := (*(void))((base &+ old_len));
        _ := memset(tail, byte_val, ((new_len - old_len) * sizeof(T)));
        self._length = new_len;
      }
    );
  });
})
|
|
425
503
|
);
|
|
504
|
+
// Index(usize) for ArrayList: yields a pointer to the element at `idx`.
// Asserts that `idx` is within the current length and panics if the list
// has no backing buffer.
impl(forall(T : Type), ArrayList(T), Index(usize)(
  Output : T,
  index : (fn(self: *(Self), idx: usize) -> *(Self.Output))({
    assert((idx < self.*._length), "ArrayList: index out of bounds");
    match(self.*._ptr,
      .None => panic("ArrayList: index on empty list"),
      .Some(base) => (base &+ idx)
    )
  })
));
|
|
426
514
|
impl(forall(T : Type), ArrayList(T), Dispose(
|
|
427
515
|
/**
|
|
428
516
|
* RAII destructor - automatically called when ArrayList goes out of scope
|
|
@@ -86,7 +86,7 @@ impl(forall(K : Type, V : Type), BTreeMap(K, V),
|
|
|
86
86
|
cond(
|
|
87
87
|
r.found => {
|
|
88
88
|
entry := self._entries.get(r.idx).unwrap();
|
|
89
|
-
self._entries
|
|
89
|
+
&(self._entries(r.idx)).* = BTreeEntry(K, V)(key: entry.key, value: v);
|
|
90
90
|
},
|
|
91
91
|
true => {
|
|
92
92
|
// Append new entry then bubble left to sorted position
|
|
@@ -95,8 +95,8 @@ impl(forall(K : Type, V : Type), BTreeMap(K, V),
|
|
|
95
95
|
while (pos > r.idx), (pos = (pos - usize(1))), {
|
|
96
96
|
curr := self._entries.get(pos).unwrap();
|
|
97
97
|
prev := self._entries.get((pos - usize(1))).unwrap();
|
|
98
|
-
self._entries
|
|
99
|
-
self._entries
|
|
98
|
+
&(self._entries(pos)).* = prev;
|
|
99
|
+
&(self._entries((pos - usize(1)))).* = curr;
|
|
100
100
|
};
|
|
101
101
|
}
|
|
102
102
|
);
|
|
@@ -246,6 +246,16 @@ impl(forall(K : Type, V : Type), BTreeMap(K, V),
|
|
|
246
246
|
)
|
|
247
247
|
);
|
|
248
248
|
|
|
249
|
+
// Index(K) for BTreeMap: yields a pointer to the value stored under `idx`.
// The key is located with `_find`; a missing key fails the assertion.
impl(forall(K : Type, V : Type), BTreeMap(K, V), Index(K)(
  Output : V,
  index : (fn(self: *(Self), idx: K, where(K <: Ord(K))) -> *(Self.Output))({
    lookup := self.*._find(idx);
    assert(lookup.found, "BTreeMap: key not found");
    slot := &(self.*._entries(lookup.idx));
    &(slot.*.value)
  })
));
|
|
258
|
+
|
|
249
259
|
export
|
|
250
260
|
BTreeMap,
|
|
251
261
|
BTreeMapIter,
|
package/std/collections/deque.yo
CHANGED
|
@@ -250,6 +250,16 @@ impl(forall(T : Type), Deque(T),
|
|
|
250
250
|
)
|
|
251
251
|
);
|
|
252
252
|
|
|
253
|
+
// Index(usize) for Deque: yields a pointer to the element at logical
// position `idx`. The logical position is offset by `_head` and wrapped
// modulo `_capacity` to find the slot in the buffer.
impl(forall(T : Type), Deque(T), Index(usize)(
  Output : T,
  index : (fn(self: *(Self), idx: usize) -> *(Self.Output))({
    assert((idx < self.*._len), "Deque: index out of bounds");
    storage := self.*._buf.unwrap();
    slot := ((self.*._head + idx) % self.*._capacity);
    (storage &+ slot)
  })
));
|
|
262
|
+
|
|
253
263
|
export
|
|
254
264
|
Deque,
|
|
255
265
|
DequeIter,
|
|
@@ -697,6 +697,21 @@ impl(forall(K : Type, V : Type), where(K <: (Eq(K), Hash)), HashMap(K, V),
|
|
|
697
697
|
)
|
|
698
698
|
);
|
|
699
699
|
|
|
700
|
+
// Index(K) for HashMap: yields a pointer to the value stored under `idx`.
// The bucket is located via `_find_bucket`; a missing key panics.
impl(forall(K : Type, V : Type), HashMap(K, V), Index(K)(
  Output : V,
  index : (fn(self: *(Self), idx: K, where(K <: (Eq(K), Hash))) -> *(Self.Output))({
    key_hash := idx.hash();
    match(Self._find_bucket(self.*, idx, key_hash),
      .None => panic("HashMap: key not found"),
      .Some(bucket) => {
        entries := Self._data_ptr(self.*);
        &((entries &+ bucket).*.value)
      }
    )
  })
));
|
|
714
|
+
|
|
700
715
|
export
|
|
701
716
|
HashMap,
|
|
702
717
|
HashMapError,
|
|
@@ -59,8 +59,8 @@ impl(forall(T : Type), PriorityQueue(T),
|
|
|
59
59
|
parent_val := self._data.get(parent).unwrap();
|
|
60
60
|
cond(
|
|
61
61
|
(child_val < parent_val) => {
|
|
62
|
-
self._data
|
|
63
|
-
self._data
|
|
62
|
+
&(self._data(i)).* = parent_val;
|
|
63
|
+
&(self._data(parent)).* = child_val;
|
|
64
64
|
i = parent;
|
|
65
65
|
},
|
|
66
66
|
true => {
|
|
@@ -78,7 +78,7 @@ impl(forall(T : Type), PriorityQueue(T),
|
|
|
78
78
|
true => {
|
|
79
79
|
top := self._data.get(usize(0)).unwrap();
|
|
80
80
|
last := self._data.get((n - usize(1))).unwrap();
|
|
81
|
-
self._data
|
|
81
|
+
&(self._data(usize(0))).* = last;
|
|
82
82
|
self._data.pop();
|
|
83
83
|
// Sift down
|
|
84
84
|
i := usize(0);
|
|
@@ -106,8 +106,8 @@ impl(forall(T : Type), PriorityQueue(T),
|
|
|
106
106
|
true => {
|
|
107
107
|
a := self._data.get(i).unwrap();
|
|
108
108
|
b := self._data.get(smallest).unwrap();
|
|
109
|
-
self._data
|
|
110
|
-
self._data
|
|
109
|
+
&(self._data(i)).* = b;
|
|
110
|
+
&(self._data(smallest)).* = a;
|
|
111
111
|
i = smallest;
|
|
112
112
|
}
|
|
113
113
|
);
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
// HTML entity decoding
|
|
2
|
+
//
|
|
3
|
+
// Decodes named (&amp;amp;), decimal (&amp;#38;), and hex (&amp;#x26;) HTML character references.
|
|
4
|
+
// Uses Legacy mode — entities without trailing semicolon are also decoded.
|
|
5
|
+
//
|
|
6
|
+
// Example:
|
|
7
|
+
// { decode_html } :: import "std/encoding/html";
|
|
8
|
+
//
|
|
9
|
+
// result := decode_html(`&amp;amp; &amp;lt; &amp;#38; &amp;#x26;`);
|
|
10
|
+
// assert((result == `& < & &`), "decoded entities");
|
|
11
|
+
|
|
12
|
+
open import "../string";
|
|
13
|
+
{ HashMap } :: import "../collections/hash_map";
|
|
14
|
+
{ HashSet } :: import "../collections/hash_set";
|
|
15
|
+
{ is_valid_entity_code, from_code_point } :: import "./html_char_utils";
|
|
16
|
+
{ _build_entity_map, _build_legacy_set } :: import "./html_entities";
|
|
17
|
+
|
|
18
|
+
// Module-level lookup tables used by decode_html. They start out empty and
// are populated the first time _ensure_init runs.
_state_initialized := false;
_entity_map := HashMap(String, String).new();
_legacy_set := HashSet(String).new();

// Build the entity map and the legacy set once; later calls do nothing.
_ensure_init :: (fn() -> unit)({
  cond(
    _state_initialized => (),
    true => {
      _entity_map = _build_entity_map();
      _legacy_set = _build_legacy_set();
      _state_initialized = true;
    }
  );
});
|
|
30
|
+
|
|
31
|
+
// Parse a string of hexadecimal digits into an i32.
//
// Each character contributes one base-16 digit (0-9, a-f, A-F); any other
// character contributes 0, so callers are expected to pass digits only.
//
// Accumulation stops once the value exceeds the Unicode maximum (0x10FFFF).
// The result then stays above that limit, so very long digit runs are
// reported as out of range instead of overflowing i32 and possibly wrapping
// back into a valid code point.
_parse_hex :: (fn(s: String) -> i32)({
  (result : i32) = i32(0);
  (i : usize) = usize(0);
  while ((i < s.len())), {
    c := s.at(i).unwrap();
    // 0x10FFFF * 16 + 15 still fits in i32, so this guard rules out overflow.
    if((result <= i32(0x10FFFF)), {
      digit := cond(
        ((c >= rune(u32('0'))) && (c <= rune(u32('9')))) => (i32(c.to_u32()) - i32(48)),
        ((c >= rune(u32('a'))) && (c <= rune(u32('f')))) => ((i32(c.to_u32()) - i32(97)) + i32(10)),
        ((c >= rune(u32('A'))) && (c <= rune(u32('F')))) => ((i32(c.to_u32()) - i32(65)) + i32(10)),
        true => i32(0)
      );
      result = ((result * i32(16)) + digit);
    });
    i = (i + usize(1));
  };
  result
});
|
|
49
|
+
|
|
50
|
+
// Parse a string of decimal digits into an i32.
//
// Every character is treated as a decimal digit (its code minus '0'), so
// callers are expected to pass digits only.
//
// Accumulation stops once the value exceeds the Unicode maximum (0x10FFFF).
// The result then stays above that limit, so very long digit runs are
// reported as out of range instead of overflowing i32 and possibly wrapping
// back into a valid code point.
_parse_dec :: (fn(s: String) -> i32)({
  (result : i32) = i32(0);
  (i : usize) = usize(0);
  while ((i < s.len())), {
    c := s.at(i).unwrap();
    // 0x10FFFF * 10 + 9 still fits in i32, so this guard rules out overflow.
    if((result <= i32(0x10FFFF)), {
      result = ((result * i32(10)) + (i32(c.to_u32()) - i32(48)));
    });
    i = (i + usize(1));
  };
  result
});
|
|
61
|
+
|
|
62
|
+
// True when `c` is an ASCII letter (a-z, A-Z) or an ASCII digit (0-9).
_is_alpha_numeric :: (fn(c: rune) -> bool)(
  cond(
    ((c >= rune(u32('a'))) && (c <= rune(u32('z')))) => true,
    ((c >= rune(u32('A'))) && (c <= rune(u32('Z')))) => true,
    ((c >= rune(u32('0'))) && (c <= rune(u32('9')))) => true,
    true => false
  )
);
|
|
66
|
+
|
|
67
|
+
// True when `c` is an ASCII decimal digit ('0'..'9').
_is_dec_digit :: (fn(c: rune) -> bool)(
  ((c >= rune(u32('0'))) && (c <= rune(u32('9'))))
);

// True when `c` is an ASCII hexadecimal digit (0-9, a-f, A-F).
_is_hex_digit :: (fn(c: rune) -> bool)(
  ((_is_dec_digit(c) || ((c >= rune(u32('a'))) && (c <= rune(u32('f'))))) || ((c >= rune(u32('A'))) && (c <= rune(u32('F')))))
);

// Decode HTML entities in a string (Legacy mode — entities without ; are also decoded).
//
// Handles three kinds of character references:
//   - hexadecimal  (&#x followed by hex digits)
//   - decimal      (&# followed by decimal digits)
//   - named        (& followed by letters/digits)
// A trailing ';' is consumed when present. Numeric references whose code is
// rejected by is_valid_entity_code are copied through unchanged, as are named
// references that are not found in the entity map.
//
// Returns `input` itself when it is empty or contains no '&'.
//
// The scanning loops rely on `&&` not evaluating its right-hand side once the
// bounds check fails; the semicolon checks already depended on this.
decode_html :: (fn(input: String) -> String)({
  _ensure_init();

  (len : usize) = input.len();
  if((len == usize(0)), {
    return input;
  });

  // Fast path: without '&' there is nothing to decode.
  if(!(input.contains(`&`)), {
    return input;
  });

  (result : String) = ``;
  (i : usize) = usize(0);

  while ((i < len)), {
    c := input.at(i).unwrap();

    if((c != rune(u32('&'))), {
      // Ordinary character: copy it through.
      result = `${result}${from_code_point(i32(c.to_u32()))}`;
      i = (i + usize(1));
    }, {
      // '&' found: remember where the candidate reference starts.
      (start : usize) = i;
      i = (i + usize(1));

      if((i >= len), {
        // '&' was the last character of the input.
        result = `${result}&`;
      }, {
        next := input.at(i).unwrap();

        if((next == rune(u32('#'))), {
          // Numeric reference: &#N; or &#xN;
          i = (i + usize(1));
          if((i >= len), {
            result = `${result}&#`;
          }, {
            hex_char := input.at(i).unwrap();
            if(((hex_char == rune(u32('x'))) || (hex_char == rune(u32('X')))), {
              // Hex: &#xHH;
              (digit_start : usize) = (i + usize(1));
              (digit_end : usize) = digit_start;
              while (((digit_end < len) && _is_hex_digit(input.at(digit_end).unwrap()))), {
                digit_end = (digit_end + usize(1));
              };

              if((digit_end > digit_start), {
                hex_str := input.substring(digit_start, digit_end);
                (code : i32) = _parse_hex(hex_str);

                // Consume the terminating semicolon when there is one.
                if(((digit_end < len) && (input.at(digit_end).unwrap() == rune(u32(';')))), {
                  i = (digit_end + usize(1));
                }, {
                  i = digit_end;
                });

                if(is_valid_entity_code(code), {
                  result = `${result}${from_code_point(code)}`;
                }, {
                  // Invalid code (e.g., surrogates) — keep original entity text
                  (orig_hex : String) = input.substring(start, i);
                  result = `${result}${orig_hex}`;
                });
              }, {
                // No hex digits — output "&#" plus the x/X literally
                result = `${result}&#${from_code_point(i32(hex_char.to_u32()))}`;
                i = (i + usize(1));
              });
            }, {
              // Decimal: &#DD;
              (digit_start : usize) = i;
              (digit_end : usize) = digit_start;
              while (((digit_end < len) && _is_dec_digit(input.at(digit_end).unwrap()))), {
                digit_end = (digit_end + usize(1));
              };

              if((digit_end > digit_start), {
                dec_str := input.substring(digit_start, digit_end);
                (code : i32) = _parse_dec(dec_str);

                // Consume the terminating semicolon when there is one.
                if(((digit_end < len) && (input.at(digit_end).unwrap() == rune(u32(';')))), {
                  i = (digit_end + usize(1));
                }, {
                  i = digit_end;
                });

                if(is_valid_entity_code(code), {
                  result = `${result}${from_code_point(code)}`;
                }, {
                  // Invalid code (e.g., surrogates) — keep original entity text
                  (orig_dec : String) = input.substring(start, i);
                  result = `${result}${orig_dec}`;
                });
              }, {
                // No decimal digits: emit "&#" and resume at the current character.
                result = `${result}&#`;
              });
            });
          });
        }, {
          // Named entity: &name; or &name (legacy)
          (name_start : usize) = i;
          (name_end : usize) = name_start;
          // The name is the longest run of ASCII letters/digits after '&'.
          while (((name_end < len) && _is_alpha_numeric(input.at(name_end).unwrap()))), {
            name_end = (name_end + usize(1));
          };

          name_str := input.substring(name_start, name_end);
          (has_semi : bool) = ((name_end < len) && (input.at(name_end).unwrap() == rune(u32(';'))));

          if(has_semi, {
            // Terminated by ';': only an exact match of the whole name counts.
            match(_entity_map.get(name_str),
              .Some(decoded) => {
                result = `${result}${decoded}`;
                i = (name_end + usize(1));
              },
              .None => {
                // Unknown entity — output literally
                result = `${result}&${name_str};`;
                i = (name_end + usize(1));
              }
            );
          }, {
            // Legacy mode: try progressively shorter names
            (matched : bool) = false;
            (try_end : usize) = name_end;

            while (((try_end > name_start) && !(matched))), {
              try_name := input.substring(name_start, try_end);
              if(_legacy_set.contains(try_name), {
                match(_entity_map.get(try_name),
                  .Some(decoded) => {
                    result = `${result}${decoded}`;
                    i = try_end;
                    matched = true;
                  },
                  .None => {
                    try_end = (try_end - usize(1));
                  }
                );
              }, {
                try_end = (try_end - usize(1));
              });
            };

            if(!(matched), {
              // No legacy match — output '&' literally and continue after it
              result = `${result}&`;
              i = name_start;
            });
          });
        });
      });
    });
  };

  result
});
|
|
282
|
+
|
|
283
|
+
export decode_html, is_valid_entity_code, from_code_point;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// HTML character utility functions
|
|
2
|
+
//
|
|
3
|
+
// Provides Unicode codepoint validation and conversion for HTML entity processing.
|
|
4
|
+
//
|
|
5
|
+
// Example:
|
|
6
|
+
// { is_valid_entity_code, from_code_point } :: import "std/encoding/html_char_utils";
|
|
7
|
+
//
|
|
8
|
+
// assert(is_valid_entity_code(i32(65)), "A is valid");
|
|
9
|
+
// s := from_code_point(i32(65)); // "A"
|
|
10
|
+
|
|
11
|
+
open import "../string";
|
|
12
|
+
|
|
13
|
+
// Check if a Unicode codepoint is a valid HTML entity value.
//
// Returns false for:
//   - negative values (not a code point at all)
//   - surrogates                     0xD800..0xDFFF
//   - the non-character block        0xFDD0..0xFDEF
//   - any value whose low 16 bits are 0xFFFE or 0xFFFF
//   - control codes                  0x00..0x08, 0x0B, 0x0E..0x1F, 0x7F..0x9F
//   - values above                   0x10FFFF
// Everything else is accepted.
is_valid_entity_code :: (fn(c: i32) -> bool)(
  cond(
    // Without this arm a negative input could fall through to `true`.
    (c < i32(0)) => false,
    ((c >= i32(0xD800)) && (c <= i32(0xDFFF))) => false,
    ((c >= i32(0xFDD0)) && (c <= i32(0xFDEF))) => false,
    (((c & i32(0xFFFF)) == i32(0xFFFF)) || ((c & i32(0xFFFF)) == i32(0xFFFE))) => false,
    ((c >= i32(0x00)) && (c <= i32(0x08))) => false,
    (c == i32(0x0B)) => false,
    ((c >= i32(0x0E)) && (c <= i32(0x1F))) => false,
    ((c >= i32(0x7F)) && (c <= i32(0x9F))) => false,
    (c > i32(0x10FFFF)) => false,
    true => true
  )
);
|
|
27
|
+
|
|
28
|
+
// Build a String from a single Unicode codepoint: the value is cast to a
// rune and interpolated into a template string.
from_code_point :: (fn(c: i32) -> String)({
  (ch : rune) = rune(u32(c));
  `${ch}`
});
|
|
35
|
+
|
|
36
|
+
export is_valid_entity_code, from_code_point;
|