discourse_ai-tokenizers 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c475199d403a36cf33668b1f64e030d2166034656e1c48760dfe3a6f18c3a394
|
|
4
|
+
data.tar.gz: 48e9daf8943ba37ad8ea14cd81c275c29352459dfd226b95b605dde403dee748
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fcf2e4fa7d73348f55bfc13a6de3eded0391fdca58d6629fc672dc6856eade5adf3f196902d6ffc5b9ef01079783c651f74fe89aad41fdf26573e866a5110120
|
|
7
|
+
data.tar.gz: 98e9dcdffa14456a090ff2f3a06189f79585d8c97c79714710a17fd62491416fe63f0c6204fd4be76c5b3348d591ab4c978fade5ae5a45870de03787d014078d
|
data/CHANGELOG.md
CHANGED
|
@@ -21,7 +21,7 @@ module DiscourseAi
|
|
|
21
21
|
end
|
|
22
22
|
|
|
23
23
|
def tokenize(text)
|
|
24
|
-
tokenizer.encode(text).tokens
|
|
24
|
+
tokenizer.encode(normalize_text(text)).tokens
|
|
25
25
|
end
|
|
26
26
|
|
|
27
27
|
def size(text)
|
|
@@ -32,38 +32,67 @@ module DiscourseAi
|
|
|
32
32
|
tokenizer.decode(token_ids)
|
|
33
33
|
end
|
|
34
34
|
|
|
35
|
-
def encode(
|
|
36
|
-
tokenizer.encode(
|
|
35
|
+
def encode(text)
|
|
36
|
+
tokenizer.encode(normalize_text(text)).ids
|
|
37
37
|
end
|
|
38
38
|
|
|
39
39
|
def truncate(text, max_length, strict: false)
|
|
40
40
|
return "" if max_length <= 0
|
|
41
41
|
|
|
42
|
+
text = normalize_text(text)
|
|
43
|
+
|
|
42
44
|
# fast track common case, /2 to handle unicode chars
|
|
43
45
|
# than can take more than 1 token per char
|
|
44
46
|
return text if !strict && text.size < max_length / 2
|
|
45
47
|
|
|
46
48
|
# Take tokens up to max_length, decode, then ensure we don't exceed limit
|
|
47
49
|
truncated_tokens = tokenizer.encode(text).ids.take(max_length)
|
|
48
|
-
truncated_text = tokenizer.decode(truncated_tokens)
|
|
50
|
+
truncated_text = normalize_text(tokenizer.decode(truncated_tokens))
|
|
49
51
|
|
|
50
52
|
# If re-encoding exceeds the limit, we need to further truncate
|
|
51
53
|
while tokenizer.encode(truncated_text).ids.length > max_length
|
|
52
54
|
truncated_tokens = truncated_tokens[0...-1]
|
|
53
|
-
truncated_text = tokenizer.decode(truncated_tokens)
|
|
55
|
+
truncated_text = normalize_text(tokenizer.decode(truncated_tokens))
|
|
54
56
|
break if truncated_tokens.empty?
|
|
55
57
|
end
|
|
56
58
|
|
|
57
|
-
truncated_text
|
|
59
|
+
normalize_text(truncated_text)
|
|
58
60
|
end
|
|
59
61
|
|
|
60
62
|
def below_limit?(text, limit, strict: false)
|
|
63
|
+
text = normalize_text(text)
|
|
64
|
+
|
|
61
65
|
# fast track common case, /2 to handle unicode chars
|
|
62
66
|
# than can take more than 1 token per char
|
|
63
67
|
return true if !strict && text.size < limit / 2
|
|
64
68
|
|
|
65
69
|
tokenizer.encode(text).ids.length < limit
|
|
66
70
|
end
|
|
71
|
+
|
|
72
|
+
private
|
|
73
|
+
|
|
74
|
+
def normalize_text(text)
|
|
75
|
+
return text unless text.is_a?(String)
|
|
76
|
+
|
|
77
|
+
# Fast path: avoid allocations for the common valid UTF-8 case.
|
|
78
|
+
if text.encoding == Encoding::UTF_8 && text.valid_encoding?
|
|
79
|
+
return text
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
if text.encoding == Encoding::ASCII_8BIT
|
|
83
|
+
normalized = text.dup
|
|
84
|
+
normalized.force_encoding(Encoding::UTF_8)
|
|
85
|
+
elsif text.encoding != Encoding::UTF_8
|
|
86
|
+
normalized = text.encode(Encoding::UTF_8)
|
|
87
|
+
else
|
|
88
|
+
normalized = text
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
normalized.valid_encoding? ? normalized : normalized.scrub
|
|
92
|
+
rescue Encoding::UndefinedConversionError,
|
|
93
|
+
Encoding::InvalidByteSequenceError
|
|
94
|
+
text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
|
|
95
|
+
end
|
|
67
96
|
end
|
|
68
97
|
end
|
|
69
98
|
end
|
|
@@ -54,25 +54,29 @@ module DiscourseAi
|
|
|
54
54
|
def truncate(text, max_length, strict: false)
|
|
55
55
|
return "" if max_length <= 0
|
|
56
56
|
|
|
57
|
+
text = normalize_text(text)
|
|
58
|
+
|
|
57
59
|
# fast track common case, /2 to handle unicode chars
|
|
58
60
|
# than can take more than 1 token per char
|
|
59
61
|
return text if !strict && text.size < max_length / 2
|
|
60
62
|
|
|
61
63
|
# Take tokens up to max_length, decode, then ensure we don't exceed limit
|
|
62
64
|
truncated_tokens = tokenize(text).take(max_length)
|
|
63
|
-
truncated_text = decode(truncated_tokens)
|
|
65
|
+
truncated_text = normalize_text(decode(truncated_tokens))
|
|
64
66
|
|
|
65
67
|
# If re-encoding exceeds the limit, we need to further truncate
|
|
66
68
|
while tokenize(truncated_text).length > max_length
|
|
67
69
|
truncated_tokens = truncated_tokens[0...-1]
|
|
68
|
-
truncated_text = decode(truncated_tokens)
|
|
70
|
+
truncated_text = normalize_text(decode(truncated_tokens))
|
|
69
71
|
break if truncated_tokens.empty?
|
|
70
72
|
end
|
|
71
73
|
|
|
72
|
-
truncated_text
|
|
74
|
+
normalize_text(truncated_text)
|
|
73
75
|
end
|
|
74
76
|
|
|
75
77
|
def below_limit?(text, limit, strict: false)
|
|
78
|
+
text = normalize_text(text)
|
|
79
|
+
|
|
76
80
|
# fast track common case, /2 to handle unicode chars
|
|
77
81
|
# than can take more than 1 token per char
|
|
78
82
|
return true if !strict && text.size < limit / 2
|
|
@@ -83,6 +87,8 @@ module DiscourseAi
|
|
|
83
87
|
private
|
|
84
88
|
|
|
85
89
|
def safe_encode(text)
|
|
90
|
+
text = normalize_text(text)
|
|
91
|
+
|
|
86
92
|
if !text.is_a?(String) || text.size <= SAFE_CHUNK_SIZE
|
|
87
93
|
return tokenizer.encode(text)
|
|
88
94
|
end
|