@ingglish/normalize 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,258 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+
20
+ // src/index.ts
21
+ var index_exports = {};
22
+ __export(index_exports, {
23
+ WORD_SPLIT_REGEX: () => WORD_SPLIT_REGEX,
24
+ WORD_TEST_REGEX: () => WORD_TEST_REGEX,
25
+ applyCasePattern: () => applyCasePattern,
26
+ detectCasePattern: () => detectCasePattern,
27
+ extractPreservedPatterns: () => extractPreservedPatterns,
28
+ normalizeApostrophes: () => normalizeApostrophes,
29
+ splitCamelCase: () => splitCamelCase,
30
+ stripDiacritics: () => stripDiacritics,
31
+ tokenizeIPA: () => tokenizeIPA,
32
+ tokenizeText: () => tokenizeText
33
+ });
34
+ module.exports = __toCommonJS(index_exports);
35
+
36
+ // src/case.ts
37
/**
 * Re-cases `word` according to a previously detected case pattern.
 *
 * @param {string} word - Word to re-case.
 * @param {string} pattern - One of "lower", "capitalized", "mixed" or "upper".
 *   Any other value is handled like "lower".
 * @param {string} [original] - Source word whose per-position casing is copied
 *   when `pattern` is "mixed". Without it (or when it is empty), "mixed" falls
 *   back to lowercase.
 * @returns {string} The re-cased word.
 */
function applyCasePattern(word, pattern, original) {
  if (pattern === "upper") {
    return word.toUpperCase();
  }
  if (pattern === "capitalized") {
    const head = word.charAt(0);
    const tail = word.slice(1);
    return head.toUpperCase() + tail.toLowerCase();
  }
  if (pattern === "mixed" && original !== undefined && original.length > 0) {
    return applyMixedCase(word, original);
  }
  // "lower", "mixed" without a usable original, and unknown patterns all
  // resolve to plain lowercase.
  return word.toLowerCase();
}
63
/**
 * Classifies the casing of `word`.
 *
 * Rules, in order:
 * - empty or entirely lowercase input is "lower";
 * - a single character is "capitalized" when uppercase, except the pronoun
 *   "I", which is reported as "lower";
 * - an all-uppercase word is "upper", unless it has at most two characters,
 *   in which case it is "capitalized";
 * - an uppercase first character followed by no further uppercase characters
 *   is "capitalized";
 * - everything else is "mixed".
 *
 * @param {string} word - Word to inspect.
 * @returns {"capitalized" | "lower" | "mixed" | "upper"} Detected pattern.
 */
function detectCasePattern(word) {
  // ASCII A-Z is checked by code; anything above ASCII falls back to comparing
  // the character with its lowercase form.
  const isUpperAt = (index) => {
    const code = word.codePointAt(index);
    if (code >= 65 && code <= 90) {
      return true;
    }
    if (code > 127) {
      const unit = word[index];
      return unit !== unit.toLowerCase();
    }
    return false;
  };

  if (word.length === 0) {
    return "lower";
  }

  const startsUpper = isUpperAt(0);
  if (!startsUpper && word === word.toLowerCase()) {
    return "lower";
  }

  if (word.length === 1) {
    return startsUpper && word !== "I" ? "capitalized" : "lower";
  }

  if (word === word.toUpperCase()) {
    return word.length > 2 ? "upper" : "capitalized";
  }

  if (!startsUpper) {
    return "mixed";
  }
  for (let pos = 1; pos < word.length; pos += 1) {
    if (isUpperAt(pos)) {
      return "mixed";
    }
  }
  return "capitalized";
}
102
/**
 * Splits a word wherever an ASCII lowercase letter is immediately followed by
 * an ASCII uppercase letter, e.g. "iCloud" -> ["i", "Cloud"].
 *
 * @param {string} word - Word to split.
 * @returns {string[] | null} The segments, or null when the word contains no
 *   lowercase-to-uppercase boundary.
 */
function splitCamelCase(word) {
  const isAsciiLower = (code) => code >= 97 && code <= 122;
  const isAsciiUpper = (code) => code >= 65 && code <= 90;

  // Collect every index at which a new segment begins.
  const boundaries = [];
  for (let idx = 1; idx < word.length; idx += 1) {
    if (isAsciiLower(word.codePointAt(idx - 1)) && isAsciiUpper(word.codePointAt(idx))) {
      boundaries.push(idx);
    }
  }
  if (boundaries.length === 0) {
    return null;
  }

  const segments = [];
  let from = 0;
  for (const cut of boundaries) {
    segments.push(word.slice(from, cut));
    from = cut;
  }
  segments.push(word.slice(from));
  return segments;
}
133
/**
 * Copies the per-character casing of `original` onto `translated`.
 *
 * `translated` is lowercased first; then each character whose counterpart at
 * the same position in `original` is an uppercase letter is upper-cased.
 * Characters beyond the end of `original` stay lowercase.
 *
 * @param {string} translated - Word that receives the casing.
 * @param {string} original - Word that supplies the casing.
 * @returns {string} `translated` re-cased position by position.
 */
function applyMixedCase(translated, original) {
  // Split by code point so positions line up with the code-point iteration of
  // `translated` below, even when astral characters are present.
  const sourceChars = [...original];
  let result = "";
  let position = 0;
  for (const char of translated.toLowerCase()) {
    const sourceChar = sourceChars[position];
    // A character counts as uppercase only if lowercasing changes it. Uncased
    // characters (apostrophes, digits, hyphens) equal their own toUpperCase(),
    // so comparing against toUpperCase() would wrongly treat them as capitals.
    const isUpper = sourceChar !== undefined && sourceChar !== sourceChar.toLowerCase();
    result += isUpper ? char.toUpperCase() : char;
    position += 1;
  }
  return result;
}
148
+
149
+ // src/text.ts
150
// Absolute URLs with an explicit http(s), ftp or file scheme.
const URL_REGEX = /(?:https?|ftp|file):\/\/[^\s<>"')\]]+/gi;
// Email addresses: bounded local part, one to ten domain labels, alphabetic TLD.
const EMAIL_REGEX = /[\w.%+-]{1,64}@(?:[a-z0-9-]{1,63}\.){1,10}[a-z]{2,63}/gi;
// TLDs accepted when recognising scheme-less domains such as "example.com/path".
const COMMON_TLDS = "com|org|net|edu|gov|io|co|uk|de|fr|jp|au|ca|ru|ch|it|nl|se|no|es|mil|info|biz|tv|me|app|dev|ai|xyz|rs|site|tech|blog|news|club|lol|new|world|online|space|fun|live|shop|store|gg|fm|im|is|to|cc|ws|ly|gl|be|us|in|eu|asia|pro|cz|pl|fi|dk|pt|ie|nz|za|br|mx|ar|cl|kr|tw|hk|sg|id|th|vn|ph";
const BARE_DOMAIN_REGEX = new RegExp(
  String.raw`\b(?:[a-z0-9][-a-z0-9]*\.)+(?:${COMMON_TLDS})\b(?:\/[^\s<>"')\]]*)?`,
  "gi"
);

/**
 * Replaces URLs, email addresses and bare domains in `text` with placeholders
 * so later processing leaves them untouched.
 *
 * Each placeholder is a running counter wrapped in NUL characters. Patterns
 * are extracted in the order URLs, emails, bare domains, and the counter is
 * shared across all three passes.
 *
 * @param {string} text - Input text.
 * @returns {{ preserved: Map<string, string>, text: string }} The rewritten
 *   text and a map from each placeholder to the substring it replaced.
 */
function extractPreservedPatterns(text) {
  const preserved = new Map();
  let nextId = 0;

  // Shared replacer: records the match and hands back its placeholder.
  const stash = (match) => {
    const placeholder = `\0${nextId++}\0`;
    preserved.set(placeholder, match);
    return placeholder;
  };

  let remaining = text;
  // Cheap substring probes skip regex passes that cannot match.
  if (text.includes("://")) {
    remaining = remaining.replaceAll(URL_REGEX, stash);
  }
  if (remaining.includes("@")) {
    remaining = remaining.replaceAll(EMAIL_REGEX, stash);
  }
  if (remaining.includes(".")) {
    remaining = remaining.replaceAll(BARE_DOMAIN_REGEX, stash);
  }
  return { preserved, text: remaining };
}
184
// Curly single quotes (U+2018, U+2019) and the modifier letter apostrophe (U+02BC).
const FANCY_APOSTROPHE = /[\u2018\u2019\u02BC]/g;
// The Combining Diacritical Marks block.
const COMBINING_MARKS = /[\u0300-\u036F]/g;

/**
 * Replaces typographic apostrophes with the plain ASCII apostrophe.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with U+2018, U+2019 and U+02BC replaced by "'".
 */
function normalizeApostrophes(text) {
  return text.replaceAll(FANCY_APOSTROPHE, "'");
}

/**
 * Removes accents from letters while keeping the base letters, e.g.
 * "café" -> "cafe".
 *
 * The text is decomposed (NFD) so accents become separate combining marks,
 * the marks in U+0300-U+036F are dropped, and the result is recomposed (NFC).
 * Recomposing matters: without it, every other precomposed character in the
 * input (for example Hangul syllables) would be returned in decomposed form
 * even though no diacritic was stripped from it.
 *
 * @param {string} text - Input text.
 * @returns {string} Text without combining diacritical marks, in NFC form.
 */
function stripDiacritics(text) {
  return text.normalize("NFD").replaceAll(COMBINING_MARKS, "").normalize("NFC");
}
192
+
193
+ // src/tokenize.ts
194
// Word characters: ASCII letters, the apostrophe, and the accented Latin
// letters in U+00C0-U+024F. U+00D7 (multiplication sign) and U+00F7 (division
// sign) sit inside that range but are not letters, so they are excluded.
// The digit lookarounds reject a letter run that touches a digit, which keeps
// the letter in escape-like text such as "\u2014" from being taken as a word.
// The capture group makes String.prototype.split keep the words in its output.
const WORD_SPLIT_REGEX = /((?<!\d)[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F']+(?!\d))/;
// Matches a token made up solely of word characters (same class as above).
const WORD_TEST_REGEX = /^[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F']+$/;
196
// Non-ASCII symbols accepted as part of an IPA word.
const IPA_SYMBOLS_SET = new Set("\u0259\u025D\u025A\u028C\xE6\u0251\u0254\u025B\u026A\u028A\xF0\u03B8\u0283\u0292\u014B\u0279\u0261");

/**
 * Tells whether `char` may appear inside an IPA word token.
 *
 * Accepted: ASCII letters, U+2060 (code 8288), the stress marks U+02C8 and
 * U+02CC (codes 712 and 716), and the members of IPA_SYMBOLS_SET.
 *
 * @param {string} char - A single character.
 * @returns {boolean} True when the character belongs to an IPA word.
 */
function isIPAChar(char) {
  const code = char.codePointAt(0);
  const isAsciiLetter = (code >= 65 && code <= 90) || (code >= 97 && code <= 122);
  return (
    isAsciiLetter ||
    code === 8288 ||
    code === 712 ||
    code === 716 ||
    IPA_SYMBOLS_SET.has(char)
  );
}

/**
 * Splits IPA text into alternating word and non-word tokens.
 *
 * Consecutive characters of the same class (IPA character or not) are grouped
 * into one token, so concatenating the token texts reproduces the input.
 *
 * @param {string} text - IPA text.
 * @returns {{ isWord: boolean, text: string }[]} Tokens in input order.
 */
function tokenizeIPA(text) {
  const tokens = [];
  let runStart = 0;
  while (runStart < text.length) {
    // The first character fixes the class of the run; extend while it holds.
    const isWord = isIPAChar(text[runStart]);
    let runEnd = runStart + 1;
    while (runEnd < text.length && isIPAChar(text[runEnd]) === isWord) {
      runEnd += 1;
    }
    tokens.push({ isWord, text: text.slice(runStart, runEnd) });
    runStart = runEnd;
  }
  return tokens;
}
235
/**
 * Splits text into word and non-word tokens.
 *
 * Typographic apostrophes are normalised first; the text is then split on
 * WORD_SPLIT_REGEX, whose capture group keeps the words in the result. Empty
 * pieces produced by the split are dropped.
 *
 * @param {string} text - Input text.
 * @returns {{ isWord: boolean, text: string }[]} Tokens in input order.
 */
function tokenizeText(text) {
  return normalizeApostrophes(text)
    .split(WORD_SPLIT_REGEX)
    .filter((piece) => piece.length > 0)
    .map((piece) => ({ isWord: WORD_TEST_REGEX.test(piece), text: piece }));
}
246
+ // Annotate the CommonJS export names for ESM import in node:
247
+ 0 && (module.exports = {
248
+ WORD_SPLIT_REGEX,
249
+ WORD_TEST_REGEX,
250
+ applyCasePattern,
251
+ detectCasePattern,
252
+ extractPreservedPatterns,
253
+ normalizeApostrophes,
254
+ splitCamelCase,
255
+ stripDiacritics,
256
+ tokenizeIPA,
257
+ tokenizeText
258
+ });
@@ -0,0 +1,74 @@
1
+ /**
2
+ * Utilities for preserving case patterns during translation.
3
+ */
4
+ /**
5
+ * Case pattern detected in a word.
6
+ * - 'lower': all lowercase (hello)
7
+ * - 'upper': all uppercase (HELLO)
8
+ * - 'capitalized': first letter uppercase (Hello)
9
+ * - 'mixed': mixed case like camelCase (GitHub, iPhone)
10
+ */
11
+ type CasePattern = 'capitalized' | 'lower' | 'mixed' | 'upper';
12
+ /**
13
+ * Applies a case pattern to a word, optionally using original word for mixed case.
14
+ * Optimized: lowercase case returns word directly if already lowercase.
15
+ */
16
+ declare function applyCasePattern(word: string, pattern: CasePattern, original?: string): string;
17
+ /**
18
+ * Detects the case pattern of a word.
19
+ * Optimized for the common case (lowercase) by checking it first.
20
+ */
21
+ declare function detectCasePattern(word: string): CasePattern;
22
+ /**
23
+ * Splits a word at camelCase boundaries.
24
+ * e.g., "iCloud" -> ["i", "Cloud"], "MacBook" -> ["Mac", "Book"]
25
+ */
26
+ declare function splitCamelCase(word: string): null | string[];
27
+
28
+ /**
29
+ * Extracts URLs and emails from text, replacing them with placeholders.
30
+ * Returns the modified text and a map to restore originals.
31
+ * Placeholders are a numeric counter wrapped in NUL characters; they contain no letters, so the word regex never matches or splits them.
32
+ */
33
+ declare function extractPreservedPatterns(text: string): {
34
+ preserved: Map<string, string>;
35
+ text: string;
36
+ };
37
+ /**
38
+ * Normalizes various apostrophe characters to the standard straight apostrophe.
39
+ * Handles: ’ (U+2019 right single quotation mark), ‘ (U+2018 left single quotation mark), ʼ (U+02BC modifier letter apostrophe)
40
+ */
41
+ declare function normalizeApostrophes(text: string): string;
42
+ /**
43
+ * Strips diacritics/accents from text, preserving base letters.
44
+ * Converts résumé→resume, naïve→naive, cliché→cliche, café→cafe.
45
+ * Uses Unicode NFD decomposition to separate base letters from combining marks.
46
+ */
47
+ declare function stripDiacritics(text: string): string;
48
+
49
+ /**
50
+ * Tokenization utilities for splitting text into word and non-word tokens.
51
+ */
52
+ /** Regex to split text into word and non-word tokens (includes accented Latin chars).
53
+ * Digit lookaround prevents matching letters in escape sequences like \u2014. */
54
+ declare const WORD_SPLIT_REGEX: RegExp;
55
+ /** Regex to test if a token is a word (includes accented Latin chars) */
56
+ declare const WORD_TEST_REGEX: RegExp;
57
+ /**
58
+ * Token with text content and word/non-word classification.
59
+ */
60
+ interface TextToken {
61
+ isWord: boolean;
62
+ text: string;
63
+ }
64
+ /**
65
+ * Tokenizes IPA text into words and non-words (punctuation/whitespace).
66
+ */
67
+ declare function tokenizeIPA(text: string): TextToken[];
68
+ /**
69
+ * Tokenizes Ingglish/English text into words and non-words.
70
+ * Words are sequences of letters and apostrophes.
71
+ */
72
+ declare function tokenizeText(text: string): TextToken[];
73
+
74
+ export { type CasePattern, WORD_SPLIT_REGEX, WORD_TEST_REGEX, applyCasePattern, detectCasePattern, extractPreservedPatterns, normalizeApostrophes, splitCamelCase, stripDiacritics, tokenizeIPA, tokenizeText };
@@ -0,0 +1,74 @@
1
+ /**
2
+ * Utilities for preserving case patterns during translation.
3
+ */
4
+ /**
5
+ * Case pattern detected in a word.
6
+ * - 'lower': all lowercase (hello)
7
+ * - 'upper': all uppercase (HELLO)
8
+ * - 'capitalized': first letter uppercase (Hello)
9
+ * - 'mixed': mixed case like camelCase (GitHub, iPhone)
10
+ */
11
+ type CasePattern = 'capitalized' | 'lower' | 'mixed' | 'upper';
12
+ /**
13
+ * Applies a case pattern to a word, optionally using original word for mixed case.
14
+ * Optimized: lowercase case returns word directly if already lowercase.
15
+ */
16
+ declare function applyCasePattern(word: string, pattern: CasePattern, original?: string): string;
17
+ /**
18
+ * Detects the case pattern of a word.
19
+ * Optimized for the common case (lowercase) by checking it first.
20
+ */
21
+ declare function detectCasePattern(word: string): CasePattern;
22
+ /**
23
+ * Splits a word at camelCase boundaries.
24
+ * e.g., "iCloud" -> ["i", "Cloud"], "MacBook" -> ["Mac", "Book"]
25
+ */
26
+ declare function splitCamelCase(word: string): null | string[];
27
+
28
+ /**
29
+ * Extracts URLs and emails from text, replacing them with placeholders.
30
+ * Returns the modified text and a map to restore originals.
31
+ * Placeholders are a numeric counter wrapped in NUL characters; they contain no letters, so the word regex never matches or splits them.
32
+ */
33
+ declare function extractPreservedPatterns(text: string): {
34
+ preserved: Map<string, string>;
35
+ text: string;
36
+ };
37
+ /**
38
+ * Normalizes various apostrophe characters to the standard straight apostrophe.
39
+ * Handles: ’ (U+2019 right single quotation mark), ‘ (U+2018 left single quotation mark), ʼ (U+02BC modifier letter apostrophe)
40
+ */
41
+ declare function normalizeApostrophes(text: string): string;
42
+ /**
43
+ * Strips diacritics/accents from text, preserving base letters.
44
+ * Converts résumé→resume, naïve→naive, cliché→cliche, café→cafe.
45
+ * Uses Unicode NFD decomposition to separate base letters from combining marks.
46
+ */
47
+ declare function stripDiacritics(text: string): string;
48
+
49
+ /**
50
+ * Tokenization utilities for splitting text into word and non-word tokens.
51
+ */
52
+ /** Regex to split text into word and non-word tokens (includes accented Latin chars).
53
+ * Digit lookaround prevents matching letters in escape sequences like \u2014. */
54
+ declare const WORD_SPLIT_REGEX: RegExp;
55
+ /** Regex to test if a token is a word (includes accented Latin chars) */
56
+ declare const WORD_TEST_REGEX: RegExp;
57
+ /**
58
+ * Token with text content and word/non-word classification.
59
+ */
60
+ interface TextToken {
61
+ isWord: boolean;
62
+ text: string;
63
+ }
64
+ /**
65
+ * Tokenizes IPA text into words and non-words (punctuation/whitespace).
66
+ */
67
+ declare function tokenizeIPA(text: string): TextToken[];
68
+ /**
69
+ * Tokenizes Ingglish/English text into words and non-words.
70
+ * Words are sequences of letters and apostrophes.
71
+ */
72
+ declare function tokenizeText(text: string): TextToken[];
73
+
74
+ export { type CasePattern, WORD_SPLIT_REGEX, WORD_TEST_REGEX, applyCasePattern, detectCasePattern, extractPreservedPatterns, normalizeApostrophes, splitCamelCase, stripDiacritics, tokenizeIPA, tokenizeText };
package/dist/index.js ADDED
@@ -0,0 +1,222 @@
1
+ // src/case.ts
2
/**
 * Re-cases `word` according to a previously detected case pattern.
 *
 * @param {string} word - Word to re-case.
 * @param {string} pattern - One of "lower", "capitalized", "mixed" or "upper".
 *   Any other value is handled like "lower".
 * @param {string} [original] - Source word whose per-position casing is copied
 *   when `pattern` is "mixed". Without it (or when it is empty), "mixed" falls
 *   back to lowercase.
 * @returns {string} The re-cased word.
 */
function applyCasePattern(word, pattern, original) {
  if (pattern === "upper") {
    return word.toUpperCase();
  }
  if (pattern === "capitalized") {
    const head = word.charAt(0);
    const tail = word.slice(1);
    return head.toUpperCase() + tail.toLowerCase();
  }
  if (pattern === "mixed" && original !== undefined && original.length > 0) {
    return applyMixedCase(word, original);
  }
  // "lower", "mixed" without a usable original, and unknown patterns all
  // resolve to plain lowercase.
  return word.toLowerCase();
}
28
/**
 * Classifies the casing of `word`.
 *
 * Rules, in order:
 * - empty or entirely lowercase input is "lower";
 * - a single character is "capitalized" when uppercase, except the pronoun
 *   "I", which is reported as "lower";
 * - an all-uppercase word is "upper", unless it has at most two characters,
 *   in which case it is "capitalized";
 * - an uppercase first character followed by no further uppercase characters
 *   is "capitalized";
 * - everything else is "mixed".
 *
 * @param {string} word - Word to inspect.
 * @returns {"capitalized" | "lower" | "mixed" | "upper"} Detected pattern.
 */
function detectCasePattern(word) {
  // ASCII A-Z is checked by code; anything above ASCII falls back to comparing
  // the character with its lowercase form.
  const isUpperAt = (index) => {
    const code = word.codePointAt(index);
    if (code >= 65 && code <= 90) {
      return true;
    }
    if (code > 127) {
      const unit = word[index];
      return unit !== unit.toLowerCase();
    }
    return false;
  };

  if (word.length === 0) {
    return "lower";
  }

  const startsUpper = isUpperAt(0);
  if (!startsUpper && word === word.toLowerCase()) {
    return "lower";
  }

  if (word.length === 1) {
    return startsUpper && word !== "I" ? "capitalized" : "lower";
  }

  if (word === word.toUpperCase()) {
    return word.length > 2 ? "upper" : "capitalized";
  }

  if (!startsUpper) {
    return "mixed";
  }
  for (let pos = 1; pos < word.length; pos += 1) {
    if (isUpperAt(pos)) {
      return "mixed";
    }
  }
  return "capitalized";
}
67
/**
 * Splits a word wherever an ASCII lowercase letter is immediately followed by
 * an ASCII uppercase letter, e.g. "iCloud" -> ["i", "Cloud"].
 *
 * @param {string} word - Word to split.
 * @returns {string[] | null} The segments, or null when the word contains no
 *   lowercase-to-uppercase boundary.
 */
function splitCamelCase(word) {
  const isAsciiLower = (code) => code >= 97 && code <= 122;
  const isAsciiUpper = (code) => code >= 65 && code <= 90;

  // Collect every index at which a new segment begins.
  const boundaries = [];
  for (let idx = 1; idx < word.length; idx += 1) {
    if (isAsciiLower(word.codePointAt(idx - 1)) && isAsciiUpper(word.codePointAt(idx))) {
      boundaries.push(idx);
    }
  }
  if (boundaries.length === 0) {
    return null;
  }

  const segments = [];
  let from = 0;
  for (const cut of boundaries) {
    segments.push(word.slice(from, cut));
    from = cut;
  }
  segments.push(word.slice(from));
  return segments;
}
98
/**
 * Copies the per-character casing of `original` onto `translated`.
 *
 * `translated` is lowercased first; then each character whose counterpart at
 * the same position in `original` is an uppercase letter is upper-cased.
 * Characters beyond the end of `original` stay lowercase.
 *
 * @param {string} translated - Word that receives the casing.
 * @param {string} original - Word that supplies the casing.
 * @returns {string} `translated` re-cased position by position.
 */
function applyMixedCase(translated, original) {
  // Split by code point so positions line up with the code-point iteration of
  // `translated` below, even when astral characters are present.
  const sourceChars = [...original];
  let result = "";
  let position = 0;
  for (const char of translated.toLowerCase()) {
    const sourceChar = sourceChars[position];
    // A character counts as uppercase only if lowercasing changes it. Uncased
    // characters (apostrophes, digits, hyphens) equal their own toUpperCase(),
    // so comparing against toUpperCase() would wrongly treat them as capitals.
    const isUpper = sourceChar !== undefined && sourceChar !== sourceChar.toLowerCase();
    result += isUpper ? char.toUpperCase() : char;
    position += 1;
  }
  return result;
}
113
+
114
+ // src/text.ts
115
// Absolute URLs with an explicit http(s), ftp or file scheme.
const URL_REGEX = /(?:https?|ftp|file):\/\/[^\s<>"')\]]+/gi;
// Email addresses: bounded local part, one to ten domain labels, alphabetic TLD.
const EMAIL_REGEX = /[\w.%+-]{1,64}@(?:[a-z0-9-]{1,63}\.){1,10}[a-z]{2,63}/gi;
// TLDs accepted when recognising scheme-less domains such as "example.com/path".
const COMMON_TLDS = "com|org|net|edu|gov|io|co|uk|de|fr|jp|au|ca|ru|ch|it|nl|se|no|es|mil|info|biz|tv|me|app|dev|ai|xyz|rs|site|tech|blog|news|club|lol|new|world|online|space|fun|live|shop|store|gg|fm|im|is|to|cc|ws|ly|gl|be|us|in|eu|asia|pro|cz|pl|fi|dk|pt|ie|nz|za|br|mx|ar|cl|kr|tw|hk|sg|id|th|vn|ph";
const BARE_DOMAIN_REGEX = new RegExp(
  String.raw`\b(?:[a-z0-9][-a-z0-9]*\.)+(?:${COMMON_TLDS})\b(?:\/[^\s<>"')\]]*)?`,
  "gi"
);

/**
 * Replaces URLs, email addresses and bare domains in `text` with placeholders
 * so later processing leaves them untouched.
 *
 * Each placeholder is a running counter wrapped in NUL characters. Patterns
 * are extracted in the order URLs, emails, bare domains, and the counter is
 * shared across all three passes.
 *
 * @param {string} text - Input text.
 * @returns {{ preserved: Map<string, string>, text: string }} The rewritten
 *   text and a map from each placeholder to the substring it replaced.
 */
function extractPreservedPatterns(text) {
  const preserved = new Map();
  let nextId = 0;

  // Shared replacer: records the match and hands back its placeholder.
  const stash = (match) => {
    const placeholder = `\0${nextId++}\0`;
    preserved.set(placeholder, match);
    return placeholder;
  };

  let remaining = text;
  // Cheap substring probes skip regex passes that cannot match.
  if (text.includes("://")) {
    remaining = remaining.replaceAll(URL_REGEX, stash);
  }
  if (remaining.includes("@")) {
    remaining = remaining.replaceAll(EMAIL_REGEX, stash);
  }
  if (remaining.includes(".")) {
    remaining = remaining.replaceAll(BARE_DOMAIN_REGEX, stash);
  }
  return { preserved, text: remaining };
}
149
// Curly single quotes (U+2018, U+2019) and the modifier letter apostrophe (U+02BC).
const FANCY_APOSTROPHE = /[\u2018\u2019\u02BC]/g;
// The Combining Diacritical Marks block.
const COMBINING_MARKS = /[\u0300-\u036F]/g;

/**
 * Replaces typographic apostrophes with the plain ASCII apostrophe.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with U+2018, U+2019 and U+02BC replaced by "'".
 */
function normalizeApostrophes(text) {
  return text.replaceAll(FANCY_APOSTROPHE, "'");
}

/**
 * Removes accents from letters while keeping the base letters, e.g.
 * "café" -> "cafe".
 *
 * The text is decomposed (NFD) so accents become separate combining marks,
 * the marks in U+0300-U+036F are dropped, and the result is recomposed (NFC).
 * Recomposing matters: without it, every other precomposed character in the
 * input (for example Hangul syllables) would be returned in decomposed form
 * even though no diacritic was stripped from it.
 *
 * @param {string} text - Input text.
 * @returns {string} Text without combining diacritical marks, in NFC form.
 */
function stripDiacritics(text) {
  return text.normalize("NFD").replaceAll(COMBINING_MARKS, "").normalize("NFC");
}
157
+
158
+ // src/tokenize.ts
159
// Word characters: ASCII letters, the apostrophe, and the accented Latin
// letters in U+00C0-U+024F. U+00D7 (multiplication sign) and U+00F7 (division
// sign) sit inside that range but are not letters, so they are excluded.
// The digit lookarounds reject a letter run that touches a digit, which keeps
// the letter in escape-like text such as "\u2014" from being taken as a word.
// The capture group makes String.prototype.split keep the words in its output.
const WORD_SPLIT_REGEX = /((?<!\d)[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F']+(?!\d))/;
// Matches a token made up solely of word characters (same class as above).
const WORD_TEST_REGEX = /^[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F']+$/;
161
// Non-ASCII symbols accepted as part of an IPA word.
const IPA_SYMBOLS_SET = new Set("\u0259\u025D\u025A\u028C\xE6\u0251\u0254\u025B\u026A\u028A\xF0\u03B8\u0283\u0292\u014B\u0279\u0261");

/**
 * Tells whether `char` may appear inside an IPA word token.
 *
 * Accepted: ASCII letters, U+2060 (code 8288), the stress marks U+02C8 and
 * U+02CC (codes 712 and 716), and the members of IPA_SYMBOLS_SET.
 *
 * @param {string} char - A single character.
 * @returns {boolean} True when the character belongs to an IPA word.
 */
function isIPAChar(char) {
  const code = char.codePointAt(0);
  const isAsciiLetter = (code >= 65 && code <= 90) || (code >= 97 && code <= 122);
  return (
    isAsciiLetter ||
    code === 8288 ||
    code === 712 ||
    code === 716 ||
    IPA_SYMBOLS_SET.has(char)
  );
}

/**
 * Splits IPA text into alternating word and non-word tokens.
 *
 * Consecutive characters of the same class (IPA character or not) are grouped
 * into one token, so concatenating the token texts reproduces the input.
 *
 * @param {string} text - IPA text.
 * @returns {{ isWord: boolean, text: string }[]} Tokens in input order.
 */
function tokenizeIPA(text) {
  const tokens = [];
  let runStart = 0;
  while (runStart < text.length) {
    // The first character fixes the class of the run; extend while it holds.
    const isWord = isIPAChar(text[runStart]);
    let runEnd = runStart + 1;
    while (runEnd < text.length && isIPAChar(text[runEnd]) === isWord) {
      runEnd += 1;
    }
    tokens.push({ isWord, text: text.slice(runStart, runEnd) });
    runStart = runEnd;
  }
  return tokens;
}
200
/**
 * Splits text into word and non-word tokens.
 *
 * Typographic apostrophes are normalised first; the text is then split on
 * WORD_SPLIT_REGEX, whose capture group keeps the words in the result. Empty
 * pieces produced by the split are dropped.
 *
 * @param {string} text - Input text.
 * @returns {{ isWord: boolean, text: string }[]} Tokens in input order.
 */
function tokenizeText(text) {
  return normalizeApostrophes(text)
    .split(WORD_SPLIT_REGEX)
    .filter((piece) => piece.length > 0)
    .map((piece) => ({ isWord: WORD_TEST_REGEX.test(piece), text: piece }));
}
211
+ export {
212
+ WORD_SPLIT_REGEX,
213
+ WORD_TEST_REGEX,
214
+ applyCasePattern,
215
+ detectCasePattern,
216
+ extractPreservedPatterns,
217
+ normalizeApostrophes,
218
+ splitCamelCase,
219
+ stripDiacritics,
220
+ tokenizeIPA,
221
+ tokenizeText
222
+ };
package/package.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "name": "@ingglish/normalize",
3
+ "version": "0.1.0",
4
+ "description": "Text normalization, case handling, and URL/email preservation for Ingglish",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "source": "./src/index.ts",
12
+ "import": {
13
+ "types": "./dist/index.d.ts",
14
+ "default": "./dist/index.js"
15
+ },
16
+ "require": {
17
+ "types": "./dist/index.d.cts",
18
+ "default": "./dist/index.cjs"
19
+ }
20
+ }
21
+ },
22
+ "files": [
23
+ "dist"
24
+ ],
25
+ "sideEffects": false,
26
+ "engines": {
27
+ "node": ">=16"
28
+ },
29
+ "scripts": {
30
+ "build": "tsup",
31
+ "build:fast": "tsup src/index.ts --format esm",
32
+ "lint": "eslint --cache src",
33
+ "test": "vitest run --no-color",
34
+ "prepublishOnly": "npm run build"
35
+ },
36
+ "author": "Paul Tarjan",
37
+ "license": "MIT",
38
+ "repository": {
39
+ "type": "git",
40
+ "url": "git+https://github.com/ptarjan/ingglish.git",
41
+ "directory": "packages/normalize"
42
+ },
43
+ "homepage": "https://github.com/ptarjan/ingglish#readme",
44
+ "bugs": {
45
+ "url": "https://github.com/ptarjan/ingglish/issues"
46
+ }
47
+ }