npm - @zzzen/pyright-internal - Versions diffs - 1.2.0-dev.20260422 → 1.2.0-dev.20260426 - Mend

@zzzen/pyright-internal 1.2.0-dev.20260422 → 1.2.0-dev.20260426

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/dist/analyzer/sourceFile.js +53 -23
package/dist/analyzer/sourceFile.js.map +1 -1
package/dist/parser/characterStream.js +26 -2
package/dist/parser/characterStream.js.map +1 -1
package/dist/parser/parser.d.ts +2 -0
package/dist/parser/parser.js +11 -7
package/dist/parser/parser.js.map +1 -1
package/dist/parser/tokenizer.d.ts +6 -2
package/dist/parser/tokenizer.js +602 -206
package/dist/parser/tokenizer.js.map +1 -1
package/dist/parser/tokenizerTypes.js +115 -39
package/dist/parser/tokenizerTypes.js.map +1 -1
package/dist/tests/benchmarks/parserBenchmark.test.d.ts +1 -0
package/dist/tests/benchmarks/parserBenchmark.test.js +220 -0
package/dist/tests/benchmarks/parserBenchmark.test.js.map +1 -0
package/dist/tests/benchmarks/tokenizerBenchmark.test.d.ts +1 -0
package/dist/tests/benchmarks/tokenizerBenchmark.test.js +236 -0
package/dist/tests/benchmarks/tokenizerBenchmark.test.js.map +1 -0
package/dist/tests/tokenizer.test.js +17 -2
package/dist/tests/tokenizer.test.js.map +1 -1
package/package.json +5 -4

package/dist/parser/tokenizer.js CHANGED Viewed

@@ -60,6 +60,53 @@ const _keywords = new Map([
     ['True', 33 /* KeywordType.True */],
 ]);
 const _softKeywords = new Set(['match', 'case', 'type']);
+// Fast-reject table: keywords are 2–9 chars long and only start with these
+// character codes. A 128-entry boolean table indexed by charCodeAt(0) rejects
+// most identifiers without touching the _keywords Map.
+const _keywordFirstCharTable = (() => {
+    const table = new Array(128).fill(false);
+    for (const kw of _keywords.keys()) {
+        const code = kw.charCodeAt(0);
+        if (code < 128) {
+            table[code] = true;
+        }
+    }
+    return table;
+})();
+const _keywordMinLen = 2;
+const _keywordMaxLen = 9; // __debug__
+// For keyword-like identifiers, compare directly against the source text slice
+// to avoid creating temporary substring objects on the keyword path.
+const _keywordEntriesByFirstChar = (() => {
+    const entriesByFirstChar = new Array(128);
+    for (const [text, type] of _keywords.entries()) {
+        const firstCharCode = text.charCodeAt(0);
+        if (firstCharCode < 128) {
+            const entries = entriesByFirstChar[firstCharCode] ?? (entriesByFirstChar[firstCharCode] = []);
+            entries.push({ text, type });
+        }
+    }
+    return entriesByFirstChar;
+})();
+function getKeywordTypeFromTextSlice(text, start, length) {
+    if (length < _keywordMinLen || length > _keywordMaxLen) {
+        return undefined;
+    }
+    const firstCharCode = text.charCodeAt(start);
+    if (firstCharCode >= 128 || !_keywordFirstCharTable[firstCharCode]) {
+        return undefined;
+    }
+    const candidates = _keywordEntriesByFirstChar[firstCharCode];
+    if (!candidates) {
+        return undefined;
+    }
+    for (const candidate of candidates) {
+        if (candidate.text.length === length && text.startsWith(candidate.text, start)) {
+            return candidate.type;
+        }
+    }
+    return undefined;
+}
 const _operatorInfo = {
     [0 /* OperatorType.Add */]: 1 /* OperatorFlags.Unary */ | 2 /* OperatorFlags.Binary */,
     [1 /* OperatorType.AddEqual */]: 4 /* OperatorFlags.Assignment */,
@@ -104,16 +151,331 @@ const _operatorInfo = {
     [41 /* OperatorType.In */]: 2 /* OperatorFlags.Binary */,
     [42 /* OperatorType.NotIn */]: 2 /* OperatorFlags.Binary */,
 };
+const _unsetSingleCharOperatorType = -1;
+const _singleCharOperatorTypeTable = (() => {
+    const table = new Int16Array(128);
+    table.fill(_unsetSingleCharOperatorType);
+    table[61 /* Char.Equal */] = 2 /* OperatorType.Assign */;
+    table[43 /* Char.Plus */] = 0 /* OperatorType.Add */;
+    table[45 /* Char.Hyphen */] = 33 /* OperatorType.Subtract */;
+    table[42 /* Char.Asterisk */] = 26 /* OperatorType.Multiply */;
+    table[47 /* Char.Slash */] = 10 /* OperatorType.Divide */;
+    table[38 /* Char.Ampersand */] = 3 /* OperatorType.BitwiseAnd */;
+    table[124 /* Char.Bar */] = 6 /* OperatorType.BitwiseOr */;
+    table[94 /* Char.Caret */] = 8 /* OperatorType.BitwiseXor */;
+    table[37 /* Char.Percent */] = 24 /* OperatorType.Mod */;
+    table[126 /* Char.Tilde */] = 5 /* OperatorType.BitwiseInvert */;
+    table[64 /* Char.At */] = 22 /* OperatorType.MatrixMultiply */;
+    table[60 /* Char.Less */] = 20 /* OperatorType.LessThan */;
+    table[62 /* Char.Greater */] = 15 /* OperatorType.GreaterThan */;
+    return table;
+})();
+const _singleCharEqualOperatorTypeTable = (() => {
+    const table = new Int16Array(128);
+    table.fill(_unsetSingleCharOperatorType);
+    table[43 /* Char.Plus */] = 1 /* OperatorType.AddEqual */;
+    table[45 /* Char.Hyphen */] = 34 /* OperatorType.SubtractEqual */;
+    table[42 /* Char.Asterisk */] = 27 /* OperatorType.MultiplyEqual */;
+    table[47 /* Char.Slash */] = 11 /* OperatorType.DivideEqual */;
+    table[38 /* Char.Ampersand */] = 4 /* OperatorType.BitwiseAndEqual */;
+    table[124 /* Char.Bar */] = 7 /* OperatorType.BitwiseOrEqual */;
+    table[94 /* Char.Caret */] = 9 /* OperatorType.BitwiseXorEqual */;
+    table[37 /* Char.Percent */] = 25 /* OperatorType.ModEqual */;
+    table[64 /* Char.At */] = 23 /* OperatorType.MatrixMultiplyEqual */;
+    return table;
+})();
+function getTwoCharKey(char1, char2) {
+    return (char1 << 8) | char2;
+}
+// Two-char operator/token tables: use Map instead of Int16Array(65536).
+// With only 5+1 entries, a Map uses ~200 bytes vs 256KB for two Int16Arrays.
+const _twoCharOperatorTypeMap = new Map([
+    [getTwoCharKey(61 /* Char.Equal */, 61 /* Char.Equal */), 12 /* OperatorType.Equals */],
+    [getTwoCharKey(33 /* Char.ExclamationMark */, 61 /* Char.Equal */), 28 /* OperatorType.NotEquals */],
+    [getTwoCharKey(60 /* Char.Less */, 61 /* Char.Equal */), 21 /* OperatorType.LessThanOrEqual */],
+    [getTwoCharKey(62 /* Char.Greater */, 61 /* Char.Equal */), 16 /* OperatorType.GreaterThanOrEqual */],
+    [getTwoCharKey(60 /* Char.Less */, 62 /* Char.Greater */), 19 /* OperatorType.LessOrGreaterThan */],
+]);
+const _twoCharSpecialTokenTypeMap = new Map([
+    [getTwoCharKey(45 /* Char.Hyphen */, 62 /* Char.Greater */), 21 /* TokenType.Arrow */],
+]);
+const _repeatedCharOperatorTypeTable = (() => {
+    const table = new Int16Array(128);
+    table.fill(_unsetSingleCharOperatorType);
+    table[42 /* Char.Asterisk */] = 29 /* OperatorType.Power */;
+    table[47 /* Char.Slash */] = 13 /* OperatorType.FloorDivide */;
+    table[60 /* Char.Less */] = 17 /* OperatorType.LeftShift */;
+    table[62 /* Char.Greater */] = 31 /* OperatorType.RightShift */;
+    return table;
+})();
+const _repeatedCharEqualOperatorTypeTable = (() => {
+    const table = new Int16Array(128);
+    table.fill(_unsetSingleCharOperatorType);
+    table[42 /* Char.Asterisk */] = 30 /* OperatorType.PowerEqual */;
+    table[47 /* Char.Slash */] = 14 /* OperatorType.FloorDivideEqual */;
+    table[60 /* Char.Less */] = 18 /* OperatorType.LeftShiftEqual */;
+    table[62 /* Char.Greater */] = 32 /* OperatorType.RightShiftEqual */;
+    return table;
+})();
 const _byteOrderMarker = 0xfeff;
 const defaultTabSize = 8;
-const magicsRegEx = /\\\s*$/;
-// The character class for type: ignore rule codes includes ':' so that
-// tool-namespaced codes such as "ty:unresolved-reference" are accepted.
-// pyright: ignore uses the original class since tool-namespaced codes
-// are not expected there.
-const typeIgnoreCommentRegEx = /((^|#)\s*)type:\s*ignore(\s*\[([\s\w:,-]*)\]|\s|$)/;
-const pyrightIgnoreCommentRegEx = /((^|#)\s*)pyright:\s*ignore(\s*\[([\s\w-,]*)\]|\s|$)/;
-const underscoreRegEx = /_/g;
+// Fast-reject table: only these ASCII chars can begin a string literal
+// (quote chars or valid string prefix chars f/r/b/u/t and their uppercase).
+// Checking this table first avoids calling _getStringPrefixLength() for the
+// vast majority of tokens (identifiers, numbers, operators, etc.).
+const _canStartString = (() => {
+    const table = new Array(128).fill(false);
+    table[39 /* Char.SingleQuote */] = true;
+    table[34 /* Char.DoubleQuote */] = true;
+    for (const ch of [102 /* Char.f */, 70 /* Char.F */, 114 /* Char.r */, 82 /* Char.R */, 98 /* Char.b */, 66 /* Char.B */, 117 /* Char.u */, 85 /* Char.U */, 116 /* Char.t */, 84 /* Char.T */]) {
+        table[ch] = true;
+    }
+    return table;
+})();
+// ASCII identifier-continue table. Indexed by char code < 128; true if the
+// char can appear inside an identifier (letter, digit, underscore).
+// Building this at module load by querying isIdentifierChar lets the tight
+// identifier-swallow loop avoid function-call overhead entirely on the common
+// ASCII path. Non-ASCII chars fall back to the generic path.
+const _asciiIdentifierContinue = (() => {
+    const table = new Array(128).fill(false);
+    for (let i = 0; i < 128; i++) {
+        if ((0, characters_1.isIdentifierChar)(i)) {
+            table[i] = true;
+        }
+    }
+    return table;
+})();
+const _asciiIdentifierStart = (() => {
+    const table = new Array(128).fill(false);
+    for (let i = 0; i < 128; i++) {
+        if ((0, characters_1.isIdentifierStartChar)(i)) {
+            table[i] = true;
+        }
+    }
+    return table;
+})();
+// Create a detached copy of a source text range without going through Buffer.
+// Each charAt() for ASCII returns a V8-cached single-char string that does not
+// reference the parent. The concatenation chain becomes a ConsString independent
+// of the source text, avoiding V8 SlicedString memory pinning.
+// ~4-9x faster than Buffer.from(str,'utf8').toString('utf8') for typical
+// Python identifier lengths (5-20 chars).
+function detachSubstring(text, start, end) {
+    let result = '';
+    for (let i = start; i < end; i++) {
+        result += text.charAt(i);
+    }
+    return result;
+}
+// Strip underscore characters from a source text range without first creating
+// an intermediate substring.
+function removeUnderscoresFromRange(text, start, end) {
+    let firstUnderscoreIndex = -1;
+    for (let i = start; i < end; i++) {
+        if (text.charCodeAt(i) === 95 /* Char.Underscore */) {
+            firstUnderscoreIndex = i;
+            break;
+        }
+    }
+    if (firstUnderscoreIndex < 0) {
+        return text.slice(start, end);
+    }
+    let result = text.slice(start, firstUnderscoreIndex);
+    for (let i = firstUnderscoreIndex + 1; i < end; i++) {
+        if (text.charCodeAt(i) !== 95 /* Char.Underscore */) {
+            result += text[i];
+        }
+    }
+    return result;
+}
+// Manual replacement for magicsRegEx = /\\\s*$/
+// Checks whether the range [start, end) within `text` ends with a backslash
+// followed only by optional spaces, tabs, or form feeds (a subset of `\s`).
+function endsWithBackslashContinuation(text, start, end) {
+    let i = end - 1;
+    // Skip trailing whitespace
+    while (i >= start) {
+        const ch = text.charCodeAt(i);
+        if (ch === 32 /* Char.Space */ || ch === 9 /* Char.Tab */ || ch === 12 /* Char.FormFeed */) {
+            i--;
+        }
+        else {
+            break;
+        }
+    }
+    return i >= start && text.charCodeAt(i) === 92 /* Char.Backslash */;
+}
+// Parses a bracketed rule list starting at `pos` (which must point at '[').
+// Returns the bracket content (without brackets) and the position just past ']',
+// or undefined if the bracket is malformed (e.g. unclosed, or contains invalid chars
+// before a closing bracket is found).
+function parseIgnoreBracketContent(text, pos, rangeEnd, allowColon) {
+    pos++; // skip '['
+    const bracketStart = pos;
+    while (pos < rangeEnd && text.charCodeAt(pos) !== 93 /* Char.CloseBracket */) {
+        // Only allow valid bracket content chars: space, tab, ASCII \w, '-', ','
+        // (plus ':' for type: ignore to support tool-namespaced codes).
+        const bc = text.charCodeAt(pos);
+        if ((bc >= 97 /* Char.a */ && bc <= 122 /* Char.z */) ||
+            (bc >= 65 /* Char.A */ && bc <= 90 /* Char.Z */) ||
+            (bc >= 48 /* Char._0 */ && bc <= 57 /* Char._9 */) ||
+            bc === 95 /* Char.Underscore */ ||
+            bc === 45 /* Char.Hyphen */ ||
+            bc === 44 /* Char.Comma */ ||
+            bc === 32 /* Char.Space */ ||
+            bc === 9 /* Char.Tab */ ||
+            (allowColon && bc === 58 /* Char.Colon */)) {
+            pos++;
+        }
+        else {
+            break;
+        }
+    }
+    if (pos < rangeEnd && text.charCodeAt(pos) === 93 /* Char.CloseBracket */) {
+        return { content: text.slice(bracketStart, pos), newPos: pos + 1 };
+    }
+    return undefined;
+}
+// Manual replacement for typeIgnoreCommentRegEx / pyrightIgnoreCommentRegEx.
+// Scans `text` within [rangeStart, rangeEnd) for `<directive>: ignore [rules]`
+// where directive is 'type' or 'pyright'.
+// Returns { fullMatch, prefix, bracketContent, index } or undefined; `index` is absolute within `text`.
+function matchIgnoreDirective(text, rangeStart, rangeEnd, directive) {
+    // The directive can be preceded by optional `#` and whitespace, or
+    // appear at the start of the range with optional whitespace.
+    // type: ignore allows tool-namespaced codes (e.g. "ty:rule-name") in brackets;
+    // pyright: ignore does not.
+    const allowColonInBracket = directive === 'type';
+    let searchFrom = rangeStart;
+    while (searchFrom < rangeEnd) {
+        // Find the next occurrence of the directive keyword, bounded by
+        // rangeEnd. A bounded hand-rolled scan is important here: native
+        // String.prototype.indexOf has no end bound and, when the keyword is
+        // absent from the current comment but present elsewhere in the file,
+        // can scan well past rangeEnd — producing O(n) behavior per comment
+        // and O(n^2) overall on comment-heavy files.
+        const firstCharCode = directive.charCodeAt(0);
+        let directiveIdx = -1;
+        const scanLimit = rangeEnd - directive.length;
+        for (let i = searchFrom; i <= scanLimit; i++) {
+            if (text.charCodeAt(i) === firstCharCode) {
+                let found = true;
+                for (let d = 1; d < directive.length; d++) {
+                    if (text.charCodeAt(i + d) !== directive.charCodeAt(d)) {
+                        found = false;
+                        break;
+                    }
+                }
+                if (found) {
+                    directiveIdx = i;
+                    break;
+                }
+            }
+        }
+        if (directiveIdx < 0) {
+            return undefined;
+        }
+        // Determine the prefix: scan backward from directiveIdx to find
+        // the `#` or start-of-range, collecting whitespace.
+        let prefixStart = directiveIdx;
+        let foundAnchor = false;
+        // Walk backward over spaces/tabs
+        let j = directiveIdx - 1;
+        while (j >= rangeStart && (text.charCodeAt(j) === 32 /* Char.Space */ || text.charCodeAt(j) === 9 /* Char.Tab */)) {
+            j--;
+        }
+        if (j < rangeStart) {
+            // At start of range
+            prefixStart = rangeStart;
+            foundAnchor = true;
+        }
+        else if (text.charCodeAt(j) === 35 /* Char.Hash */) {
+            prefixStart = j;
+            foundAnchor = true;
+        }
+        if (!foundAnchor) {
+            searchFrom = directiveIdx + 1;
+            continue;
+        }
+        // After directive keyword, expect ':'
+        let pos = directiveIdx + directive.length;
+        if (pos >= rangeEnd || text.charCodeAt(pos) !== 58 /* Char.Colon */) {
+            searchFrom = directiveIdx + 1;
+            continue;
+        }
+        pos++; // skip ':'
+        // Skip optional whitespace after ':'
+        while (pos < rangeEnd && (text.charCodeAt(pos) === 32 /* Char.Space */ || text.charCodeAt(pos) === 9 /* Char.Tab */)) {
+            pos++;
+        }
+        // Expect 'ignore'
+        const ignoreStr = 'ignore';
+        if (pos + ignoreStr.length > rangeEnd) {
+            searchFrom = directiveIdx + 1;
+            continue;
+        }
+        let matched = true;
+        for (let k = 0; k < ignoreStr.length; k++) {
+            if (text.charCodeAt(pos + k) !== ignoreStr.charCodeAt(k)) {
+                matched = false;
+                break;
+            }
+        }
+        if (!matched) {
+            searchFrom = directiveIdx + 1;
+            continue;
+        }
+        pos += ignoreStr.length;
+        // After 'ignore', expect whitespace, '[', or end-of-range
+        let bracketContent;
+        if (pos >= rangeEnd) {
+            // End of range — valid
+        }
+        else {
+            const ch = text.charCodeAt(pos);
+            if (ch === 32 /* Char.Space */ || ch === 9 /* Char.Tab */) {
+                // Skip whitespace to check for optional bracket
+                while (pos < rangeEnd && (text.charCodeAt(pos) === 32 /* Char.Space */ || text.charCodeAt(pos) === 9 /* Char.Tab */)) {
+                    pos++;
+                }
+                if (pos < rangeEnd && text.charCodeAt(pos) === 91 /* Char.OpenBracket */) {
+                    const parsed = parseIgnoreBracketContent(text, pos, rangeEnd, allowColonInBracket);
+                    if (parsed === undefined) {
+                        searchFrom = directiveIdx + 1;
+                        continue;
+                    }
+                    bracketContent = parsed.content;
+                    pos = parsed.newPos;
+                }
+            }
+            else if (ch === 91 /* Char.OpenBracket */) {
+                // Bracket immediately after 'ignore'
+                const parsed = parseIgnoreBracketContent(text, pos, rangeEnd, allowColonInBracket);
+                if (parsed === undefined) {
+                    searchFrom = directiveIdx + 1;
+                    continue;
+                }
+                bracketContent = parsed.content;
+                pos = parsed.newPos;
+            }
+            else {
+                // No space, no bracket — not a valid match
+                searchFrom = directiveIdx + 1;
+                continue;
+            }
+        }
+        const prefix = text.slice(prefixStart, directiveIdx);
+        const fullMatch = text.slice(prefixStart, pos);
+        return {
+            fullMatch,
+            prefix,
+            bracketContent,
+            index: prefixStart,
+        };
+    }
+    return undefined;
+}
 var MagicsKind;
 (function (MagicsKind) {
     MagicsKind[MagicsKind["None"] = 0] = "None";
@@ -128,6 +490,10 @@ class Tokenizer {
         this._parenDepth = 0;
         this._lineRanges = [];
         this._indentAmounts = [];
+        // Cached answer to "are there any non-trivial tokens yet?" Once true it
+        // stays true, so the O(n) scan in _handleComment only runs while the token
+        // stream consists purely of NewLine / Indent tokens.
+        this._hasTokenBeforeIgnoreAll = false;
         this._typeIgnoreLines = new Map();
         this._pyrightIgnoreLines = new Map();
         this._fStringStack = [];
@@ -150,10 +516,7 @@ class Tokenizer {
         this._doubleQuoteCount = 0;
         // Assume Jupyter notebook tokenization rules?
         this._useNotebookMode = false;
-        // Intern identifier strings within a single tokenization pass. This reduces
-        // per-identifier allocations while still ensuring we don't retain substrings
-        // that reference the original source text.
-        this._identifierInternedStrings = new Map();
+        this._identifierCache = new Array(Tokenizer._identifierCacheSize);
     }
     tokenize(text, start, length, initialParenDepth = 0, useNotebookMode = false) {
         if (start === undefined) {
@@ -179,7 +542,8 @@ class Tokenizer {
         this._lineRanges = [];
         this._indentAmounts = [];
         this._useNotebookMode = useNotebookMode;
-        this._identifierInternedStrings.clear();
+        // Clear per-source identifier intern cache.
+        this._identifierCache.fill(undefined);
         const end = start + length;
         if (start === 0) {
             this._readIndentationAfterNewLine();
@@ -315,19 +679,22 @@ class Tokenizer {
     // tokens onto the token list. Returns true if the caller should advance
     // to the next character.
     _handleCharacter() {
-        // f-strings, b-strings, etc
-        const stringPrefixLength = this._getStringPrefixLength();
-        if (stringPrefixLength >= 0) {
-            let stringPrefix = '';
-            if (stringPrefixLength > 0) {
-                stringPrefix = this._cs.getText().slice(this._cs.position, this._cs.position + stringPrefixLength);
-                // Indeed a string
-                this._cs.advance(stringPrefixLength);
-            }
-            const quoteTypeFlags = this._getQuoteTypeFlags(stringPrefix);
-            if (quoteTypeFlags !== 0 /* StringTokenFlags.None */) {
-                this._handleString(quoteTypeFlags, stringPrefixLength);
-                return true;
+        // f-strings, b-strings, etc — only check if current char can start a string
+        const currentChar = this._cs.currentChar;
+        if (currentChar < 128 && _canStartString[currentChar]) {
+            const stringPrefixLength = this._getStringPrefixLength();
+            if (stringPrefixLength >= 0) {
+                let stringPrefix = '';
+                if (stringPrefixLength > 0) {
+                    stringPrefix = this._cs.getText().slice(this._cs.position, this._cs.position + stringPrefixLength);
+                    // Indeed a string
+                    this._cs.advance(stringPrefixLength);
+                }
+                const quoteTypeFlags = this._getQuoteTypeFlags(stringPrefix);
+                if (quoteTypeFlags !== 0 /* StringTokenFlags.None */) {
+                    this._handleString(quoteTypeFlags, stringPrefixLength);
+                    return true;
+                }
             }
         }
         if (this._cs.currentChar === 35 /* Char.Hash */) {
@@ -657,48 +1024,105 @@ class Tokenizer {
         }
     }
     _tryIdentifier() {
-        const swallowRemainingChars = () => {
-            while (true) {
-                if ((0, characters_1.isIdentifierChar)(this._cs.currentChar)) {
-                    this._cs.moveNext();
-                }
-                else if ((0, characters_1.isIdentifierChar)(this._cs.currentChar, this._cs.nextChar)) {
-                    this._cs.moveNext();
-                    this._cs.moveNext();
+        const cs = this._cs;
+        const text = cs.getText();
+        const textLen = text.length;
+        const start = cs.position;
+        // Fast path for ASCII identifier start. Avoids the function call and
+        // surrogate logic for the common case (Python source is overwhelmingly
+        // ASCII identifiers).
+        const firstChar = cs.currentChar;
+        let pos = start;
+        if (firstChar < 128) {
+            if (!_asciiIdentifierStart[firstChar]) {
+                // Not an identifier start and not a surrogate candidate.
+                return false;
+            }
+            pos++;
+            // Tight loop: advance while we're still in ASCII identifier chars.
+            while (pos < textLen) {
+                const ch = text.charCodeAt(pos);
+                if (ch < 128 && _asciiIdentifierContinue[ch]) {
+                    pos++;
                 }
                 else {
                     break;
                 }
             }
-        };
-        const start = this._cs.position;
-        if ((0, characters_1.isIdentifierStartChar)(this._cs.currentChar)) {
-            this._cs.moveNext();
-            swallowRemainingChars();
+            // If we hit a non-ASCII char, fall back to the generic loop to
+            // handle possible unicode identifier continue / surrogate pairs.
+            if (pos < textLen && text.charCodeAt(pos) >= 128) {
+                cs.advance(pos - start);
+                this._swallowNonAsciiIdentifierChars();
+                pos = cs.position;
+            }
+            else {
+                cs.advance(pos - start);
+            }
         }
-        else if ((0, characters_1.isIdentifierStartChar)(this._cs.currentChar, this._cs.nextChar)) {
-            this._cs.moveNext();
-            this._cs.moveNext();
-            swallowRemainingChars();
+        else {
+            // Non-ASCII start: use the generic path (supports surrogates).
+            if ((0, characters_1.isIdentifierStartChar)(firstChar)) {
+                cs.moveNext();
+            }
+            else if ((0, characters_1.isIdentifierStartChar)(firstChar, cs.nextChar)) {
+                cs.moveNext();
+                cs.moveNext();
+            }
+            else {
+                return false;
+            }
+            this._swallowNonAsciiIdentifierChars();
+            pos = cs.position;
         }
-        if (this._cs.position > start) {
-            const value = this._cs.getText().slice(start, this._cs.position);
-            const keywordType = _keywords.get(value);
+        if (pos > start) {
+            const end = pos;
+            const length = end - start;
+            const keywordType = getKeywordTypeFromTextSlice(text, start, length);
             if (keywordType !== undefined) {
-                this._tokens.push(tokenizerTypes_1.KeywordToken.create(start, this._cs.position - start, keywordType, this._getComments()));
+                this._tokens.push(tokenizerTypes_1.KeywordToken.create(start, length, keywordType, this._getComments()));
             }
             else {
-                const internedValue = this._identifierInternedStrings.get(value) ?? this._internIdentifierString(value);
-                this._tokens.push(tokenizerTypes_1.IdentifierToken.create(start, this._cs.position - start, internedValue, this._getComments()));
+                const value = this._internIdentifier(text, start, end, length);
+                this._tokens.push(tokenizerTypes_1.IdentifierToken.create(start, length, value, this._getComments()));
             }
             return true;
         }
         return false;
     }
-    _internIdentifierString(value) {
-        const clonedValue = (0, core_1.cloneStr)(value);
-        this._identifierInternedStrings.set(clonedValue, clonedValue);
-        return clonedValue;
+    // Per-tokenize identifier intern cache. Direct-mapped, so collisions
+    // simply replace the slot. Common non-keyword identifiers (self, cls,
+    // str, int, dict, etc.) get deduplicated to a single string object,
+    // avoiding repeated detachSubstring allocations for the same name.
+    _internIdentifier(text, start, end, length) {
+        const firstChar = text.charCodeAt(start);
+        const lastChar = text.charCodeAt(end - 1);
+        // Hash mixes length, first and last char; multiplier values chosen
+        // to spread hits for common short identifiers across the table.
+        const hash = (firstChar * 31 + lastChar * 7 + length) & Tokenizer._identifierCacheMask;
+        const cached = this._identifierCache[hash];
+        if (cached !== undefined && cached.length === length && text.startsWith(cached, start)) {
+            return cached;
+        }
+        const value = detachSubstring(text, start, end);
+        this._identifierCache[hash] = value;
+        return value;
+    }
+    // Generic identifier-continue loop that handles unicode + surrogate pairs.
+    // Falls back to this when the fast ASCII loop encounters a non-ASCII char.
+    _swallowNonAsciiIdentifierChars() {
+        while (true) {
+            if ((0, characters_1.isIdentifierChar)(this._cs.currentChar)) {
+                this._cs.moveNext();
+            }
+            else if ((0, characters_1.isIdentifierChar)(this._cs.currentChar, this._cs.nextChar)) {
+                this._cs.moveNext();
+                this._cs.moveNext();
+            }
+            else {
+                break;
+            }
+        }
     }
     _isPossibleNumber() {
         if ((0, characters_1.isDecimal)(this._cs.currentChar)) {
@@ -743,8 +1167,9 @@ class Tokenizer {
                 radix = 8;
             }
             if (radix > 0) {
-                const text = this._cs.getText().slice(start, this._cs.position);
-                const simpleIntText = text.replace(underscoreRegEx, '');
+                const end = this._cs.position;
+                const text = this._cs.getText();
+                const simpleIntText = removeUnderscoresFromRange(text, start, end);
                 let intValue = parseInt(simpleIntText.slice(leadingChars), radix);
                 if (!isNaN(intValue)) {
                     const bigIntValue = BigInt(simpleIntText);
@@ -753,7 +1178,7 @@ class Tokenizer {
                         intValue > Number.MAX_SAFE_INTEGER) {
                         intValue = bigIntValue;
                     }
-                    this._tokens.push(tokenizerTypes_1.NumberToken.create(start, text.length, intValue, true, false, this._getComments()));
+                    this._tokens.push(tokenizerTypes_1.NumberToken.create(start, end - start, intValue, true, false, this._getComments()));
                     return true;
                 }
             }
@@ -787,11 +1212,13 @@ class Tokenizer {
                     (this._cs.currentChar < 49 /* Char._1 */ || this._cs.currentChar > 57 /* Char._9 */);
         }
         if (isDecimalInteger) {
-            let text = this._cs.getText().slice(start, this._cs.position);
-            const simpleIntText = text.replace(underscoreRegEx, '');
+            const textEnd = this._cs.position;
+            const sourceText = this._cs.getText();
+            const simpleIntText = removeUnderscoresFromRange(sourceText, start, textEnd);
             let intValue = parseInt(simpleIntText, 10);
             if (!isNaN(intValue)) {
                 let isImaginary = false;
+                let tokenLength = textEnd - start;
                 const bigIntValue = BigInt(simpleIntText);
                 if (!isFinite(intValue) ||
                     bigIntValue < Number.MIN_SAFE_INTEGER ||
@@ -800,10 +1227,10 @@ class Tokenizer {
                 }
                 if (this._cs.currentChar === 106 /* Char.j */ || this._cs.currentChar === 74 /* Char.J */) {
                     isImaginary = true;
-                    text += String.fromCharCode(this._cs.currentChar);
                     this._cs.moveNext();
+                    tokenLength += 1;
                 }
-                this._tokens.push(tokenizerTypes_1.NumberToken.create(start, text.length, intValue, true, isImaginary, this._getComments()));
+                this._tokens.push(tokenizerTypes_1.NumberToken.create(start, tokenLength, intValue, true, isImaginary, this._getComments()));
                 return true;
             }
         }
@@ -812,16 +1239,18 @@ class Tokenizer {
         if (mightBeFloatingPoint ||
             (this._cs.currentChar === 46 /* Char.Period */ && this._cs.nextChar >= 48 /* Char._0 */ && this._cs.nextChar <= 57 /* Char._9 */)) {
             if (this._skipFloatingPointCandidate()) {
-                let text = this._cs.getText().slice(start, this._cs.position);
-                const value = parseFloat(text.replace(underscoreRegEx, ''));
+                const floatEnd = this._cs.position;
+                const floatText = removeUnderscoresFromRange(this._cs.getText(), start, floatEnd);
+                const value = parseFloat(floatText);
                 if (!isNaN(value)) {
                     let isImaginary = false;
+                    let tokenLength = floatEnd - start;
                     if (this._cs.currentChar === 106 /* Char.j */ || this._cs.currentChar === 74 /* Char.J */) {
                         isImaginary = true;
-                        text += String.fromCharCode(this._cs.currentChar);
                         this._cs.moveNext();
+                        tokenLength += 1;
                     }
-                    this._tokens.push(tokenizerTypes_1.NumberToken.create(start, this._cs.position - start, value, false, isImaginary, this._getComments()));
+                    this._tokens.push(tokenizerTypes_1.NumberToken.create(start, tokenLength, value, false, isImaginary, this._getComments()));
                     return true;
                 }
             }
@@ -830,122 +1259,63 @@ class Tokenizer {
         return false;
     }
     _tryOperator() {
+        const currentChar = this._cs.currentChar;
         let length = 0;
         const nextChar = this._cs.nextChar;
         let operatorType;
-        switch (this._cs.currentChar) {
-            case 43 /* Char.Plus */:
-                length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                operatorType = length === 2 ? 1 /* OperatorType.AddEqual */ : 0 /* OperatorType.Add */;
-                break;
-            case 38 /* Char.Ampersand */:
-                length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                operatorType = length === 2 ? 4 /* OperatorType.BitwiseAndEqual */ : 3 /* OperatorType.BitwiseAnd */;
-                break;
-            case 124 /* Char.Bar */:
-                length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                operatorType = length === 2 ? 7 /* OperatorType.BitwiseOrEqual */ : 6 /* OperatorType.BitwiseOr */;
-                break;
-            case 94 /* Char.Caret */:
-                length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                operatorType = length === 2 ? 9 /* OperatorType.BitwiseXorEqual */ : 8 /* OperatorType.BitwiseXor */;
-                break;
-            case 61 /* Char.Equal */:
-                if (this._activeFString?.activeReplacementField &&
-                    this._activeFString?.activeReplacementField.parenDepth === this._parenDepth &&
-                    !this._activeFString.activeReplacementField.inFormatSpecifier &&
-                    nextChar !== 61 /* Char.Equal */) {
-                    length = 1;
-                    operatorType = 2 /* OperatorType.Assign */;
-                    break;
-                }
-                length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                operatorType = length === 2 ? 12 /* OperatorType.Equals */ : 2 /* OperatorType.Assign */;
-                break;
-            case 33 /* Char.ExclamationMark */:
-                if (nextChar !== 61 /* Char.Equal */) {
-                    if (this._activeFString) {
-                        // Handle the conversion separator (!) within an f-string.
-                        this._tokens.push(tokenizerTypes_1.Token.create(23 /* TokenType.ExclamationMark */, this._cs.position, 1, this._getComments()));
-                        this._cs.advance(1);
-                        return true;
-                    }
-                    return false;
-                }
-                length = 2;
-                operatorType = 28 /* OperatorType.NotEquals */;
-                break;
-            case 37 /* Char.Percent */:
-                length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                operatorType = length === 2 ? 25 /* OperatorType.ModEqual */ : 24 /* OperatorType.Mod */;
-                break;
-            case 126 /* Char.Tilde */:
-                length = 1;
-                operatorType = 5 /* OperatorType.BitwiseInvert */;
-                break;
-            case 45 /* Char.Hyphen */:
-                if (nextChar === 62 /* Char.Greater */) {
-                    this._tokens.push(tokenizerTypes_1.Token.create(21 /* TokenType.Arrow */, this._cs.position, 2, this._getComments()));
-                    this._cs.advance(2);
+        if (currentChar < 128 && nextChar < 128) {
+            const twoCharKey = (currentChar << 8) | nextChar;
+            const specialTokenType = _twoCharSpecialTokenTypeMap.get(twoCharKey);
+            if (specialTokenType !== undefined) {
+                this._tokens.push(tokenizerTypes_1.Token.create(specialTokenType, this._cs.position, 2, this._getComments()));
+                this._cs.advance(2);
+                return true;
+            }
+            const twoCharOperatorType = _twoCharOperatorTypeMap.get(twoCharKey);
+            if (twoCharOperatorType !== undefined) {
+                this._tokens.push(tokenizerTypes_1.OperatorToken.create(this._cs.position, 2, twoCharOperatorType, this._getComments()));
+                this._cs.advance(2);
+                return true;
+            }
+            if (currentChar === nextChar) {
+                const repeatedOperatorType = _repeatedCharOperatorTypeTable[currentChar];
+                if (repeatedOperatorType !== _unsetSingleCharOperatorType) {
+                    const hasTrailingEqual = this._cs.lookAhead(2) === 61 /* Char.Equal */;
+                    const repeatedLength = hasTrailingEqual ? 3 : 2;
+                    const operatorType = hasTrailingEqual
+                        ? _repeatedCharEqualOperatorTypeTable[currentChar]
+                        : repeatedOperatorType;
+                    this._tokens.push(tokenizerTypes_1.OperatorToken.create(this._cs.position, repeatedLength, operatorType, this._getComments()));
+                    this._cs.advance(repeatedLength);
                     return true;
                 }
-                length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                operatorType = length === 2 ? 34 /* OperatorType.SubtractEqual */ : 33 /* OperatorType.Subtract */;
-                break;
-            case 42 /* Char.Asterisk */:
-                if (nextChar === 42 /* Char.Asterisk */) {
-                    length = this._cs.lookAhead(2) === 61 /* Char.Equal */ ? 3 : 2;
-                    operatorType = length === 3 ? 30 /* OperatorType.PowerEqual */ : 29 /* OperatorType.Power */;
-                }
-                else {
-                    length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                    operatorType = length === 2 ? 27 /* OperatorType.MultiplyEqual */ : 26 /* OperatorType.Multiply */;
-                }
-                break;
-            case 47 /* Char.Slash */:
-                if (nextChar === 47 /* Char.Slash */) {
-                    length = this._cs.lookAhead(2) === 61 /* Char.Equal */ ? 3 : 2;
-                    operatorType = length === 3 ? 14 /* OperatorType.FloorDivideEqual */ : 13 /* OperatorType.FloorDivide */;
-                }
-                else {
-                    length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                    operatorType = length === 2 ? 11 /* OperatorType.DivideEqual */ : 10 /* OperatorType.Divide */;
-                }
-                break;
-            case 60 /* Char.Less */:
-                if (nextChar === 60 /* Char.Less */) {
-                    length = this._cs.lookAhead(2) === 61 /* Char.Equal */ ? 3 : 2;
-                    operatorType = length === 3 ? 18 /* OperatorType.LeftShiftEqual */ : 17 /* OperatorType.LeftShift */;
-                }
-                else if (nextChar === 62 /* Char.Greater */) {
+            }
+        }
+        if (currentChar < 128) {
+            const singleCharOperatorType = _singleCharOperatorTypeTable[currentChar];
+            if (singleCharOperatorType !== _unsetSingleCharOperatorType) {
+                const equalOperatorType = _singleCharEqualOperatorTypeTable[currentChar];
+                if (nextChar === 61 /* Char.Equal */ && equalOperatorType !== _unsetSingleCharOperatorType) {
                     length = 2;
-                    operatorType = 19 /* OperatorType.LessOrGreaterThan */;
+                    operatorType = equalOperatorType;
                 }
                 else {
-                    length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                    operatorType = length === 2 ? 21 /* OperatorType.LessThanOrEqual */ : 20 /* OperatorType.LessThan */;
-                }
-                break;
-            case 62 /* Char.Greater */:
-                if (nextChar === 62 /* Char.Greater */) {
-                    length = this._cs.lookAhead(2) === 61 /* Char.Equal */ ? 3 : 2;
-                    operatorType = length === 3 ? 32 /* OperatorType.RightShiftEqual */ : 31 /* OperatorType.RightShift */;
-                }
-                else {
-                    length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                    operatorType = length === 2 ? 16 /* OperatorType.GreaterThanOrEqual */ : 15 /* OperatorType.GreaterThan */;
+                    length = 1;
+                    operatorType = singleCharOperatorType;
                 }
-                break;
-            case 64 /* Char.At */:
-                length = nextChar === 61 /* Char.Equal */ ? 2 : 1;
-                operatorType = length === 2 ? 23 /* OperatorType.MatrixMultiplyEqual */ : 22 /* OperatorType.MatrixMultiply */;
-                break;
-            default:
-                return false;
+                this._tokens.push(tokenizerTypes_1.OperatorToken.create(this._cs.position, length, operatorType, this._getComments()));
+                this._cs.advance(length);
+                return true;
+            }
         }
-        this._tokens.push(tokenizerTypes_1.OperatorToken.create(this._cs.position, length, operatorType, this._getComments()));
-        this._cs.advance(length);
-        return length > 0;
+        // `!=` is handled by the 2-char fast path above.
+        if (currentChar === 33 /* Char.ExclamationMark */ && this._activeFString) {
+            // Handle the conversion separator (!) within an f-string.
+            this._tokens.push(tokenizerTypes_1.Token.create(23 /* TokenType.ExclamationMark */, this._cs.position, 1, this._getComments()));
+            this._cs.advance(1);
+            return true;
+        }
+        return false;
     }
     _handleInvalid() {
         const start = this._cs.position;
@@ -994,16 +1364,15 @@ class Tokenizer {
     }
     _handleIPythonMagics(type) {
         const start = this._cs.position + 1;
+        const sourceText = this._cs.getText();
         let begin = start;
         while (true) {
             this._cs.skipToEol();
             if (type === 1 /* CommentType.IPythonMagic */ || type === 2 /* CommentType.IPythonShellEscape */) {
-                const length = this._cs.position - begin;
-                const value = this._cs.getText().slice(begin, begin + length);
                 // is it multiline magics?
                 // %magic command \
                 //        next arguments
-                if (!value.match(magicsRegEx)) {
+                if (!endsWithBackslashContinuation(sourceText, begin, this._cs.position)) {
                     break;
                 }
             }
@@ -1014,55 +1383,74 @@ class Tokenizer {
             }
         }
         const length = this._cs.position - start;
-        const comment = tokenizerTypes_1.Comment.create(start, length, this._cs.getText().slice(start, start + length), type);
+        const comment = tokenizerTypes_1.Comment.create(start, length, sourceText.slice(start, start + length), type);
         this._addComments(comment);
     }
     _handleComment() {
         const start = this._cs.position + 1;
         this._cs.skipToEol();
         const length = this._cs.position - start;
-        const comment = tokenizerTypes_1.Comment.create(start, length, this._cs.getText().slice(start, start + length));
-        const typeIgnoreRegexMatch = comment.value.match(typeIgnoreCommentRegEx);
-        if (typeIgnoreRegexMatch) {
-            const commentStart = start + (typeIgnoreRegexMatch.index ?? 0);
-            const textRange = {
-                start: commentStart + typeIgnoreRegexMatch[1].length,
-                length: typeIgnoreRegexMatch[0].length - typeIgnoreRegexMatch[1].length,
-            };
-            const ignoreComment = {
-                range: textRange,
-                rulesList: this._getIgnoreCommentRulesList(commentStart, typeIgnoreRegexMatch),
-            };
-            if (this._tokens.findIndex((t) => t.type !== 2 /* TokenType.NewLine */ && t && t.type !== 3 /* TokenType.Indent */) < 0) {
-                this._typeIgnoreAll = ignoreComment;
+        const sourceText = this._cs.getText();
+        const end = start + length;
+        // Fast pre-filter: any ignore directive must contain the substring 'ignore'.
+        // indexOf is a highly-optimized native call and lets us skip the full
+        // directive scan for the vast majority of comments (which are free-form text).
+        const ignoreIdx = sourceText.indexOf('ignore', start);
+        if (ignoreIdx >= 0 && ignoreIdx < end) {
+            const typeIgnoreMatch = matchIgnoreDirective(sourceText, start, end, 'type');
+            if (typeIgnoreMatch) {
+                const commentStart = typeIgnoreMatch.index;
+                const textRange = {
+                    start: commentStart + typeIgnoreMatch.prefix.length,
+                    length: typeIgnoreMatch.fullMatch.length - typeIgnoreMatch.prefix.length,
+                };
+                const ignoreComment = {
+                    range: textRange,
+                    rulesList: this._getIgnoreCommentRulesList(commentStart, typeIgnoreMatch),
+                };
+                let isIgnoreAll = false;
+                if (!this._hasTokenBeforeIgnoreAll) {
+                    // Are there any tokens other than NewLine / Indent yet?
+                    const hasOther = this._tokens.some((t) => t && t.type !== 2 /* TokenType.NewLine */ && t.type !== 3 /* TokenType.Indent */);
+                    if (hasOther) {
+                        this._hasTokenBeforeIgnoreAll = true;
+                    }
+                    else {
+                        isIgnoreAll = true;
+                    }
+                }
+                if (isIgnoreAll) {
+                    this._typeIgnoreAll = ignoreComment;
+                }
+                else {
+                    this._typeIgnoreLines.set(this._lineRanges.length, ignoreComment);
+                }
             }
-            else {
-                this._typeIgnoreLines.set(this._lineRanges.length, ignoreComment);
+            const pyrightIgnoreMatch = matchIgnoreDirective(sourceText, start, end, 'pyright');
+            if (pyrightIgnoreMatch) {
+                const commentStart = pyrightIgnoreMatch.index;
+                const textRange = {
+                    start: commentStart + pyrightIgnoreMatch.prefix.length,
+                    length: pyrightIgnoreMatch.fullMatch.length - pyrightIgnoreMatch.prefix.length,
+                };
+                const ignoreComment = {
+                    range: textRange,
+                    rulesList: this._getIgnoreCommentRulesList(commentStart, pyrightIgnoreMatch),
+                };
+                this._pyrightIgnoreLines.set(this._lineRanges.length, ignoreComment);
             }
         }
-        const pyrightIgnoreRegexMatch = comment.value.match(pyrightIgnoreCommentRegEx);
-        if (pyrightIgnoreRegexMatch) {
-            const commentStart = start + (pyrightIgnoreRegexMatch.index ?? 0);
-            const textRange = {
-                start: commentStart + pyrightIgnoreRegexMatch[1].length,
-                length: pyrightIgnoreRegexMatch[0].length - pyrightIgnoreRegexMatch[1].length,
-            };
-            const ignoreComment = {
-                range: textRange,
-                rulesList: this._getIgnoreCommentRulesList(commentStart, pyrightIgnoreRegexMatch),
-            };
-            this._pyrightIgnoreLines.set(this._lineRanges.length, ignoreComment);
-        }
+        const comment = tokenizerTypes_1.Comment.create(start, length, sourceText.slice(start, end));
         this._addComments(comment);
     }
     // Extracts the individual rules within a "type: ignore [x, y, z]" comment.
     _getIgnoreCommentRulesList(start, match) {
-        if (match.length < 5 || match[4] === undefined) {
+        if (match.bracketContent === undefined) {
             return undefined;
         }
-        const splitElements = match[4].split(',');
+        const splitElements = match.bracketContent.split(',');
         const commentRules = [];
-        let currentOffset = start + match[0].indexOf('[') + 1;
+        let currentOffset = start + match.fullMatch.indexOf('[') + 1;
         for (const element of splitElements) {
             const frontTrimmed = element.trimStart();
             currentOffset += element.length - frontTrimmed.length;
@@ -1417,4 +1805,12 @@ class Tokenizer {
     }
 }
 exports.Tokenizer = Tokenizer;
+// Direct-mapped identifier intern cache. Indexed by a cheap hash of
+// (firstChar, lastChar, length). On a hit (slot defined and string
+// equals the current source range), reuse the cached string instead of
+// re-allocating via detachSubstring. Collisions simply overwrite the
+// slot — no chaining, O(1) lookup, no Map overhead. Sized as a power of
+// two so the mask is a single AND.
+Tokenizer._identifierCacheSize = 2048;
+Tokenizer._identifierCacheMask = Tokenizer._identifierCacheSize - 1;
 //# sourceMappingURL=tokenizer.js.map