npm - @nlptools/distance - Versions diffs - 0.0.4 → 0.0.5 - Mend

@nlptools/distance 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -257,46 +257,46 @@ const result = diff("abc", "ac");
 ## Performance
-Benchmark: 1000 iterations per pair, same test data across all runtimes.
+Benchmark: same test data across all runtimes. TS/WASM via `vitest bench` (V8 JIT), Rust via `cargo test --release`.
 Unit: microseconds per operation (us/op).
 ### Edit Distance
 | Algorithm       | Size            | TS (V8 JIT) | WASM (via JS) | Rust (native) |
 | --------------- | --------------- | ----------- | ------------- | ------------- |
-| levenshtein     | Short (<10)     | 0.3         | 7.9           | 0.11          |
-| levenshtein     | Medium (10-100) | 1.3         | 116.2         | 0.98          |
-| levenshtein     | Long (>200)     | 15.2        | 2,877.5       | 39.68         |
-| levenshteinNorm | Short           | 0.3         | 7.9           | 0.11          |
-| lcs             | Short (<10)     | 1.6         | 16.5          | 0.41          |
-| lcs             | Medium (10-100) | 6.8         | 272.6         | 3.22          |
-| lcs             | Long (>200)     | 217.8       | 6,574.1       | 122.63        |
-| lcsNorm         | Short           | 1.7         | 16.2          | 0.48          |
+| levenshtein     | Short (<10)     | 0.3         | 1.0           | 0.24          |
+| levenshtein     | Medium (10-100) | 1.3         | 4.8           | 2.00          |
+| levenshtein     | Long (>200)     | 13.9        | 102.3         | 61.77         |
+| levenshteinNorm | Short           | 0.3         | 1.0           | 0.19          |
+| lcs             | Short (<10)     | 1.7         | 1.9           | 0.69          |
+| lcs             | Medium (10-100) | 6.8         | 10.1          | 7.70          |
+| lcs             | Long (>200)     | 216.0       | 161.8         | 151.84        |
+| lcsNorm         | Short           | 1.7         | 1.9           | 0.42          |
 ### Token Similarity (Character Multiset)
 | Algorithm | Size            | TS (V8 JIT) | WASM (via JS) | Rust (native) |
 | --------- | --------------- | ----------- | ------------- | ------------- |
-| jaccard   | Short (<10)     | 0.8         | 25.2          | 0.42          |
-| jaccard   | Medium (10-100) | 0.8         | 74.3          | 1.55          |
-| jaccard   | Long (>200)     | 1.6         | 171.5         | 5.54          |
-| cosine    | Short (<10)     | 0.8         | 19.3          | 0.32          |
-| cosine    | Medium (10-100) | 0.8         | 61.4          | 1.35          |
-| cosine    | Long (>200)     | 1.5         | 158.5         | 4.77          |
-| sorensen  | Short (<10)     | 0.7         | 19.3          | 0.33          |
-| sorensen  | Medium (10-100) | 0.7         | 61.0          | 1.33          |
-| sorensen  | Long (>200)     | 1.5         | 160.0         | 4.46          |
+| jaccard   | Short (<10)     | 0.8         | 3.4           | 0.63          |
+| jaccard   | Medium (10-100) | 0.8         | 8.6           | 2.67          |
+| jaccard   | Long (>200)     | 1.5         | 18.9          | 7.25          |
+| cosine    | Short (<10)     | 1.0         | 2.6           | 0.43          |
+| cosine    | Medium (10-100) | 0.8         | 7.0           | 1.56          |
+| cosine    | Long (>200)     | 1.7         | 17.2          | 6.23          |
+| sorensen  | Short (<10)     | 0.7         | 2.6           | 0.56          |
+| sorensen  | Medium (10-100) | 0.7         | 7.0           | 2.27          |
+| sorensen  | Long (>200)     | 1.4         | 17.4          | 6.48          |
 ### Bigram Variants
 | Algorithm     | Size            | TS (V8 JIT) | WASM (via JS) | Rust (native) |
 | ------------- | --------------- | ----------- | ------------- | ------------- |
-| jaccardBigram | Short (<10)     | 1.1         | 27.4          | 0.45          |
-| jaccardBigram | Medium (10-100) | 7.7         | 160.4         | 3.86          |
-| cosineBigram  | Short (<10)     | 0.8         | 21.2          | 0.36          |
-| cosineBigram  | Medium (10-100) | 5.9         | 127.0         | 3.12          |
+| jaccardBigram | Short (<10)     | 1.1         | 3.5           | 0.67          |
+| jaccardBigram | Medium (10-100) | 7.5         | 18.1          | 4.80          |
+| cosineBigram  | Short (<10)     | 0.7         | 2.8           | 0.43          |
+| cosineBigram  | Medium (10-100) | 5.4         | 14.0          | 4.04          |
-TS implementations use V8 JIT optimization + `Int32Array` ASCII fast path + integer-encoded bigrams, avoiding JS-WASM boundary overhead entirely.
+TS implementations use `Int32Array` ASCII fast path + integer-encoded bigrams, avoiding JS-WASM boundary overhead. For compute-heavy algorithms on long strings (e.g. LCS), WASM via JS and native Rust can outperform TS because the native computation advantage outweighs the boundary cost.
 ### Fuzzy Search: NLPTools vs Fuse.js

package/dist/index.d.mts CHANGED Viewed

@@ -65,6 +65,260 @@ declare function lcsLength(a: string, b: string, algorithm?: "myers" | "dp"): nu
  */
 declare function lcsPairs(a: string, b: string, algorithm?: "myers" | "dp"): Array<[number, number]>;
 //#endregion
+//#region src/edit/jaro.d.ts
+/**
+ * Jaro and Jaro-Winkler similarity algorithms.
+ *
+ * Jaro measures similarity between two strings by considering matching characters
+ * and transpositions. Jaro-Winkler extends Jaro with a prefix bonus.
+ *
+ * Time: O(m * n)
+ */
+/**
+ * Compute Jaro similarity between two strings.
+ *
+ * J(S1, S2) = (1/3) * (m/|S1| + m/|S2| + (m - t/2) / m)
+ *
+ * where m = number of matching characters (within window),
+ *       t = number of matched characters that are out of order (t/2 transpositions).
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Jaro similarity in [0, 1]
+ */
+declare function jaro(a: string, b: string): number;
+/**
+ * Options for Jaro-Winkler similarity.
+ */
+interface IJaroWinklerOptions {
+  /**
+   * Weight applied to the common prefix bonus.
+   * @default 0.1
+   */
+  prefixWeight?: number;
+  /**
+   * Maximum length of common prefix to consider.
+   * @default 4
+   */
+  maxPrefix?: number;
+}
+/**
+ * Compute Jaro-Winkler similarity between two strings.
+ *
+ * JW(S1, S2) = Jaro(S1, S2) + l * p * (1 - Jaro(S1, S2))
+ *
+ * where l = length of common prefix (up to maxPrefix),
+ *       p = prefixWeight.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @param options - Configuration
+ * @returns Jaro-Winkler similarity; stays in [0, 1] as long as prefixWeight * maxPrefix <= 1 (true for the defaults)
+ */
+declare function jaroWinkler(a: string, b: string, options?: IJaroWinklerOptions): number;
+//#endregion
+//#region src/edit/damerau.d.ts
+/**
+ * Damerau-Levenshtein distance (unrestricted variant).
+ *
+ * Extension of Levenshtein that allows transpositions of adjacent characters,
+ * even when substrings are edited multiple times.
+ *
+ * Matches the default behavior of textdistance.rs (restricted = false).
+ *
+ * Time: O(m * n), Space: O(m * n)
+ */
+/**
+ * Compute the Damerau-Levenshtein distance between two strings.
+ *
+ * Allows insertions, deletions, substitutions, and transpositions of
+ * adjacent characters. This is the unrestricted variant, which correctly
+ * handles cases where a substring is edited more than once.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Edit distance (non-negative integer)
+ */
+declare function damerauLevenshtein(a: string, b: string): number;
+/**
+ * Compute the normalized Damerau-Levenshtein similarity in [0, 1].
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Similarity score where 1 means identical
+ */
+declare function damerauLevenshteinNormalized(a: string, b: string): number;
+//#endregion
+//#region src/edit/hamming.d.ts
+/**
+ * Hamming distance — counts character mismatches between equal-length strings.
+ *
+ * Time: O(min(m, n))
+ */
+/**
+ * Compute the Hamming distance between two strings.
+ *
+ * If strings have different lengths, only compares up to the shorter length
+ * and adds the length difference as additional mismatches.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Number of mismatching characters
+ */
+declare function hamming(a: string, b: string): number;
+/**
+ * Compute the normalized Hamming similarity in [0, 1].
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Similarity score where 1 means identical
+ */
+declare function hammingNormalized(a: string, b: string): number;
+//#endregion
+//#region src/edit/lcs-str.d.ts
+/**
+ * Longest Common Substring (contiguous) algorithms.
+ *
+ * Unlike LCS (subsequence), this requires the matching characters to be contiguous.
+ *
+ * Time: O(m * n), Space: O(min(m, n))
+ */
+/**
+ * Compute the length of the Longest Common Substring.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Length of the longest common substring
+ */
+declare function lcsSubstringLength(a: string, b: string): number;
+/**
+ * Compute the LCS substring distance: len(a) + len(b) - 2 * lcsSubstringLength.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns LCS substring distance (non-negative integer)
+ */
+declare function lcsSubstringDistance(a: string, b: string): number;
+/**
+ * Compute the normalized LCS substring similarity in [0, 1].
+ *
+ * Normalized by max(len(a), len(b)) to match textdistance.rs convention.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Similarity score where 1 means identical
+ */
+declare function lcsSubstringNormalized(a: string, b: string): number;
+//#endregion
+//#region src/edit/sift4.d.ts
+/**
+ * SIFT4 simple — fast approximate string distance.
+ *
+ * A fast algorithm for approximate string matching with O(n) complexity
+ * in typical cases. Returns a distance value (lower = more similar).
+ *
+ * Matches the textdistance.rs implementation exactly.
+ *
+ * Time: O(n * maxOffset)
+ */
+/**
+ * Options for SIFT4.
+ */
+interface ISift4Options {
+  /**
+   * Maximum offset for character matching.
+   * @default 5
+   */
+  maxOffset?: number;
+}
+/**
+ * Compute the SIFT4 simple distance between two strings.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @param options - Configuration
+ * @returns Distance (non-negative integer)
+ */
+declare function sift4(a: string, b: string, options?: ISift4Options): number;
+/**
+ * Compute the normalized SIFT4 similarity in [0, 1].
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @param options - Configuration
+ * @returns Similarity score where 1 means identical
+ */
+declare function sift4Normalized(a: string, b: string, options?: ISift4Options): number;
+//#endregion
+//#region src/edit/ratcliff.d.ts
+/**
+ * Ratcliff-Obershelp algorithm — Gestalt pattern matching.
+ *
+ * Iteratively finds the longest common substring using a stack-based approach,
+ * combining scores from both sides. Returns a similarity in [0, 1].
+ *
+ * Based on the textdistance.rs implementation.
+ *
+ * Time: O(n * m * log(n * m)) worst case, O(n + m) average
+ */
+/**
+ * Compute Ratcliff-Obershelp similarity between two strings.
+ *
+ * Uses an iterative stack-based approach to avoid stack overflow on
+ * very different strings. The algorithm recursively finds the longest
+ * common substring and combines similarity scores from both sides.
+ *
+ * similarity = 2 * M / T, where M = total matched characters, T = total characters
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Ratcliff-Obershelp similarity in [0, 1]
+ */
+declare function ratcliff(a: string, b: string): number;
+//#endregion
+//#region src/edit/smith-waterman.d.ts
+/**
+ * Smith-Waterman local sequence alignment algorithm.
+ *
+ * Designed for biological sequence alignment, it finds the best
+ * local alignment between two sequences.
+ *
+ * Default scoring: match=1, mismatch=0, gap=-1 (matches textdistance.rs)
+ *
+ * Time: O(m * n), Space: O(m * n)
+ */
+/**
+ * Options for Smith-Waterman alignment.
+ */
+interface ISmithWatermanOptions {
+  /** Score for matching characters. @default 1 */
+  matchScore?: number;
+  /** Score for mismatching characters. @default 0 */
+  mismatchScore?: number;
+  /** Score penalty for a gap. @default -1 */
+  gapScore?: number;
+}
+/**
+ * Compute the raw Smith-Waterman alignment score.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @param options - Scoring parameters
+ * @returns Raw alignment score (non-negative)
+ */
+declare function smithWaterman(a: string, b: string, options?: ISmithWatermanOptions): number;
+/**
+ * Compute the normalized Smith-Waterman similarity in [0, 1].
+ *
+ * Normalized by matchScore * max(len(a), len(b)), matching textdistance.rs convention.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @param options - Scoring parameters
+ * @returns Normalized similarity in [0, 1]
+ */
+declare function smithWatermanNormalized(a: string, b: string, options?: ISmithWatermanOptions): number;
+//#endregion
 //#region src/token/jaccard.d.ts
 /**
  * Jaccard similarity between two strings based on character-level multiset.
@@ -95,10 +349,10 @@ declare function jaccardNgram(a: string, b: string, n?: number): number;
 /**
  * Cosine similarity between two strings based on character-level multiset.
  *
- * cos(A, B) = (A · B) / (|A| * |B|)
+ * Uses textdistance.rs convention:
+ *   cosine(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
  *
- * Uses Counter (frequency map) for multiset semantics,
- * matching the textdistance crate behavior.
+ * Where intersect_count = sum(min(freqA[c], freqB[c])) and count = sum of frequencies.
  *
  * Time: O(m + n)
  *
@@ -110,6 +364,9 @@ declare function cosine(a: string, b: string): number;
 /**
  * Cosine similarity based on character n-grams.
  *
+ * Uses textdistance.rs convention (same as character-level cosine but on n-grams):
+ *   cosine_ngram(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
+ *
  * @param a - First string
  * @param b - Second string
  * @param n - N-gram size (default: 2)
@@ -143,6 +400,96 @@ declare function sorensen(a: string, b: string): number;
  */
 declare function sorensenNgram(a: string, b: string, n?: number): number;
 //#endregion
+//#region src/token/tversky.d.ts
+/**
+ * Tversky index — asymmetric set similarity measure.
+ *
+ * Reduces to Jaccard when alpha = beta = 1.
+ * Reduces to Sorensen-Dice when alpha = beta = 0.5.
+ *
+ * Time: O(m + n)
+ */
+/**
+ * Options for Tversky index.
+ */
+interface ITverskyOptions {
+  /**
+   * Weight for elements unique to the first set (a).
+   * @default 1
+   */
+  alpha?: number;
+  /**
+   * Weight for elements unique to the second set (b).
+   * @default 1
+   */
+  beta?: number;
+}
+/**
+ * Compute the Tversky index between two strings based on character multiset.
+ *
+ * T(A, B; α, β) = |A ∩ B| / (|A ∩ B| + α|A \ B| + β|B \ A|)
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @param options - alpha and beta weights
+ * @returns Tversky index in [0, 1]
+ */
+declare function tversky(a: string, b: string, options?: ITverskyOptions): number;
+//#endregion
+//#region src/token/overlap.d.ts
+/**
+ * Overlap coefficient — set similarity normalized by the smaller set.
+ *
+ * overlap(A, B) = |A ∩ B| / min(|A|, |B|)
+ *
+ * Time: O(m + n)
+ */
+/**
+ * Compute the overlap coefficient between two strings based on character multiset.
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Overlap coefficient in [0, 1]
+ */
+declare function overlap(a: string, b: string): number;
+//#endregion
+//#region src/token/naive.d.ts
+/**
+ * Naive string similarity measures: prefix, suffix, length.
+ *
+ * Time: O(min(m, n)) for prefix/suffix, O(1) for length
+ */
+/**
+ * Compute prefix similarity between two strings.
+ *
+ * prefix(a, b) = commonPrefixLength / max(|a|, |b|)
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Prefix similarity in [0, 1]
+ */
+declare function prefix(a: string, b: string): number;
+/**
+ * Compute suffix similarity between two strings.
+ *
+ * suffix(a, b) = commonSuffixLength / max(|a|, |b|)
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Suffix similarity in [0, 1]
+ */
+declare function suffix(a: string, b: string): number;
+/**
+ * Compute length-based similarity between two strings.
+ *
+ * length(a, b) = 1 - |len(a) - len(b)| / max(len(a), len(b))
+ *
+ * @param a - First string
+ * @param b - Second string
+ * @returns Normalized length similarity in [0, 1]
+ */
+declare function length(a: string, b: string): number;
+//#endregion
 //#region src/hash/simhash.d.ts
 interface ISimHashOptions {
   /**
@@ -501,10 +848,10 @@ interface IFuzzySearchOptions {
    */
   limit?: number;
   /**
-   * Whether search should be case-insensitive.
-   * When true, both the query and the item strings are lowercased
-   * before comparison.
-   * @default false (case-insensitive by default)
+   * Whether search should be case-sensitive.
+   * When false (default), both the query and the item strings are lowercased
+   * before comparison (case-insensitive search).
+   * @default false
    */
   caseSensitive?: boolean;
   /**
@@ -677,4 +1024,72 @@ declare class FuzzySearch<T> {
  */
 declare function findBestMatch<T>(query: string, collection: ReadonlyArray<T>, options?: IFindBestMatchOptions): ISearchResult<T> | null;
 //#endregion
-export { BuiltinAlgorithm, DiffType, FuzzySearch, type IDiffItem, type IDiffOptions, IFindBestMatchOptions, IFuzzySearchOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISearchKey, ISearchResult, ISearchResultWithDetails, ISimHashOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, SimilarityFn, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
+//#region src/utils.d.ts
+/**
+ * Generate character n-grams from a string.
+ *
+ * @param str - Input string
+ * @param n - N-gram size (default: 2 for bigrams)
+ */
+declare function ngrams(str: string, n?: number): string[];
+/**
+ * Build an n-gram frequency map using integer-encoded keys.
+ * Encodes n characters into a single number to avoid string allocation
+ * and speed up Map hashing.
+ *
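+ * For ASCII bigrams: key = (c1 << 8) | c2 (fits in 16 bits).
+ * Per the declared return type, null is returned when integer encoding does not
+ * apply (non-ASCII input or n > 2), so the caller can fall back to string keys.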
+ */
+declare function ngramFrequencyMap(str: string, n?: number): Map<number, number> | null;
+/**
+ * Build a frequency map (Counter/multiset) from an iterable of tokens.
+ * Matches the behavior of Rust's textdistance Counter.
+ */
+declare function frequencyMap(tokens: Iterable<string>): Map<string, number>;
+/**
+ * Build a character-level frequency map from a string.
+ * This is the default tokenization strategy used by textdistance.
+ */
+declare function charFrequencyMap(str: string): Map<string, number>;
+/** Size of the ASCII frequency array (covers charCode 0-127). */
+declare const CHAR_FREQ_SIZE = 128;
+/**
+ * Build a character frequency array from a string.
+ * Returns false if any character is non-ASCII (charCode >= 128).
+ * The caller must zero the array before use.
+ */
+declare function buildCharFreqArray(arr: Int32Array, str: string): boolean;
+/**
+ * Count intersect size between two frequency maps.
+ * For each key, takes the minimum count (multiset intersection).
+ */
+declare function intersectCount(a: Map<string, number>, b: Map<string, number>): number;
+/**
+ * Count union size between two frequency maps.
+ * For each key, takes the maximum count (multiset union).
+ */
+declare function unionCount(a: Map<string, number>, b: Map<string, number>): number;
+/**
+ * Get total token count from a frequency map.
+ */
+declare function totalCount(map: Map<string, number>): number;
+declare function intersectCountInt(a: Map<number, number>, b: Map<number, number>): number;
+declare function unionCountInt(a: Map<number, number>, b: Map<number, number>): number;
+declare function totalCountInt(map: Map<number, number>): number;
+/**
+ * Normalize a raw distance to a similarity score in [0, 1].
+ *
+ * @param distance - Raw distance value
+ * @param maxDistance - Maximum possible distance (usually max(len(a), len(b)))
+ */
+declare function normalize(distance: number, maxDistance: number): number;
+/**
+ * FNV-1a hash for strings. Fast, good distribution for hash-based algorithms.
+ */
+declare function fnv1a(str: string): number;
+/**
+ * Combine two hashes into one (for generating multiple independent hash values).
+ */
+declare function combineHash(a: number, b: number): number;
+//#endregion
+export { BuiltinAlgorithm, CHAR_FREQ_SIZE, DiffType, FuzzySearch, type IDiffItem, type IDiffOptions, IFindBestMatchOptions, IFuzzySearchOptions, IJaroWinklerOptions, ILSHOptions, type ILcs, type ILcsAlgorithm, IMinHashOptions, ISearchKey, ISearchResult, ISearchResultWithDetails, ISift4Options, ISimHashOptions, ISmithWatermanOptions, ITverskyOptions, LSH, LcsPairsFunc, LcsSizeFunc, MinHash, SimHasher, SimilarityFn, buildCharFreqArray, charFrequencyMap, combineHash, cosine, cosineNgram, damerauLevenshtein, damerauLevenshteinNormalized, diff, findBestMatch, fnv1a, frequencyMap, hamming, hammingDistance, hammingNormalized, hammingSimilarity, intersectCount, intersectCountInt, jaccard, jaccardNgram, jaro, jaroWinkler, lcsDistance, lcsLength, lcsNormalized, lcsPairs, lcsSubstringDistance, lcsSubstringLength, lcsSubstringNormalized, length, levenshtein, levenshteinNormalized, ngramFrequencyMap, ngrams, normalize, overlap, prefix, ratcliff, sift4, sift4Normalized, simhash, smithWaterman, smithWatermanNormalized, sorensen, sorensenNgram, stringEquals, suffix, totalCount, totalCountInt, tversky, unionCount, unionCountInt };

package/dist/index.mjs CHANGED Viewed

@@ -53,6 +53,8 @@ function frequencyMap(tokens) {
 function charFrequencyMap(str) {
 	return frequencyMap(str);
 }
+/** Size of the ASCII frequency array (covers charCode 0-127). */
+const CHAR_FREQ_SIZE = 128;
 /**
 * Build a character frequency array from a string.
 * Returns false if any character is non-ASCII (charCode >= 128).
@@ -148,6 +150,13 @@ function fnv1a(str) {
 	}
 	return hash >>> 0;
 }
+/**
+* Combine two hashes into one (for generating multiple independent hash values).
+*/
+function combineHash(a, b) {
+	a ^= b + 2654435769 + (a << 6) + (a >>> 2);
+	return a >>> 0;
+}
 //#endregion
 //#region src/edit/levenshtein.ts
 /**
@@ -206,7 +215,9 @@ function lcsDistance(a, b, algorithm = "myers") {
 * @returns Similarity score where 1 means identical
 */
 function lcsNormalized(a, b, algorithm = "myers") {
-	return normalize(lcsDistance(a, b, algorithm), a.length + b.length);
+	const maxLen = Math.max(a.length, b.length);
+	if (maxLen === 0) return 1;
+	return lcsLength(a, b, algorithm) / maxLen;
 }
 /**
 * Get the length of the Longest Common Subsequence.
@@ -231,6 +242,458 @@ function lcsPairs(a, b, algorithm = "myers") {
 	return (algorithm === "dp" ? lcs_dp : lcs_myers_linear_space)(a.length, b.length, stringEquals(a, b));
 }
 //#endregion
+//#region src/edit/jaro.ts
+/**
+* Jaro and Jaro-Winkler similarity algorithms.
+*
+* Jaro measures similarity between two strings by considering matching characters
+* and transpositions. Jaro-Winkler extends Jaro with a prefix bonus.
+*
+* Time: O(m * n)
+*/
+/**
+* Compute Jaro similarity between two strings.
+*
+* J(S1, S2) = (1/3) * (m/|S1| + m/|S2| + (m - t/2) / m)
+*
+* where m = number of matching characters (within window),
+*       t = number of transpositions among matching characters.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Jaro similarity in [0, 1]
+*/
+function jaro(a, b) {
+	const aLen = a.length;
+	const bLen = b.length;
+	if (aLen === 0 && bLen === 0) return 1;
+	if (aLen === 0 || bLen === 0) return 0;
+	const matchDistance = Math.floor(Math.max(aLen, bLen) / 2) - 1;
+	if (matchDistance < 0) return 0;
+	const aMatches = new Uint8Array(aLen);
+	const bMatches = new Uint8Array(bLen);
+	let matches = 0;
+	let transpositions = 0;
+	for (let i = 0; i < aLen; i++) {
+		const start = Math.max(0, i - matchDistance);
+		const end = Math.min(i + matchDistance + 1, bLen);
+		for (let j = start; j < end; j++) {
+			if (bMatches[j] || a.charCodeAt(i) !== b.charCodeAt(j)) continue;
+			aMatches[i] = 1;
+			bMatches[j] = 1;
+			matches++;
+			break;
+		}
+	}
+	if (matches === 0) return 0;
+	let k = 0;
+	for (let i = 0; i < aLen; i++) {
+		if (!aMatches[i]) continue;
+		while (!bMatches[k]) k++;
+		if (a.charCodeAt(i) !== b.charCodeAt(k)) transpositions++;
+		k++;
+	}
+	return (matches / aLen + matches / bLen + (matches - transpositions / 2) / matches) / 3;
+}
+/**
+* Compute Jaro-Winkler similarity between two strings.
+*
+* JW(S1, S2) = Jaro(S1, S2) + l * p * (1 - Jaro(S1, S2))
+*
+* where l = length of common prefix (up to maxPrefix),
+*       p = prefixWeight.
+*
+* @param a - First string
+* @param b - Second string
+* @param options - Configuration
+* @returns Jaro-Winkler similarity; stays in [0, 1] as long as prefixWeight * maxPrefix <= 1 (true for the defaults)
+*/
+function jaroWinkler(a, b, options = {}) {
+	const p = options.prefixWeight ?? .1;
+	const maxPrefix = options.maxPrefix ?? 4;
+	const jaroScore = jaro(a, b);
+	let l = 0;
+	const minLen = Math.min(a.length, b.length, maxPrefix);
+	while (l < minLen && a.charCodeAt(l) === b.charCodeAt(l)) l++;
+	return jaroScore + l * p * (1 - jaroScore);
+}
+//#endregion
+//#region src/edit/damerau.ts
+/**
+* Damerau-Levenshtein distance (unrestricted variant).
+*
+* Extension of Levenshtein that allows transpositions of adjacent characters,
+* even when substrings are edited multiple times.
+*
+* Matches the default behavior of textdistance.rs (restricted = false).
+*
+* Time: O(m * n), Space: O(m * n)
+*/
+/**
+* Compute the Damerau-Levenshtein distance between two strings.
+*
+* Allows insertions, deletions, substitutions, and transpositions of
+* adjacent characters. This is the unrestricted variant, which correctly
+* handles cases where a substring is edited more than once.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Edit distance (non-negative integer)
+*/
+function damerauLevenshtein(a, b) {
+	const aLen = a.length;
+	const bLen = b.length;
+	// Distance to or from an empty string is the other string's length.
+	if (aLen === 0) return bLen;
+	if (bLen === 0) return aLen;
+	// Sentinel stored in the outer border row/column of the matrix.
+	const maxDist = aLen + bLen;
+	// Row stride of the flat matrix: bLen + 2 columns per row.
+	const w = bLen + 2;
+	// Flat (aLen + 2) x (bLen + 2) matrix; the cell for prefixes of length
+	// (i, j) lives at index (i + 1) * w + (j + 1).
+	const mat = new Uint32Array((aLen + 2) * w);
+	mat[0] = maxDist;
+	// First column: sentinel border, then the cost of deleting i characters.
+	for (let i = 0; i <= aLen; i++) {
+		mat[(i + 1) * w] = maxDist;
+		mat[(i + 1) * w + 1] = i;
+	}
+	// First row: sentinel border, then the cost of inserting j characters.
+	for (let j = 0; j <= bLen; j++) {
+		mat[j + 1] = maxDist;
+		mat[w + j + 1] = j;
+	}
+	// Char code -> 1-based index of the most recent row of a (processed so far)
+	// holding that character; absent entries are read as 0.
+	const lastSeen = /* @__PURE__ */ new Map();
+	for (let i = 0; i < aLen; i++) {
+		// 1-based column of the latest match found in this row (0 = none yet).
+		let db = 0;
+		const aChar = a.charCodeAt(i);
+		const i1 = i + 1;
+		for (let j = 0; j < bLen; j++) {
+			const j1 = j + 1;
+			const bChar = b.charCodeAt(j);
+			const last = lastSeen.get(bChar) ?? 0;
+			const subCost = aChar === bChar ? 0 : 1;
+			// Index of the cell being filled: prefixes of length (i1, j1).
+			const base = (i1 + 1) * w + j1 + 1;
+			// Diagonal neighbour plus substitution cost.
+			const sub = mat[i1 * w + j1] + subCost;
+			// Left neighbour (same row, previous column) plus one edit.
+			const del = mat[(i1 + 1) * w + j1] + 1;
+			// Upper neighbour (previous row, same column) plus one edit.
+			const ins = mat[i1 * w + j1 + 1] + 1;
+			// Transposition: start from the cell just before (last, db), pay for
+			// the characters skipped in between on both sides, plus 1 for the swap.
+			const trans = mat[last * w + db] + i1 + j1 - 2 + 1 - last - db;
+			mat[base] = Math.min(sub, del, ins, trans);
+			// Updated only after trans is computed, so trans sees the previous match column.
+			if (aChar === bChar) db = j1;
+		}
+		lastSeen.set(aChar, i1);
+	}
+	// Bottom-right cell: distance between the full strings.
+	return mat[(aLen + 1) * w + bLen + 1];
+}
+/**
+* Compute the normalized Damerau-Levenshtein similarity in [0, 1].
+*
+* @param a - First string
+* @param b - Second string
+* @returns Similarity score where 1 means identical
+*/
+function damerauLevenshteinNormalized(a, b) {
+	return normalize(damerauLevenshtein(a, b), Math.max(a.length, b.length));
+}
+//#endregion
+//#region src/edit/hamming.ts
+/**
+* Hamming distance — counts character mismatches between equal-length strings.
+*
+* Time: O(min(m, n))
+*/
+/**
+* Compute the Hamming distance between two strings.
+*
+* If strings have different lengths, only compares up to the shorter length
+* and adds the length difference as additional mismatches.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Number of mismatching characters
+*/
+function hamming(a, b) {
+	const minLen = Math.min(a.length, b.length);
+	let count = Math.abs(a.length - b.length);
+	for (let i = 0; i < minLen; i++) if (a.charCodeAt(i) !== b.charCodeAt(i)) count++;
+	return count;
+}
+/**
+* Compute the normalized Hamming similarity in [0, 1].
+*
+* @param a - First string
+* @param b - Second string
+* @returns Similarity score where 1 means identical
+*/
+function hammingNormalized(a, b) {
+	const maxLen = Math.max(a.length, b.length);
+	return maxLen === 0 ? 1 : 1 - hamming(a, b) / maxLen;
+}
+//#endregion
+//#region src/edit/lcs-str.ts
+/**
+* Longest Common Substring (contiguous) algorithms.
+*
+* Unlike LCS (subsequence), this requires the matching characters to be contiguous.
+*
+* Time: O(m * n), Space: O(min(m, n))
+*/
+/**
+* Compute the length of the Longest Common Substring.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Length of the longest common substring
+*/
+function lcsSubstringLength(a, b) {
+	const aLen = a.length;
+	const bLen = b.length;
+	if (aLen === 0 || bLen === 0) return 0;
+	let maxLen = 0;
+	const dp = new Uint32Array(bLen + 1);
+	for (let i = 1; i <= aLen; i++) {
+		let prev = 0;
+		for (let j = 1; j <= bLen; j++) {
+			const temp = dp[j];
+			if (a.charCodeAt(i - 1) === b.charCodeAt(j - 1)) {
+				dp[j] = prev + 1;
+				if (dp[j] > maxLen) maxLen = dp[j];
+			} else dp[j] = 0;
+			prev = temp;
+		}
+	}
+	return maxLen;
+}
+/**
+* Compute the LCS substring distance: len(a) + len(b) - 2 * lcsSubstringLength.
+*
+* @param a - First string
+* @param b - Second string
+* @returns LCS substring distance (non-negative integer)
+*/
+function lcsSubstringDistance(a, b) {
+	return a.length + b.length - 2 * lcsSubstringLength(a, b);
+}
+/**
+* Compute the normalized LCS substring similarity in [0, 1].
+*
+* Normalized by max(len(a), len(b)) to match textdistance.rs convention.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Similarity score where 1 means identical
+*/
+function lcsSubstringNormalized(a, b) {
+	const maxLen = Math.max(a.length, b.length);
+	if (maxLen === 0) return 1;
+	return lcsSubstringLength(a, b) / maxLen;
+}
+//#endregion
+//#region src/edit/sift4.ts
+/**
+* SIFT4 simple — fast approximate string distance.
+*
+* A fast algorithm for approximate string matching with O(n) complexity
+* in typical cases. Returns a distance value (lower = more similar).
+*
+* Matches the textdistance.rs implementation exactly.
+*
+* Time: O(n * maxOffset)
+*/
+/**
+* Compute the SIFT4 simple distance between two strings.
+*
+* @param a - First string
+* @param b - Second string
+* @param options - Configuration
+* @returns Distance (non-negative integer)
+*/
+function sift4(a, b, options = {}) {
+	const maxOffset = options.maxOffset ?? 5;
+	const aLen = a.length;
+	const bLen = b.length;
+	// Cursors into a and b respectively.
+	let c1 = 0;
+	let c2 = 0;
+	// Total length of the matching runs that have already been closed.
+	let lcss = 0;
+	// Length of the matching run currently in progress.
+	let localCs = 0;
+	while (c1 < aLen && c2 < bLen) {
+		if (a.charCodeAt(c1) === b.charCodeAt(c2)) localCs++;
+		else {
+			// Mismatch: close the current run and bank its length.
+			lcss += localCs;
+			localCs = 0;
+			// If the cursors have drifted apart, realign both to the smaller one.
+			if (c1 !== c2) {
+				c1 = Math.min(c1, c2);
+				c2 = c1;
+			}
+			// Look ahead up to maxOffset positions in either string for a
+			// character that matches the other cursor.
+			for (let offset = 0; offset < maxOffset; offset++) {
+				if (!(c1 + 1 < aLen || c2 + offset < bLen)) break;
+				if (c1 + offset < aLen && a.charCodeAt(c1 + offset) === b.charCodeAt(c2)) {
+					c1 += offset;
+					localCs++;
+					break;
+				}
+				if (c2 + offset < bLen && a.charCodeAt(c1) === b.charCodeAt(c2 + offset)) {
+					c2 += offset;
+					localCs++;
+					break;
+				}
+			}
+		}
+		c1++;
+		c2++;
+	}
+	// Distance = longer length minus all matched characters (closed runs + open run).
+	return Math.max(aLen, bLen) - lcss - localCs;
+}
+/**
+* Compute the normalized SIFT4 similarity in [0, 1].
+*
+* Passes the raw sift4 distance and max(len(a), len(b)) to the shared
+* `normalize` helper. NOTE(review): `normalize` is defined elsewhere in the
+* bundle; presumably it maps distance / max to a similarity and handles a
+* zero maximum — confirm there.
+*
+* @param a - First string
+* @param b - Second string
+* @param options - Configuration, forwarded unchanged to sift4
+* @returns Similarity score where 1 means identical
+*/
+function sift4Normalized(a, b, options = {}) {
+	return normalize(sift4(a, b, options), Math.max(a.length, b.length));
+}
+//#endregion
+//#region src/edit/ratcliff.ts
+/**
+* Ratcliff-Obershelp algorithm — Gestalt pattern matching.
+*
+* Iteratively finds the longest common substring using a stack-based approach,
+* combining scores from both sides. Returns a similarity in [0, 1].
+*
+* Based on the textdistance.rs implementation.
+*
+* Time: every findLCS call costs O(n * m) on the two slices it receives, so
+* one ratcliff call is at least O(n * m) whenever the inputs differ and are
+* both non-empty.
+*/
+/**
+* Internal: find the longest common substring and return its length and positions.
+*
+* Uses a single rolling row of the classic dynamic-programming table, so
+* extra memory is O(len(b)). When several substrings share the maximum
+* length, the first one reached (scanning a in the outer loop and b in the
+* inner loop) is reported.
+*
+* @param a - First string
+* @param b - Second string
+* @returns `{ len, aIdx, bIdx }`: length of the match and its start offsets
+*   in a and b; all zero when either input is empty or nothing matches
+*/
+function findLCS(a, b) {
+	const aLen = a.length;
+	const bLen = b.length;
+	if (aLen === 0 || bLen === 0) return {
+		len: 0,
+		aIdx: 0,
+		bIdx: 0
+	};
+	let maxLen = 0;
+	let endI = 0; // 1-based end position of the best match in a
+	let endJ = 0; // 1-based end position of the best match in b
+	const dp = new Uint32Array(bLen + 1); // dp[j]: length of the common run ending at a[i - 1] and b[j - 1]
+	for (let i = 1; i <= aLen; i++) {
+		let prev = 0; // previous row's dp[j - 1], i.e. the diagonal neighbour
+		for (let j = 1; j <= bLen; j++) {
+			const temp = dp[j]; // keep the previous row's value before overwriting it
+			if (a.charCodeAt(i - 1) === b.charCodeAt(j - 1)) {
+				dp[j] = prev + 1;
+				if (dp[j] > maxLen) { // strict comparison keeps the first longest run found
+					maxLen = dp[j];
+					endI = i;
+					endJ = j;
+				}
+			} else dp[j] = 0;
+			prev = temp;
+		}
+	}
+	return {
+		len: maxLen,
+		aIdx: endI - maxLen,
+		bIdx: endJ - maxLen
+	};
+}
+/**
+* Compute Ratcliff-Obershelp similarity between two strings.
+*
+* Uses an explicit stack of [aStart, aEnd, bStart, bEnd] windows instead of
+* recursion. For each window the longest common substring is located, its
+* length is added to the match total, and the windows to the right and to
+* the left of it are queued when both sides are non-empty.
+*
+* similarity = 2 * M / T, where M = total matched characters, T = total characters
+*
+* @param a - First string
+* @param b - Second string
+* @returns Ratcliff-Obershelp similarity in [0, 1]; 1 for identical inputs,
+*   including two empty strings
+*/
+function ratcliff(a, b) {
+	if (a === b) return 1; // fast path, also covers two empty strings
+	const totalLen = a.length + b.length;
+	if (totalLen === 0) return 1; // cannot be reached after the a === b check; kept as a divide-by-zero guard
+	let totalMatch = 0;
+	const stack = [[
+		0,
+		a.length,
+		0,
+		b.length
+	]];
+	while (stack.length > 0) {
+		const [aStart, aEnd, bStart, bEnd] = stack.pop();
+		if (aEnd - aStart === 0 || bEnd - bStart === 0) continue;
+		const lcs = findLCS(a.slice(aStart, aEnd), b.slice(bStart, bEnd)); // offsets in the result are relative to the slices
+		if (lcs.len === 0) continue;
+		totalMatch += lcs.len;
+		const aRightStart = aStart + lcs.aIdx + lcs.len;
+		const bRightStart = bStart + lcs.bIdx + lcs.len;
+		if (aEnd - aRightStart > 0 && bEnd - bRightStart > 0) stack.push([
+			aRightStart,
+			aEnd,
+			bRightStart,
+			bEnd
+		]);
+		const aLeftEnd = aStart + lcs.aIdx;
+		const bLeftEnd = bStart + lcs.bIdx;
+		if (aLeftEnd - aStart > 0 && bLeftEnd - bStart > 0) stack.push([
+			aStart,
+			aLeftEnd,
+			bStart,
+			bLeftEnd
+		]);
+	}
+	return 2 * totalMatch / totalLen;
+}
+//#endregion
+//#region src/edit/smith-waterman.ts
+/**
+* Compute the raw Smith-Waterman alignment score.
+*
+* Fills a full (len(a) + 1) x (len(b) + 1) score table stored row-major in
+* one typed array. Each cell is the maximum of 0, the diagonal neighbour plus
+* the match or mismatch score, and the upper and left neighbours plus the gap
+* score. The value returned is the bottom-right cell of the table.
+*
+* NOTE(review): classic Smith-Waterman reports the maximum cell of the table;
+* returning the last cell is presumably the textdistance convention — confirm.
+*
+* @param a - First string
+* @param b - Second string
+* @param options - Scoring parameters: `matchScore` (default 1),
+*   `mismatchScore` (default 0), `gapScore` (default -1)
+* @returns Raw alignment score (non-negative)
+*/
+function smithWaterman(a, b, options = {}) {
+	const matchScore = options.matchScore ?? 1;
+	const mismatchScore = options.mismatchScore ?? 0;
+	const gapScore = options.gapScore ?? -1;
+	const aLen = a.length;
+	const bLen = b.length;
+	const w = bLen + 1; // row width of the flattened table
+	const dp = new Int16Array((aLen + 1) * w); // NOTE(review): 16-bit signed cells truncate fractional scores and wrap above 32767
+	dp.fill(0); // redundant: typed arrays are zero-initialised on creation
+	for (let i = 1; i <= aLen; i++) {
+		const rowBase = i * w;
+		const prevRowBase = (i - 1) * w;
+		for (let j = 1; j <= bLen; j++) {
+			const cost = a.charCodeAt(i - 1) === b.charCodeAt(j - 1) ? matchScore : mismatchScore;
+			const diag = dp[prevRowBase + j - 1] + cost;
+			const up = dp[prevRowBase + j] + gapScore;
+			const left = dp[rowBase + j - 1] + gapScore;
+			dp[rowBase + j] = Math.max(0, diag, up, left); // clamping at 0 is what makes the alignment local
+		}
+	}
+	return dp[aLen * w + bLen]; // bottom-right cell, not the table maximum
+}
+/**
+* Compute the normalized Smith-Waterman similarity in [0, 1].
+*
+* Normalized by matchScore * max(len(a), len(b)), matching textdistance.rs convention.
+* When that product is 0 — both strings empty, or matchScore set to 0 — the
+* function returns 1 without computing an alignment.
+*
+* @param a - First string
+* @param b - Second string
+* @param options - Scoring parameters, forwarded unchanged to smithWaterman;
+*   `matchScore` defaults to 1
+* @returns Normalized similarity in [0, 1]
+*/
+function smithWatermanNormalized(a, b, options = {}) {
+	const maxPossible = (options.matchScore ?? 1) * Math.max(a.length, b.length);
+	if (maxPossible === 0) return 1; // avoids dividing by zero
+	return smithWaterman(a, b, options) / maxPossible;
+}
+//#endregion
 //#region src/token/jaccard.ts
 const _freqA$2 = new Int32Array(128);
 const _freqB$2 = new Int32Array(128);
@@ -299,10 +762,10 @@ const _freqB$1 = new Int32Array(128);
 /**
 * Cosine similarity between two strings based on character-level multiset.
 *
-* cos(A, B) = (A · B) / (|A| * |B|)
+* Uses textdistance.rs convention:
+*   cosine(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
 *
-* Uses Counter (frequency map) for multiset semantics,
-* matching the textdistance crate behavior.
+* Where intersect_count = sum(min(freqA[c], freqB[c])) and count = sum of frequencies.
 *
 * Time: O(m + n)
 *
@@ -314,43 +777,49 @@ function cosine(a, b) {
 	_freqA$1.fill(0);
 	_freqB$1.fill(0);
 	if (buildCharFreqArray(_freqA$1, a) && buildCharFreqArray(_freqB$1, b)) {
-		let dot = 0;
-		let normA = 0;
-		let normB = 0;
+		let intersection = 0;
+		let totalA = 0;
+		let totalB = 0;
 		for (let i = 0; i < 128; i++) {
 			const va = _freqA$1[i];
 			const vb = _freqB$1[i];
-			dot += va * vb;
-			normA += va * va;
-			normB += vb * vb;
+			intersection += va < vb ? va : vb;
+			totalA += va;
+			totalB += vb;
 		}
-		const denominator = Math.sqrt(normA) * Math.sqrt(normB);
-		return denominator === 0 ? 1 : dot / denominator;
+		if (totalA === 0 && totalB === 0) return 1;
+		if (totalA === 0 || totalB === 0) return 0;
+		return intersection / Math.sqrt(totalA * totalB);
 	}
 	const freqAMap = charFrequencyMap(a);
 	const freqBMap = charFrequencyMap(b);
-	let dotProduct = 0;
-	let normA = 0;
-	let normB = 0;
-	const [smaller, larger] = freqAMap.size <= freqBMap.size ? [freqAMap, freqBMap] : [freqBMap, freqAMap];
-	for (const [char, countA] of smaller) {
-		const countB = larger.get(char) ?? 0;
-		dotProduct += countA * countB;
-		normA += countA * countA;
-	}
-	for (const [, count] of larger) normB += count * count;
-	if (freqAMap.size > freqBMap.size) {
-		const tmp = normA;
-		normA = normB;
-		normB = tmp;
+	const intersection = intersectCount$1(freqAMap, freqBMap);
+	const totalA = totalCount$1(freqAMap);
+	const totalB = totalCount$1(freqBMap);
+	if (totalA === 0 && totalB === 0) return 1;
+	if (totalA === 0 || totalB === 0) return 0;
+	return intersection / Math.sqrt(totalA * totalB);
+}
+function intersectCount$1(a, b) {
+	let count = 0;
+	const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
+	for (const [key, countA] of smaller) {
+		const countB = larger.get(key);
+		if (countB !== void 0) count += countA < countB ? countA : countB;
 	}
-	const denominator = Math.sqrt(normA) * Math.sqrt(normB);
-	if (denominator === 0) return 1;
-	return dotProduct / denominator;
+	return count;
+}
+function totalCount$1(map) {
+	let count = 0;
+	for (const c of map.values()) count += c;
+	return count;
 }
 /**
 * Cosine similarity based on character n-grams.
 *
+* Uses textdistance.rs convention (same as character-level cosine but on n-grams):
+*   cosine_ngram(A, B) = intersect_count(A, B) / sqrt(count(A) * count(B))
+*
 * @param a - First string
 * @param b - Second string
 * @param n - N-gram size (default: 2)
@@ -360,32 +829,21 @@ function cosineNgram(a, b, n = 2) {
 	const freqAInt = ngramFrequencyMap(a, n);
 	const freqBInt = ngramFrequencyMap(b, n);
 	if (freqAInt !== null && freqBInt !== null) {
-		let dotProduct = 0;
-		let normA = 0;
-		let normB = 0;
-		for (const [id, countA] of freqAInt) {
-			const countB = freqBInt.get(id) ?? 0;
-			dotProduct += countA * countB;
-			normA += countA * countA;
-		}
-		for (const [, count] of freqBInt) normB += count * count;
-		const denominator = Math.sqrt(normA) * Math.sqrt(normB);
-		return denominator === 0 ? 1 : dotProduct / denominator;
+		const intersection = intersectCountInt(freqAInt, freqBInt);
+		const totalA = totalCountInt(freqAInt);
+		const totalB = totalCountInt(freqBInt);
+		if (totalA === 0 && totalB === 0) return 1;
+		if (totalA === 0 || totalB === 0) return 0;
+		return intersection / Math.sqrt(totalA * totalB);
 	}
 	const freqA = frequencyMap(ngrams(a, n));
 	const freqB = frequencyMap(ngrams(b, n));
-	let dotProduct = 0;
-	let normA = 0;
-	let normB = 0;
-	for (const [token, countA] of freqA) {
-		const countB = freqB.get(token) ?? 0;
-		dotProduct += countA * countB;
-		normA += countA * countA;
-	}
-	for (const [, count] of freqB) normB += count * count;
-	const denominator = Math.sqrt(normA) * Math.sqrt(normB);
-	if (denominator === 0) return 1;
-	return dotProduct / denominator;
+	const intersection = intersectCount$1(freqA, freqB);
+	const totalA = totalCount$1(freqA);
+	const totalB = totalCount$1(freqB);
+	if (totalA === 0 && totalB === 0) return 1;
+	if (totalA === 0 || totalB === 0) return 0;
+	return intersection / Math.sqrt(totalA * totalB);
 }
 //#endregion
 //#region src/token/sorensen.ts
@@ -445,6 +903,122 @@ function sorensenNgram(a, b, n = 2) {
 	return 2 * ic / total;
 }
 //#endregion
+//#region src/token/tversky.ts
+/**
+* Tversky index — asymmetric set similarity measure.
+*
+* Reduces to Jaccard when alpha = beta = 1.
+* Reduces to Sorensen-Dice when alpha = beta = 0.5.
+*
+* Time: O(m + n)
+*/
+/**
+* Compute the Tversky index between two strings based on character multiset.
+*
+* T(A, B; α, β) = |A ∩ B| / (|A ∩ B| + α|A \ B| + β|B \ A|)
+*
+* |A \ B| and |B \ A| are computed as each side's total count minus the
+* intersection count. A zero denominator returns 1.
+*
+* NOTE(review): with alpha = beta = 0 the denominator equals the intersection
+* count, so two non-empty inputs for which intersectCount returns 0 also
+* return 1 — confirm this is intended.
+*
+* @param a - First string
+* @param b - Second string
+* @param options - alpha and beta weights (each defaults to 1)
+* @returns Tversky index in [0, 1]
+*/
+function tversky(a, b, options = {}) {
+	const alpha = options.alpha ?? 1;
+	const beta = options.beta ?? 1;
+	const freqA = charFrequencyMap(a);
+	const freqB = charFrequencyMap(b);
+	const intersection = intersectCount(freqA, freqB);
+	const totalA = totalCount(freqA);
+	const totalB = totalCount(freqB);
+	const onlyA = totalA - intersection; // |A \ B|
+	const onlyB = totalB - intersection; // |B \ A|
+	const denominator = intersection + alpha * onlyA + beta * onlyB;
+	if (denominator === 0) return 1;
+	return intersection / denominator;
+}
+//#endregion
+//#region src/token/overlap.ts
+/**
+* Overlap coefficient — set similarity normalized by the smaller set.
+*
+* overlap(A, B) = |A ∩ B| / min(|A|, |B|)
+*
+* Time: O(m + n)
+*/
+/**
+* Compute the overlap coefficient between two strings based on character multiset.
+*
+* Two empty strings yield 1; exactly one empty string yields 0. Both guards
+* run before the division, so the denominator is never zero.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Overlap coefficient in [0, 1]
+*/
+function overlap(a, b) {
+	const freqA = charFrequencyMap(a);
+	const freqB = charFrequencyMap(b);
+	const intersection = intersectCount(freqA, freqB);
+	const totalA = totalCount(freqA);
+	const totalB = totalCount(freqB);
+	const minTotal = Math.min(totalA, totalB); // only used once both totals are known to be non-zero
+	if (totalA === 0 && totalB === 0) return 1;
+	if (totalA === 0 || totalB === 0) return 0;
+	return intersection / minTotal;
+}
+//#endregion
+//#region src/token/naive.ts
+/**
+* Naive string similarity measures: prefix, suffix, length.
+*
+* Time: O(min(m, n)) for prefix/suffix, O(1) for length
+*/
+/**
+* Compute prefix similarity between two strings.
+*
+* prefix(a, b) = commonPrefixLength / max(|a|, |b|)
+*
+* Comparison is per UTF-16 code unit. Two empty strings yield 1.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Prefix similarity in [0, 1]
+*/
+function prefix(a, b) {
+	const maxLen = Math.max(a.length, b.length);
+	if (maxLen === 0) return 1; // both inputs empty
+	let commonLen = 0;
+	while (commonLen < a.length && commonLen < b.length && a.charCodeAt(commonLen) === b.charCodeAt(commonLen)) commonLen++;
+	return commonLen / maxLen;
+}
+/**
+* Compute suffix similarity between two strings.
+*
+* suffix(a, b) = commonSuffixLength / max(|a|, |b|)
+*
+* Both strings are scanned from their last code unit backwards, one UTF-16
+* code unit at a time. Two empty strings yield 1.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Suffix similarity in [0, 1]
+*/
+function suffix(a, b) {
+	const maxLen = Math.max(a.length, b.length);
+	if (maxLen === 0) return 1; // both inputs empty
+	let commonLen = 0;
+	const aEnd = a.length;
+	const bEnd = b.length;
+	while (commonLen < aEnd && commonLen < bEnd && a.charCodeAt(aEnd - 1 - commonLen) === b.charCodeAt(bEnd - 1 - commonLen)) commonLen++;
+	return commonLen / maxLen;
+}
+/**
+* Compute length-based similarity between two strings.
+*
+* length(a, b) = 1 - |len(a) - len(b)| / max(len(a), len(b))
+*
+* Only the lengths are compared; the contents of the strings are never read.
+* Two empty strings yield 1.
+*
+* @param a - First string
+* @param b - Second string
+* @returns Normalized length similarity in [0, 1]
+*/
+function length(a, b) {
+	const maxLen = Math.max(a.length, b.length);
+	if (maxLen === 0) return 1; // both inputs empty
+	return 1 - Math.abs(a.length - b.length) / maxLen;
+}
+//#endregion
 //#region src/hash/simhash.ts
 /**
 * Generate a 64-bit fingerprint for a collection of features.
@@ -1131,4 +1705,4 @@ function findBestMatch(query, collection, options = {}) {
 	return results.length > 0 ? results[0] : null;
 }
 //#endregion
-export { DiffType, FuzzySearch, LSH, MinHash, SimHasher, cosine, cosineNgram, diff, findBestMatch, hammingDistance, hammingSimilarity, jaccard, jaccardNgram, lcsDistance, lcsLength, lcsNormalized, lcsPairs, levenshtein, levenshteinNormalized, simhash, sorensen, sorensenNgram, stringEquals };
+export { CHAR_FREQ_SIZE, DiffType, FuzzySearch, LSH, MinHash, SimHasher, buildCharFreqArray, charFrequencyMap, combineHash, cosine, cosineNgram, damerauLevenshtein, damerauLevenshteinNormalized, diff, findBestMatch, fnv1a, frequencyMap, hamming, hammingDistance, hammingNormalized, hammingSimilarity, intersectCount, intersectCountInt, jaccard, jaccardNgram, jaro, jaroWinkler, lcsDistance, lcsLength, lcsNormalized, lcsPairs, lcsSubstringDistance, lcsSubstringLength, lcsSubstringNormalized, length, levenshtein, levenshteinNormalized, ngramFrequencyMap, ngrams, normalize, overlap, prefix, ratcliff, sift4, sift4Normalized, simhash, smithWaterman, smithWatermanNormalized, sorensen, sorensenNgram, stringEquals, suffix, totalCount, totalCountInt, tversky, unionCount, unionCountInt };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@nlptools/distance",
-  "version": "0.0.4",
+  "version": "0.0.5",
   "description": "Complete string distance and similarity algorithms package with WebAssembly and JavaScript implementations",
   "keywords": [
     "algorithms",