@thi.ng/text-analysis 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/LICENSE +201 -0
- package/README.md +97 -0
- package/frequencies.d.ts +62 -0
- package/frequencies.js +11 -0
- package/index.d.ts +10 -0
- package/index.js +9 -0
- package/ngrams.d.ts +40 -0
- package/ngrams.js +20 -0
- package/package.json +119 -0
- package/replace.d.ts +3 -0
- package/replace.js +60 -0
- package/similarity.d.ts +73 -0
- package/similarity.js +75 -0
- package/stem.d.ts +15 -0
- package/stem.js +110 -0
- package/stop-words.d.ts +3 -0
- package/stop-words.js +105 -0
- package/tokenize.d.ts +51 -0
- package/tokenize.js +10 -0
- package/vocab.d.ts +166 -0
- package/vocab.js +31 -0
- package/xform.d.ts +42 -0
- package/xform.js +30 -0
package/similarity.d.ts
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Computes cosine similarity of the given dense multi-hot vectors.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Re-export of [`distCosine()` in
|
|
6
|
+
* thi.ng/vectors](https://docs.thi.ng/umbrella/vectors/functions/distCosine.html)
|
|
7
|
+
*
|
|
8
|
+
* @param a
|
|
9
|
+
* @param b
|
|
10
|
+
*/
|
|
11
|
+
export declare const cosineSimilarityDense: import("@thi.ng/vectors/api").DistanceFn;
|
|
12
|
+
/**
|
|
13
|
+
* Computes cosine similarity of given sparse multi-hot vectors.
|
|
14
|
+
*
|
|
15
|
+
* @param a
|
|
16
|
+
* @param b
|
|
17
|
+
*/
|
|
18
|
+
export declare const cosineSimilaritySparse: (a: ArrayLike<number>, b: ArrayLike<number>) => number;
|
|
19
|
+
/**
|
|
20
|
+
* Computes Jaccard similarity of given dense multi-hot vectors.
|
|
21
|
+
*
|
|
22
|
+
* @remarks
|
|
23
|
+
* Re-export of [`jaccardSimilarity()` in
|
|
24
|
+
* thi.ng/vectors](https://docs.thi.ng/umbrella/vectors/functions/jaccardSimilarity.html)
|
|
25
|
+
*
|
|
26
|
+
* @param a
|
|
27
|
+
* @param b
|
|
28
|
+
*/
|
|
29
|
+
export declare const jaccardSimilarityDense: import("@thi.ng/vectors/api").DistanceFn;
|
|
30
|
+
/**
|
|
31
|
+
* Computes Jaccard similarity of given sparse multi-hot vectors.
|
|
32
|
+
*
|
|
33
|
+
* @param a
|
|
34
|
+
* @param b
|
|
35
|
+
*/
|
|
36
|
+
export declare const jaccardSimilaritySparse: (a: ArrayLike<number>, b: ArrayLike<number>) => number;
|
|
37
|
+
/**
|
|
38
|
+
* Computes dot product of the given dense vectors.
|
|
39
|
+
*
|
|
40
|
+
* @remarks
|
|
41
|
+
* Re-export of [`dot()` in
|
|
42
|
+
* thi.ng/vectors](https://docs.thi.ng/umbrella/vectors/functions/dot.html)
|
|
43
|
+
*
|
|
44
|
+
* @param a
|
|
45
|
+
* @param b
|
|
46
|
+
*/
|
|
47
|
+
export declare const dotProductDense: import("@thi.ng/vectors/api").MultiVecOpRoVV<number>;
|
|
48
|
+
/**
|
|
49
|
+
* Computes dot product of the given sparse multi-hot vectors.
|
|
50
|
+
*
|
|
51
|
+
* @param a
|
|
52
|
+
* @param b
|
|
53
|
+
*/
|
|
54
|
+
export declare const dotProductSparse: (a: ArrayLike<number>, b: ArrayLike<number>) => number;
|
|
55
|
+
/**
|
|
56
|
+
* Computes the squared L2 distance of the given dense vectors.
|
|
57
|
+
*
|
|
58
|
+
* @remarks
|
|
59
|
+
* Re-export of [`distSq()` in
* thi.ng/vectors](https://docs.thi.ng/umbrella/vectors/functions/distSq.html)
|
|
61
|
+
*
|
|
62
|
+
* @param a
|
|
63
|
+
* @param b
|
|
64
|
+
*/
|
|
65
|
+
export declare const distSqDense: import("@thi.ng/vectors/api").MultiVecOpRoVV<number>;
|
|
66
|
+
/**
|
|
67
|
+
* Computes the squared L2 distance of the given sparse multi-hot vectors.
|
|
68
|
+
*
|
|
69
|
+
* @param a
|
|
70
|
+
* @param b
|
|
71
|
+
*/
|
|
72
|
+
export declare const distSqSparse: (a: ArrayLike<number>, b: ArrayLike<number>) => number;
|
|
73
|
+
//# sourceMappingURL=similarity.d.ts.map
|
package/similarity.js
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { distCosine } from "@thi.ng/vectors/dist-cosine";
|
|
2
|
+
import { jaccardSimilarity as $jaccard } from "@thi.ng/vectors/dist-jaccard";
|
|
3
|
+
import { distSq as $distSq } from "@thi.ng/vectors/distsq";
|
|
4
|
+
import { dot as $dot } from "@thi.ng/vectors/dot";
|
|
5
|
+
// Cosine similarity for dense multi-hot vectors; direct re-export of
// thi.ng/vectors `distCosine` (see import above).
const cosineSimilarityDense = distCosine;
|
|
6
|
+
/**
 * Computes cosine similarity of two sparse multi-hot vectors, given as sorted
 * arrays of non-zero component indices.
 *
 * @remarks
 * For binary (multi-hot) vectors the L2 magnitude equals the square root of
 * the number of set components, hence `Math.sqrt(length)` below.
 *
 * @param a
 * @param b
 */
const cosineSimilaritySparse = (a, b) => {
	const d = dotProductSparse(a, b);
	if (d > 0) {
		return d / (Math.sqrt(a.length) * Math.sqrt(b.length));
	}
	return 0;
};
|
|
10
|
+
// Jaccard similarity for dense multi-hot vectors; direct re-export of
// thi.ng/vectors `jaccardSimilarity` (imported as `$jaccard`).
const jaccardSimilarityDense = $jaccard;
|
|
11
|
+
/**
 * Computes Jaccard similarity of two sparse multi-hot vectors, given as
 * sorted arrays of non-zero component indices.
 *
 * @remarks
 * Uses a linear merge-style traversal to count the intersection; the union
 * size is then derived as `|a| + |b| - |a ∩ b|`. Returns 0 if both inputs
 * are empty.
 *
 * @param a
 * @param b
 */
const jaccardSimilaritySparse = (a, b) => {
	const na = a.length;
	const nb = b.length;
	let isec = 0;
	let i = 0;
	let j = 0;
	while (i < na && j < nb) {
		const x = a[i];
		const y = b[j];
		if (x < y) {
			i++;
		} else if (x > y) {
			j++;
		} else {
			isec++;
			i++;
			j++;
		}
	}
	const union = na + nb - isec;
	return union > 0 ? isec / union : 0;
};
|
|
32
|
+
// Dot product for dense vectors; direct re-export of thi.ng/vectors `dot`
// (imported as `$dot`).
const dotProductDense = $dot;
|
|
33
|
+
/**
 * Computes the dot product of two sparse multi-hot vectors, given as sorted
 * arrays of non-zero component indices.
 *
 * @remarks
 * Since every present component has value 1, the dot product equals the size
 * of the intersection of both index sets. O(|a| + |b|) merge traversal.
 *
 * @param a
 * @param b
 */
const dotProductSparse = (a, b) => {
	const na = a.length;
	const nb = b.length;
	let count = 0;
	let i = 0;
	let j = 0;
	while (i < na && j < nb) {
		const x = a[i];
		const y = b[j];
		if (x < y) {
			i++;
		} else if (x > y) {
			j++;
		} else {
			count++;
			i++;
			j++;
		}
	}
	return count;
};
|
|
48
|
+
// Squared L2 distance for dense vectors; direct re-export of thi.ng/vectors
// `distSq` (imported as `$distSq`).
const distSqDense = $distSq;
|
|
49
|
+
/**
 * Computes the squared L2 distance of two sparse multi-hot vectors, given as
 * sorted arrays of non-zero component indices.
 *
 * @remarks
 * Since every present component has value 1, the squared distance equals the
 * size of the symmetric difference of both index sets.
 *
 * Fix: indices remaining in the longer vector after the merge loop terminates
 * belong to only one vector and must be counted as well. The previous version
 * dropped this tail, e.g. `distSqSparse([0, 1, 2], [0])` returned 0 instead
 * of the correct 2.
 *
 * @param a
 * @param b
 */
const distSqSparse = (a, b) => {
	const na = a.length;
	const nb = b.length;
	let res = 0;
	let i = 0;
	let j = 0;
	while (i < na && j < nb) {
		const x = a[i];
		const y = b[j];
		if (x === y) {
			// component set in both vectors => difference 0
			i++;
			j++;
		} else {
			// component set in only one vector => contributes 1
			res++;
			if (x < y) i++;
			else j++;
		}
	}
	// remaining tail indices exist in only one of the two vectors
	return res + (na - i) + (nb - j);
};
|
|
66
|
+
export {
|
|
67
|
+
cosineSimilarityDense,
|
|
68
|
+
cosineSimilaritySparse,
|
|
69
|
+
distSqDense,
|
|
70
|
+
distSqSparse,
|
|
71
|
+
dotProductDense,
|
|
72
|
+
dotProductSparse,
|
|
73
|
+
jaccardSimilarityDense,
|
|
74
|
+
jaccardSimilaritySparse
|
|
75
|
+
};
|
package/stem.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Porter stemmer for a single English word.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Based on Porter stemmer in Javascript (various refactoring/optimizations):
|
|
6
|
+
*
|
|
7
|
+
* http://www.tartarus.org/~martin/PorterStemmer
|
|
8
|
+
*
|
|
9
|
+
* Original paper by Porter, 1980: "An algorithm for suffix
|
|
10
|
+
* stripping", Program, Vol. 14, no. 3, pp 130-137
|
|
11
|
+
*
|
|
12
|
+
* @param word
|
|
13
|
+
*/
|
|
14
|
+
export declare const stemWord: (word: string) => string;
|
|
15
|
+
//# sourceMappingURL=stem.d.ts.map
|
package/stem.js
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
// Suffix → replacement mappings for Porter stemmer step 2 (maps double
// suffixes to single ones, e.g. "-ization" → "-ize").
const SUFFIXES_STEP2 = {
	alism: "al",
	aliti: "al",
	alli: "al",
	anci: "ance",
	ation: "ate",
	ational: "ate",
	ator: "ate",
	biliti: "ble",
	bli: "ble",
	eli: "e",
	enci: "ence",
	entli: "ent",
	fulness: "ful",
	iveness: "ive",
	iviti: "ive",
	ization: "ize",
	izer: "ize",
	logi: "log",
	ousli: "ous",
	ousness: "ous",
	tional: "tion"
};
// Suffix → replacement mappings for step 3 ("-icate", "-ful", "-ness" etc.).
const SUFFIXES_STEP3 = {
	alize: "al",
	ative: "",
	ful: "",
	ical: "ic",
	icate: "ic",
	iciti: "ic",
	ness: ""
};
// Regex fragments used to build the Porter "measure" / condition patterns:
// c = single consonant, v = single vowel (incl. y),
// C = consonant sequence, V = vowel sequence
const c = "[^aeiou]";
const v = "[aeiouy]";
const C = c + "[^aeiouy]*";
const V = v + "[aeiou]*";
// [C]VC... => measure m > 0
const RE_MGR0 = new RegExp("^(" + C + ")?" + V + C);
// [C]VC[V] => measure m == 1
const RE_MEQ1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$");
// [C]VCVC... => measure m > 1
const RE_MGR1 = new RegExp("^(" + C + ")?" + V + C + V + C);
// stem contains at least one vowel (the *v* condition)
const RE_STEM_VOWEL = new RegExp("^(" + C + ")?" + v);
// the *o condition: stem ends in cvc where the 2nd c is not w, x or y
const OTHER = new RegExp("^" + C + v + "[^aeiouwxy]$");
// step 1a: "-sses" → "-ss", "-ies" → "-i"
const RE_STEP1A = /^(.+?)(ss|i)es$/;
// step 1a: plain plural "-s" (but not "-ss")
const RE_STEP1A_ALT = /^(.+?)([^s])s$/;
// step 1b: "-eed"
const RE_STEP1B = /^(.+?)eed$/;
// step 1b: "-ed" / "-ing"
const RE_STEP1B_ALT = /^(.+?)(ed|ing)$/;
// step 1b cleanup: restore "e" after "-at", "-bl", "-iz"
const RE_STEP1B_2 = /(at|bl|iz)$/;
// step 1b cleanup: doubled trailing consonant (except l, s, z)
const RE_STEP1B_3 = /([^aeiouylsz])\1$/;
// step 1b cleanup: cvc ending (*o) => add "e"
const RE_STEP1B_4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
// step 1c: terminal "y"
const RE_STEP1C = /^(.+?)y$/;
// step 2 suffixes (keys of SUFFIXES_STEP2; order matters for alternation)
const RE_STEP2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
// step 3 suffixes (keys of SUFFIXES_STEP3)
const RE_STEP3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
// step 4 suffixes (removed entirely when measure m > 1)
const RE_STEP4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
// step 4 special case: "-ion" preceded by "s" or "t"
const RE_STEP4A = /^(.+?)(s|t)(ion)$/;
// step 5: terminal "e"
const RE_STEP5 = /^(.+?)e$/;
/**
 * Porter stemmer for a single English word. Returns the stemmed word.
 *
 * @remarks
 * Based on the classic "Porter stemmer in Javascript" port (see
 * http://www.tartarus.org/~martin/PorterStemmer) of Porter, 1980: "An
 * algorithm for suffix stripping". Steps are applied strictly in sequence;
 * their order and the regex alternation order above are significant.
 *
 * @param word
 */
const stemWord = (word) => {
	let initialY;
	let fp;
	let stem;
	let suffix;
	// words shorter than 3 chars are returned unchanged
	if (word.length < 3) return word;
	// temporarily mask a leading "y" (uppercase Y is outside all vowel
	// classes above, so it's treated as a consonant); restored at the end
	initialY = word[0] === "y";
	if (initialY) word = "Y" + word.substring(1);
	// step 1a: plurals
	word = word.replace(RE_STEP1A, "$1$2").replace(RE_STEP1A_ALT, "$1$2");
	// step 1b: past tense / gerund suffixes
	if (fp = RE_STEP1B.exec(word)) {
		// "-eed" → "-ee" if measure of stem > 0
		if (RE_MGR0.test(fp[1])) word = word.slice(0, -1);
	} else if (fp = RE_STEP1B_ALT.exec(word)) {
		stem = fp[1];
		// only strip "-ed"/"-ing" if the remaining stem contains a vowel
		if (RE_STEM_VOWEL.test(stem)) {
			word = stem;
			if (RE_STEP1B_2.test(word)) {
				// e.g. "conflat(ed)" → "conflate"
				word = word + "e";
			} else if (RE_STEP1B_3.test(word)) {
				// undouble trailing consonant, e.g. "hopp(ing)" → "hop"
				word = word.slice(0, -1);
			} else if (RE_STEP1B_4.test(word)) {
				// cvc ending (*o), e.g. "fil(ing)" → "file"
				word = word + "e";
			}
		}
	}
	// step 1c: terminal "y" → "i" if stem contains a vowel
	if (fp = RE_STEP1C.exec(word)) {
		stem = fp[1];
		if (RE_STEM_VOWEL.test(stem)) word = stem + "i";
	}
	// step 2: collapse double suffixes (requires measure m > 0)
	if (fp = RE_STEP2.exec(word)) {
		stem = fp[1];
		suffix = fp[2];
		if (RE_MGR0.test(stem)) word = stem + SUFFIXES_STEP2[suffix];
	}
	// step 3: "-icate", "-ful", "-ness" etc. (requires m > 0)
	if (fp = RE_STEP3.exec(word)) {
		stem = fp[1];
		suffix = fp[2];
		if (RE_MGR0.test(stem)) word = stem + SUFFIXES_STEP3[suffix];
	}
	// step 4: strip remaining suffixes entirely (requires m > 1)
	if (fp = RE_STEP4.exec(word)) {
		stem = fp[1];
		if (RE_MGR1.test(stem)) word = stem;
	} else if (fp = RE_STEP4A.exec(word)) {
		// "-ion" only strips when preceded by "s" or "t"
		stem = fp[1] + fp[2];
		if (RE_MGR1.test(stem)) word = stem;
	}
	// step 5a: drop terminal "e" if m > 1, or m == 1 and not *o
	if (fp = RE_STEP5.exec(word)) {
		stem = fp[1];
		if (RE_MGR1.test(stem) || RE_MEQ1.test(stem) && !OTHER.test(stem))
			word = stem;
	}
	// step 5b: "-ll" → "-l" if m > 1
	if (word.endsWith("ll") && RE_MGR1.test(word)) word = word.slice(0, -1);
	// unmask leading "y"
	return initialY ? "y" + word.substring(1) : word;
};
|
|
108
|
+
export {
|
|
109
|
+
stemWord
|
|
110
|
+
};
|
package/stop-words.d.ts
ADDED
package/stop-words.js
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { filter } from "@thi.ng/transducers/filter";
|
|
2
|
+
// Default English stop word set (96 unique words), used as default arg by
// `removeStopWords()`. Exported so users can derive customized sets.
// Fix: removed a duplicate "over" entry from the original literal (harmless
// in a Set, but redundant).
const DEFAULT_STOP_WORDS_EN = /* @__PURE__ */ new Set([
	"a",
	"above",
	"across",
	"after",
	"against",
	"along",
	"also",
	"among",
	"an",
	"and",
	"any",
	"are",
	"around",
	"at",
	"be",
	"been",
	"before",
	"being",
	"below",
	"beneath",
	"between",
	"beyond",
	"both",
	"but",
	"by",
	"could",
	"despite",
	"do",
	"does",
	"down",
	"during",
	"each",
	"eg",
	"etc",
	"even",
	"every",
	"few",
	"for",
	"from",
	"he",
	"i",
	"ie",
	"in",
	"inside",
	"into",
	"is",
	"it",
	"just",
	"less",
	"many",
	"may",
	"might",
	"more",
	"much",
	"must",
	"of",
	"on",
	"one",
	"onto",
	"or",
	"out",
	"outside",
	"over",
	"quite",
	"really",
	"several",
	"she",
	"should",
	"since",
	"so",
	"some",
	"such",
	"that",
	"the",
	"these",
	"they",
	"this",
	"those",
	"three",
	"through",
	"to",
	"too",
	"toward",
	"two",
	"under",
	"until",
	"up",
	"very",
	"was",
	"we",
	"were",
	"with",
	"within",
	"without",
	"would",
	"you"
]);
|
|
101
|
+
// Transducer which drops all tokens contained in the given stop word set
// (default: DEFAULT_STOP_WORDS_EN above).
const removeStopWords = (words = DEFAULT_STOP_WORDS_EN) => filter((x) => !words.has(x));
|
|
102
|
+
export {
|
|
103
|
+
DEFAULT_STOP_WORDS_EN,
|
|
104
|
+
removeStopWords
|
|
105
|
+
};
|
package/tokenize.d.ts
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import type { Transducer } from "@thi.ng/transducers";
|
|
2
|
+
/**
|
|
3
|
+
* Customizable string tokenizer with optional transducer-based token
|
|
4
|
+
* transformation(s). Yields an iterator of tokens.
|
|
5
|
+
*
|
|
6
|
+
* @remarks
|
|
7
|
+
* The package provides a number of composable string transducers which can be
|
|
8
|
+
* listed here and will be applied in sequence for each input token.
|
|
9
|
+
*
|
|
10
|
+
* @example
|
|
11
|
+
* ```ts tangle:../export/tokenize.ts
|
|
12
|
+
* import * as ta from "@thi.ng/text-analysis";
|
|
13
|
+
*
|
|
14
|
+
* const input = `Do not go gentle into that good night,
|
|
15
|
+
* Old age should burn and rave at close of day;
|
|
16
|
+
* Rage, rage against the dying of the light.
|
|
17
|
+
*
|
|
18
|
+
* Though wise men at their end know dark is right,
|
|
19
|
+
* Because their words had forked no lightning they
|
|
20
|
+
* Do not go gentle into that good night.`;
|
|
21
|
+
*
|
|
22
|
+
* // tokenize input with given token transforms
|
|
23
|
+
* // collect tokens into array
|
|
24
|
+
* const tokens = [...ta.tokenize(
|
|
25
|
+
* input,
|
|
26
|
+
* [
|
|
27
|
+
* ta.lowercase,
|
|
28
|
+
* ta.removeNonAlphaNum,
|
|
29
|
+
* ta.removeStopWords()
|
|
30
|
+
* ]
|
|
31
|
+
* )];
|
|
32
|
+
*
|
|
33
|
+
* console.log(tokens);
|
|
34
|
+
* // [
|
|
35
|
+
* // "do", "not", "go", "gentle", "good", "night", "old", "age",
|
|
36
|
+
* // "burn", "rave", "close", "day", "rage", "rage", "dying", "light",
|
|
37
|
+
* // ...
|
|
38
|
+
* // ]
|
|
39
|
+
*
|
|
40
|
+
* console.log(
|
|
41
|
+
* [...ta.tokenize(input, [ta.ngrams(2)])]
|
|
42
|
+
* );
|
|
43
|
+
* ```
|
|
44
|
+
*
|
|
45
|
+
* @param src
|
|
46
|
+
* @param xforms
|
|
47
|
+
* @param delim
|
|
48
|
+
* @param includeDelim
|
|
49
|
+
*/
|
|
50
|
+
export declare const tokenize: (src: string, xforms?: Transducer<string, string>[], delim?: RegExp | string, includeDelim?: boolean) => IterableIterator<string>;
|
|
51
|
+
//# sourceMappingURL=tokenize.d.ts.map
|
package/tokenize.js
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { split } from "@thi.ng/strings";
|
|
2
|
+
import { comp } from "@thi.ng/transducers/comp";
|
|
3
|
+
import { iterator } from "@thi.ng/transducers/iterator";
|
|
4
|
+
/**
 * Customizable string tokenizer. Splits `src` using `delim` (default:
 * whitespace + common punctuation), optionally piping each token through the
 * composition of the given transducers. Yields an iterator of tokens.
 *
 * @param src
 * @param xforms
 * @param delim
 * @param includeDelim
 */
const tokenize = (
	src,
	xforms,
	delim = /[ \t\n\r,;:/?!()\[\]]+/g,
	includeDelim = false
) => {
	const tokens = split(src, delim, includeDelim);
	if (!xforms) return tokens;
	return iterator(comp(...xforms), tokens);
};
|
|
8
|
+
export {
|
|
9
|
+
tokenize
|
|
10
|
+
};
|
package/vocab.d.ts
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { type BidirIndex, type SerializedBidirIndex } from "@thi.ng/bidir-index";
|
|
2
|
+
export type Vocab = BidirIndex<string>;
|
|
3
|
+
export type SerializedVocab = SerializedBidirIndex<string>;
|
|
4
|
+
/**
|
|
5
|
+
* Creates a bi-directional index storing unique tokens from given `src`
|
|
6
|
+
* iterable, optionally using custom `start` ID offset (default: 0). This index
|
|
7
|
+
* can then be used with {@link vectorize}, {@link vectorizeSparse}.
|
|
8
|
+
*
|
|
9
|
+
* @remarks
|
|
10
|
+
* This function is syntax sugar for
|
|
11
|
+
* [thi.ng/bidir-index](https://thi.ng/bidir-index).
|
|
12
|
+
*
|
|
13
|
+
* The vocab/index can be serialized to JSON and then re-created via
|
|
14
|
+
* `defVocab()`.
|
|
15
|
+
*
|
|
16
|
+
* @example
|
|
17
|
+
* ```ts tangle:../export/def-vocab.ts
|
|
18
|
+
* import { defVocab, tokenize } from "@thi.ng/text-analysis";
|
|
19
|
+
*
|
|
20
|
+
* const vocab = defVocab(
|
|
21
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
22
|
+
* );
|
|
23
|
+
*
|
|
24
|
+
* console.log([...vocab.entries()]);
|
|
25
|
+
* // [
|
|
26
|
+
* // [ "the", 0 ],
|
|
27
|
+
* // [ "quick", 1 ],
|
|
28
|
+
* // [ "brown", 2 ],
|
|
29
|
+
* // [ "fox", 3 ],
|
|
30
|
+
* // [ "jumps", 4 ],
|
|
31
|
+
* // [ "over", 5 ],
|
|
32
|
+
* // [ "lazy", 6 ],
|
|
33
|
+
* // [ "dog", 7 ]
|
|
34
|
+
* // ]
|
|
35
|
+
*
|
|
36
|
+
* console.log(vocab.get("fox"))
|
|
37
|
+
* // 3
|
|
38
|
+
*
|
|
39
|
+
* console.log(vocab.getID(3))
|
|
40
|
+
* // "fox"
|
|
41
|
+
* ```
|
|
42
|
+
*
|
|
43
|
+
* @param src
|
|
44
|
+
* @param start
|
|
45
|
+
*/
|
|
46
|
+
export declare function defVocab(src: Iterable<string>, start?: number): Vocab;
|
|
47
|
+
/**
|
|
48
|
+
* (Re)creates bi-directional vocab index from previous serialized state (e.g.
|
|
49
|
+
* via `vocab.toJSON()`).
|
|
50
|
+
*
|
|
51
|
+
* @param vocab
|
|
52
|
+
*/
|
|
53
|
+
export declare function defVocab(vocab: SerializedVocab): Vocab;
|
|
54
|
+
/**
|
|
55
|
+
* Encodes the given `src` tokens into a dense multi-hot vector using provided
|
|
56
|
+
* `vocab` (created via {@link defVocab}). The vector size is the number of
|
|
57
|
+
* items in the vocab.
|
|
58
|
+
*
|
|
59
|
+
* @remarks
|
|
60
|
+
* Also see {@link encodeSparse}.
|
|
61
|
+
*
|
|
62
|
+
* @example
|
|
63
|
+
* ```ts tangle:../export/encode-dense.ts
|
|
64
|
+
* import { defVocab, encodeDense, tokenize } from "@thi.ng/text-analysis";
|
|
65
|
+
*
|
|
66
|
+
* const vocab = defVocab(
|
|
67
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
68
|
+
* );
|
|
69
|
+
*
|
|
70
|
+
* console.log(encodeDense(vocab, tokenize("the brown dog jumps")));
|
|
71
|
+
* // [ 1, 0, 1, 0, 1, 0, 0, 1 ]
|
|
72
|
+
*
|
|
73
|
+
* console.log(encodeDense(vocab, tokenize("the lazy fox")));
|
|
74
|
+
* // [ 1, 0, 0, 1, 0, 0, 1, 0 ]
|
|
75
|
+
* ```
|
|
76
|
+
*
|
|
77
|
+
* @param vocab
|
|
78
|
+
* @param src
|
|
79
|
+
*/
|
|
80
|
+
export declare const encodeDense: (vocab: Vocab, src: Iterable<string>) => any[];
|
|
81
|
+
/**
|
|
82
|
+
* Encodes the given `src` tokens into a sparse vector using provided `vocab`
|
|
83
|
+
* (created via {@link defVocab}). Only the IDs of matched tokens are stored.
|
|
84
|
+
* The returned vector size depends on the number of used/matched tokens, at
|
|
85
|
+
* most `vocab.size` (if entire vocab is used by `src`).
|
|
86
|
+
*
|
|
87
|
+
* @remarks
|
|
88
|
+
* Also see {@link encodeDense} for alternative encoding.
|
|
89
|
+
*
|
|
90
|
+
* @example
|
|
91
|
+
* ```ts tangle:../export/encode-sparse.ts
|
|
92
|
+
* import { defVocab, encodeSparse, tokenize } from "@thi.ng/text-analysis";
|
|
93
|
+
*
|
|
94
|
+
* const vocab = defVocab(
|
|
95
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
96
|
+
* );
|
|
97
|
+
*
|
|
98
|
+
* console.log(encodeSparse(vocab, tokenize("the brown dog jumps")));
|
|
99
|
+
* // [ 0, 2, 4, 7 ]
|
|
100
|
+
*
|
|
101
|
+
* console.log(encodeSparse(vocab, tokenize("the lazy fox")));
|
|
102
|
+
* // [ 0, 3, 6 ]
|
|
103
|
+
* ```
|
|
104
|
+
*
|
|
105
|
+
* @param vocab
|
|
106
|
+
* @param src
|
|
107
|
+
*/
|
|
108
|
+
export declare const encodeSparse: (vocab: Vocab, src: Iterable<string>) => number[];
|
|
109
|
+
/**
|
|
110
|
+
* Reverse op of {@link encodeDense}. Decodes dense multi-hot vector to extract
|
|
111
|
+
* tokens from provided `vocab` (created via {@link defVocab}). The returned
|
|
112
|
+
* array only contains the corresponding tokens of the vector's non-zero
|
|
113
|
+
* components.
|
|
114
|
+
*
|
|
115
|
+
* @remarks
|
|
116
|
+
* Also see {@link decodeSparse}.
|
|
117
|
+
*
|
|
118
|
+
* @example
|
|
119
|
+
* ```ts tangle:../export/decode-dense.ts
|
|
120
|
+
* import { defVocab, decodeDense, tokenize } from "@thi.ng/text-analysis";
|
|
121
|
+
*
|
|
122
|
+
* const vocab = defVocab(
|
|
123
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
124
|
+
* );
|
|
125
|
+
*
|
|
126
|
+
* console.log(decodeDense(vocab, [1, 0, 1, 0, 1, 0, 0, 1]));
|
|
127
|
+
* // [ "the", "brown", "jumps", "dog" ]
|
|
128
|
+
*
|
|
129
|
+
* console.log(decodeDense(vocab, [1, 0, 0, 1, 0, 0, 1, 0]));
|
|
130
|
+
* // [ "the", "fox", "lazy" ]
|
|
131
|
+
* ```
|
|
132
|
+
*
|
|
133
|
+
* @param vocab
|
|
134
|
+
* @param vec
*/
|
|
137
|
+
export declare const decodeDense: (vocab: Vocab, vec: Iterable<number>) => string[];
|
|
138
|
+
/**
|
|
139
|
+
* Reverse op of {@link encodeSparse}. Decodes sparse vector (created via
|
|
140
|
+
* {@link encodeSparse} to extract tokens from provided `vocab` (created via
|
|
141
|
+
* {@link defVocab}).
|
|
142
|
+
*
|
|
143
|
+
* @remarks
|
|
144
|
+
* Also see {@link decodeDense}.
|
|
145
|
+
*
|
|
146
|
+
* @example
|
|
147
|
+
* ```ts tangle:../export/decode-sparse.ts
|
|
148
|
+
* import { defVocab, decodeSparse, tokenize } from "@thi.ng/text-analysis";
|
|
149
|
+
*
|
|
150
|
+
* const vocab = defVocab(
|
|
151
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
152
|
+
* );
|
|
153
|
+
*
|
|
154
|
+
* console.log(decodeSparse(vocab, [0, 2, 4, 7]));
|
|
155
|
+
* // [ "the", "brown", "jumps", "dog" ]
|
|
156
|
+
*
|
|
157
|
+
* console.log(decodeSparse(vocab, [0, 3, 6]));
|
|
158
|
+
* // [ "the", "fox", "lazy" ]
|
|
159
|
+
* ```
|
|
160
|
+
*
|
|
161
|
+
* @param vocab
|
|
162
|
+
* @param vec
*/
|
|
165
|
+
export declare const decodeSparse: (vocab: Vocab, vec: Iterable<number>) => string[];
|
|
166
|
+
//# sourceMappingURL=vocab.d.ts.map
|
package/vocab.js
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import {
|
|
2
|
+
bidirIndexFromJSON,
|
|
3
|
+
defBidirIndex
|
|
4
|
+
} from "@thi.ng/bidir-index";
|
|
5
|
+
import { isIterable } from "@thi.ng/checks/is-iterable";
|
|
6
|
+
/**
 * Creates a bi-directional token index from an iterable of tokens (optionally
 * with custom `start` ID offset), or — if given a serialized index — recreates
 * the vocab from that JSON form.
 *
 * @param src
 * @param start
 */
function defVocab(src, start) {
	if (isIterable(src)) {
		return defBidirIndex(src, { start });
	}
	return bidirIndexFromJSON(src);
}
|
|
9
|
+
/**
 * Encodes `src` tokens as a dense multi-hot vector of size `vocab.size`,
 * with a 1 at the index of each matched token and 0 elsewhere.
 *
 * @param vocab
 * @param src
 */
const encodeDense = (vocab, src) => {
	const vec = [];
	for (let k = 0; k < vocab.size; k++) {
		vec.push(0);
	}
	for (const id of vocab.getAll(src)) {
		vec[id] = 1;
	}
	return vec;
};
|
|
14
|
+
/**
 * Encodes `src` tokens as a sparse vector: the sorted array of unique vocab
 * IDs of all matched tokens.
 *
 * @param vocab
 * @param src
 */
const encodeSparse = (vocab, src) => {
	const ids = [...vocab.getAllUnique(src)];
	ids.sort((x, y) => x - y);
	return ids;
};
|
|
15
|
+
/**
 * Reverse op of `encodeDense`: collects the vocab tokens corresponding to
 * all non-zero (truthy) components of the given dense multi-hot vector.
 *
 * @param vocab
 * @param vec
 */
const decodeDense = (vocab, vec) => {
	const tokens = [];
	let idx = 0;
	for (const component of vec) {
		if (component) tokens.push(vocab.getID(idx));
		idx++;
	}
	return tokens;
};
|
|
24
|
+
// Reverse op of `encodeSparse`: maps each stored vocab ID back to its token
// (thin delegate to the index's own `getAllIDs()`).
const decodeSparse = (vocab, vec) => vocab.getAllIDs(vec);
|
|
25
|
+
export {
|
|
26
|
+
decodeDense,
|
|
27
|
+
decodeSparse,
|
|
28
|
+
defVocab,
|
|
29
|
+
encodeDense,
|
|
30
|
+
encodeSparse
|
|
31
|
+
};
|
package/xform.d.ts
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import type { Vocab } from "./vocab.js";
|
|
2
|
+
/**
|
|
3
|
+
* Transducer to produce lowercase string.
|
|
4
|
+
*/
|
|
5
|
+
export declare const lowercase: import("@thi.ng/transducers").Transducer<string, string>;
|
|
6
|
+
/**
|
|
7
|
+
* Transducer which collapses multiple whitespace chars into one.
|
|
8
|
+
*/
|
|
9
|
+
export declare const collapseWS: import("@thi.ng/transducers").Transducer<string, string>;
|
|
10
|
+
/**
|
|
11
|
+
* Transducer which removes empty or whitespace-only strings/tokens.
|
|
12
|
+
*/
|
|
13
|
+
export declare const removeEmpty: import("@thi.ng/transducers").Transducer<string, string>;
|
|
14
|
+
/**
|
|
15
|
+
* Transducer which removes non-alphabetic chars from input, using
|
|
16
|
+
* {@link RE_NON_ALPHA}.
|
|
17
|
+
*/
|
|
18
|
+
export declare const removeNonAlpha: import("@thi.ng/transducers").Transducer<string, string>;
|
|
19
|
+
/**
|
|
20
|
+
* Transducer which removes non-alphanumeric chars from input, using
|
|
21
|
+
* {@link RE_NON_ALPHANUM}.
|
|
22
|
+
*/
|
|
23
|
+
export declare const removeNonAlphaNum: import("@thi.ng/transducers").Transducer<string, string>;
|
|
24
|
+
/**
|
|
25
|
+
* Transducer which removes tokens with their length outside the configured
|
|
26
|
+
* `[min,max]` range.
|
|
27
|
+
*
|
|
28
|
+
* @param min
|
|
29
|
+
* @param max
|
|
30
|
+
*/
|
|
31
|
+
export declare const minMaxLength: (min: number, max: number) => import("@thi.ng/transducers").Transducer<string, string>;
|
|
32
|
+
/**
|
|
33
|
+
* Transducer version of {@link stemWord}.
|
|
34
|
+
*/
|
|
35
|
+
export declare const stemOnly: import("@thi.ng/transducers").Transducer<string, string>;
|
|
36
|
+
/**
|
|
37
|
+
* Transducer which removes tokens which are not present in given `vocab`.
|
|
38
|
+
*
|
|
39
|
+
* @param vocab
|
|
40
|
+
*/
|
|
41
|
+
export declare const vocabOnly: (vocab: Vocab) => import("@thi.ng/transducers").Transducer<string, string>;
|
|
42
|
+
//# sourceMappingURL=xform.d.ts.map
|