@thi.ng/text-analysis 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ /**
2
+ * Computes cosine similarity of the given dense multi-hot vectors.
3
+ *
4
+ * @remarks
5
+ * Re-export of [`distCosine()` in
6
+ * thi.ng/vectors](https://docs.thi.ng/umbrella/vectors/functions/distCosine.html)
7
+ *
8
+ * @param a
9
+ * @param b
10
+ */
11
+ export declare const cosineSimilarityDense: import("@thi.ng/vectors/api").DistanceFn;
12
+ /**
13
+ * Computes cosine similarity of given sparse multi-hot vectors.
14
+ *
15
+ * @param a
16
+ * @param b
17
+ */
18
+ export declare const cosineSimilaritySparse: (a: ArrayLike<number>, b: ArrayLike<number>) => number;
19
+ /**
20
+ * Computes Jaccard similarity of given dense multi-hot vectors.
21
+ *
22
+ * @remarks
23
+ * Re-export of [`jaccardSimilarity()` in
24
+ * thi.ng/vectors](https://docs.thi.ng/umbrella/vectors/functions/jaccardSimilarity.html)
25
+ *
26
+ * @param a
27
+ * @param b
28
+ */
29
+ export declare const jaccardSimilarityDense: import("@thi.ng/vectors/api").DistanceFn;
30
+ /**
31
+ * Computes Jaccard similarity of given sparse multi-hot vectors.
32
+ *
33
+ * @param a
34
+ * @param b
35
+ */
36
+ export declare const jaccardSimilaritySparse: (a: ArrayLike<number>, b: ArrayLike<number>) => number;
37
+ /**
38
+ * Computes dot product of the given dense vectors.
39
+ *
40
+ * @remarks
41
+ * Re-export of [`dot()` in
42
+ * thi.ng/vectors](https://docs.thi.ng/umbrella/vectors/functions/dot.html)
43
+ *
44
+ * @param a
45
+ * @param b
46
+ */
47
+ export declare const dotProductDense: import("@thi.ng/vectors/api").MultiVecOpRoVV<number>;
48
+ /**
49
+ * Computes dot product of the given sparse multi-hot vectors.
50
+ *
51
+ * @param a
52
+ * @param b
53
+ */
54
+ export declare const dotProductSparse: (a: ArrayLike<number>, b: ArrayLike<number>) => number;
55
+ /**
56
+ * Computes the squared L2 distance of the given dense vectors.
57
+ *
58
+ * @remarks
59
+ * Re-export of [`dot()` in
60
+ * thi.ng/vectors](https://docs.thi.ng/umbrella/vectors/functions/dot.html)
61
+ *
62
+ * @param a
63
+ * @param b
64
+ */
65
+ export declare const distSqDense: import("@thi.ng/vectors/api").MultiVecOpRoVV<number>;
66
+ /**
67
+ * Computes the squared L2 distance of the given sparse multi-hot vectors.
68
+ *
69
+ * @param a
70
+ * @param b
71
+ */
72
+ export declare const distSqSparse: (a: ArrayLike<number>, b: ArrayLike<number>) => number;
73
+ //# sourceMappingURL=similarity.d.ts.map
package/similarity.js ADDED
@@ -0,0 +1,75 @@
1
+ import { distCosine } from "@thi.ng/vectors/dist-cosine";
2
+ import { jaccardSimilarity as $jaccard } from "@thi.ng/vectors/dist-jaccard";
3
+ import { distSq as $distSq } from "@thi.ng/vectors/distsq";
4
+ import { dot as $dot } from "@thi.ng/vectors/dot";
5
// Cosine similarity of dense multi-hot vectors; direct alias for
// thi.ng/vectors `distCosine()` (see import above).
const cosineSimilarityDense = distCosine;
6
/**
 * Computes cosine similarity of given sparse multi-hot vectors (sorted arrays
 * of component indices). Since every stored component implicitly has value 1,
 * the dot product is the intersection size and each vector's L2 norm is the
 * square root of its length.
 *
 * @param a
 * @param b
 */
const cosineSimilaritySparse = (a, b) => {
	const isec = dotProductSparse(a, b);
	if (isec <= 0) return 0;
	return isec / (Math.sqrt(a.length) * Math.sqrt(b.length));
};
10
// Jaccard similarity of dense multi-hot vectors; direct alias for
// thi.ng/vectors `jaccardSimilarity()` (imported as `$jaccard`).
const jaccardSimilarityDense = $jaccard;
11
/**
 * Computes Jaccard similarity of given sparse multi-hot vectors (sorted arrays
 * of component indices): |A ∩ B| / |A ∪ B|, or 0 if both vectors are empty.
 *
 * @param a
 * @param b
 */
const jaccardSimilaritySparse = (a, b) => {
	const na = a.length;
	const nb = b.length;
	let isec = 0;
	let i = 0;
	let j = 0;
	// merge-walk both sorted index lists, counting shared indices
	while (i < na && j < nb) {
		const x = a[i];
		const y = b[j];
		if (x < y) {
			i++;
		} else if (x > y) {
			j++;
		} else {
			isec++;
			i++;
			j++;
		}
	}
	// inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|
	const union = na + nb - isec;
	return union > 0 ? isec / union : 0;
};
32
// Dot product of dense vectors; direct alias for thi.ng/vectors `dot()`
// (imported as `$dot`).
const dotProductDense = $dot;
33
/**
 * Computes dot product of given sparse multi-hot vectors (sorted arrays of
 * component indices). Since all stored components implicitly have value 1,
 * the result equals the number of indices shared by both vectors.
 *
 * @param a
 * @param b
 */
const dotProductSparse = (a, b) => {
	const na = a.length;
	const nb = b.length;
	let count = 0;
	let i = 0;
	let j = 0;
	// merge-walk both sorted index lists
	while (i < na && j < nb) {
		const x = a[i];
		const y = b[j];
		if (x < y) {
			i++;
		} else if (x > y) {
			j++;
		} else {
			count++;
			i++;
			j++;
		}
	}
	return count;
};
48
// Squared L2 distance of dense vectors; direct alias for thi.ng/vectors
// `distSq()` (imported as `$distSq`).
const distSqDense = $distSq;
49
/**
 * Computes the squared L2 distance of the given sparse multi-hot vectors
 * (sorted arrays of component indices). Each index present in only one of the
 * two vectors contributes 1 to the result; shared indices contribute 0.
 *
 * @remarks
 * Fix: the original loop stopped as soon as either index list was exhausted,
 * silently ignoring all remaining indices of the longer list (e.g.
 * `distSqSparse([0], [0, 1, 2])` returned 0 instead of 2). The leftover
 * indices are now added to the result after the merge-walk.
 *
 * @param a
 * @param b
 */
const distSqSparse = (a, b) => {
	const na = a.length;
	const nb = b.length;
	let res = 0;
	let i = 0;
	let j = 0;
	// merge-walk both sorted index lists, counting mismatched indices
	while (i < na && j < nb) {
		const x = a[i];
		const y = b[j];
		if (x === y) {
			i++;
			j++;
		} else {
			res++;
			if (x < y) i++;
			else j++;
		}
	}
	// indices remaining in either vector are unmatched & each contribute 1
	return res + (na - i) + (nb - j);
};
66
+ export {
67
+ cosineSimilarityDense,
68
+ cosineSimilaritySparse,
69
+ distSqDense,
70
+ distSqSparse,
71
+ dotProductDense,
72
+ dotProductSparse,
73
+ jaccardSimilarityDense,
74
+ jaccardSimilaritySparse
75
+ };
package/stem.d.ts ADDED
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Porter stemmer for a single English word.
3
+ *
4
+ * @remarks
5
+ * Based on Porter stemmer in Javascript (various refactoring/optimizations):
6
+ *
7
+ * http://www.tartarus.org/~martin/PorterStemmer
8
+ *
9
+ * Original paper by Porter, 1980: "An algorithm for suffix
10
+ * stripping", Program, Vol. 14, no. 3, pp 130-137
11
+ *
12
+ * @param word
13
+ */
14
+ export declare const stemWord: (word: string) => string;
15
+ //# sourceMappingURL=stem.d.ts.map
package/stem.js ADDED
@@ -0,0 +1,110 @@
1
// Porter stemmer step 2 suffix -> replacement mappings
// (e.g. "relational" -> "relate" via "ational" -> "ate").
const SUFFIXES_STEP2 = {
	alism: "al",
	aliti: "al",
	alli: "al",
	anci: "ance",
	ation: "ate",
	ational: "ate",
	ator: "ate",
	biliti: "ble",
	bli: "ble",
	eli: "e",
	enci: "ence",
	entli: "ent",
	fulness: "ful",
	iveness: "ive",
	iviti: "ive",
	ization: "ize",
	izer: "ize",
	logi: "log",
	ousli: "ous",
	ousness: "ous",
	tional: "tion"
};
// Porter stemmer step 3 suffix -> replacement mappings
// (empty string removes the suffix entirely).
const SUFFIXES_STEP3 = {
	alize: "al",
	ative: "",
	ful: "",
	ical: "ic",
	icate: "ic",
	iciti: "ic",
	ness: ""
};
// Character-class fragments used to build the "measure" regexes below:
// c = consonant, v = vowel (incl. "y"), C = consonant run, V = vowel run
const c = "[^aeiou]";
const v = "[aeiouy]";
const C = c + "[^aeiouy]*";
const V = v + "[aeiou]*";
// measure m > 0 (word contains at least one VC sequence)
const RE_MGR0 = new RegExp("^(" + C + ")?" + V + C);
// measure m == 1
const RE_MEQ1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$");
// measure m > 1
const RE_MGR1 = new RegExp("^(" + C + ")?" + V + C + V + C);
// stem contains at least one vowel
const RE_STEM_VOWEL = new RegExp("^(" + C + ")?" + v);
// Porter "*o" condition: stem ends cvc where the 2nd c is not w, x or y
const OTHER = new RegExp("^" + C + v + "[^aeiouwxy]$");
// suffix patterns for the individual Porter algorithm steps
const RE_STEP1A = /^(.+?)(ss|i)es$/;
const RE_STEP1A_ALT = /^(.+?)([^s])s$/;
const RE_STEP1B = /^(.+?)eed$/;
const RE_STEP1B_ALT = /^(.+?)(ed|ing)$/;
const RE_STEP1B_2 = /(at|bl|iz)$/;
const RE_STEP1B_3 = /([^aeiouylsz])\1$/;
const RE_STEP1B_4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
const RE_STEP1C = /^(.+?)y$/;
const RE_STEP2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
const RE_STEP3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
const RE_STEP4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
const RE_STEP4A = /^(.+?)(s|t)(ion)$/;
const RE_STEP5 = /^(.+?)e$/;
55
/**
 * Porter stemmer for a single English word.
 *
 * Based on "An algorithm for suffix stripping" (Porter 1980); see also
 * http://www.tartarus.org/~martin/PorterStemmer
 *
 * @param word
 */
const stemWord = (word) => {
	let initialY;
	let fp;
	let stem;
	let suffix;
	// words shorter than 3 chars are returned unchanged
	if (word.length < 3) return word;
	// temporarily mask a leading "y" so it's treated as a consonant by the
	// regexes (unmasked again before returning)
	initialY = word[0] === "y";
	if (initialY) word = "Y" + word.substring(1);
	// step 1a: plurals ("caresses" -> "caress", "ponies" -> "poni", "cats" -> "cat")
	word = word.replace(RE_STEP1A, "$1$2").replace(RE_STEP1A_ALT, "$1$2");
	// step 1b: "-eed" / "-ed" / "-ing" endings
	if (fp = RE_STEP1B.exec(word)) {
		if (RE_MGR0.test(fp[1])) word = word.slice(0, -1);
	} else if (fp = RE_STEP1B_ALT.exec(word)) {
		stem = fp[1];
		if (RE_STEM_VOWEL.test(stem)) {
			word = stem;
			if (RE_STEP1B_2.test(word)) {
				// restore trailing "e" (e.g. "conflat(ed)" -> "conflate")
				word = word + "e";
			} else if (RE_STEP1B_3.test(word)) {
				// undouble trailing consonant (e.g. "hopp(ing)" -> "hop")
				word = word.slice(0, -1);
			} else if (RE_STEP1B_4.test(word)) {
				// cvc ending -> append "e" (e.g. "fil(ing)" -> "file")
				word = word + "e";
			}
		}
	}
	// step 1c: terminal "y" -> "i" if the stem contains a vowel
	if (fp = RE_STEP1C.exec(word)) {
		stem = fp[1];
		if (RE_STEM_VOWEL.test(stem)) word = stem + "i";
	}
	// step 2: map double suffixes to single ones (see SUFFIXES_STEP2)
	if (fp = RE_STEP2.exec(word)) {
		stem = fp[1];
		suffix = fp[2];
		if (RE_MGR0.test(stem)) word = stem + SUFFIXES_STEP2[suffix];
	}
	// step 3: "-ic-", "-ful", "-ness" etc. (see SUFFIXES_STEP3)
	if (fp = RE_STEP3.exec(word)) {
		stem = fp[1];
		suffix = fp[2];
		if (RE_MGR0.test(stem)) word = stem + SUFFIXES_STEP3[suffix];
	}
	// step 4: strip remaining suffixes when measure > 1
	if (fp = RE_STEP4.exec(word)) {
		stem = fp[1];
		if (RE_MGR1.test(stem)) word = stem;
	} else if (fp = RE_STEP4A.exec(word)) {
		stem = fp[1] + fp[2];
		if (RE_MGR1.test(stem)) word = stem;
	}
	// step 5a: drop trailing "e" (measure > 1, or measure == 1 and not *o)
	if (fp = RE_STEP5.exec(word)) {
		stem = fp[1];
		if (RE_MGR1.test(stem) || RE_MEQ1.test(stem) && !OTHER.test(stem))
			word = stem;
	}
	// step 5b: "-ll" -> "-l" when measure > 1
	if (word.endsWith("ll") && RE_MGR1.test(word)) word = word.slice(0, -1);
	// unmask leading "y"
	return initialY ? "y" + word.substring(1) : word;
};
108
+ export {
109
+ stemWord
110
+ };
@@ -0,0 +1,3 @@
1
+ export declare const DEFAULT_STOP_WORDS_EN: Set<string>;
2
+ export declare const removeStopWords: (words?: Set<string>) => import("@thi.ng/transducers").Transducer<string, string>;
3
+ //# sourceMappingURL=stop-words.d.ts.map
package/stop-words.js ADDED
@@ -0,0 +1,105 @@
1
+ import { filter } from "@thi.ng/transducers/filter";
2
/**
 * Default set of English stop words, used by {@link removeStopWords}.
 *
 * @remarks
 * Fix: removed a duplicated "over" entry from the source list (the duplicate
 * was silently collapsed by the Set at runtime, but was redundant in source).
 */
const DEFAULT_STOP_WORDS_EN = /* @__PURE__ */ new Set([
	"a",
	"above",
	"across",
	"after",
	"against",
	"along",
	"also",
	"among",
	"an",
	"and",
	"any",
	"are",
	"around",
	"at",
	"be",
	"been",
	"before",
	"being",
	"below",
	"beneath",
	"between",
	"beyond",
	"both",
	"but",
	"by",
	"could",
	"despite",
	"do",
	"does",
	"down",
	"during",
	"each",
	"eg",
	"etc",
	"even",
	"every",
	"few",
	"for",
	"from",
	"he",
	"i",
	"ie",
	"in",
	"inside",
	"into",
	"is",
	"it",
	"just",
	"less",
	"many",
	"may",
	"might",
	"more",
	"much",
	"must",
	"of",
	"on",
	"one",
	"onto",
	"or",
	"out",
	"outside",
	"over",
	"quite",
	"really",
	"several",
	"she",
	"should",
	"since",
	"so",
	"some",
	"such",
	"that",
	"the",
	"these",
	"they",
	"this",
	"those",
	"three",
	"through",
	"to",
	"too",
	"toward",
	"two",
	"under",
	"until",
	"up",
	"very",
	"was",
	"we",
	"were",
	"with",
	"within",
	"without",
	"would",
	"you"
]);
101
/**
 * Transducer which removes tokens contained in given stop word set
 * (default: {@link DEFAULT_STOP_WORDS_EN}).
 *
 * @param words
 */
const removeStopWords = (words = DEFAULT_STOP_WORDS_EN) => filter((x) => !words.has(x));
102
+ export {
103
+ DEFAULT_STOP_WORDS_EN,
104
+ removeStopWords
105
+ };
package/tokenize.d.ts ADDED
@@ -0,0 +1,51 @@
1
+ import type { Transducer } from "@thi.ng/transducers";
2
+ /**
3
+ * Customizable string tokenizer with optional transducer-based token
4
+ * transformation(s). Yields an iterator of tokens.
5
+ *
6
+ * @remarks
7
+ * The package provides a number of composable string transducers which can be
8
+ * listed here and will be applied in sequence for each input token.
9
+ *
10
+ * @example
11
+ * ```ts tangle:../export/tokenize.ts
12
+ * import * as ta from "@thi.ng/text-analysis";
13
+ *
14
+ * const input = `Do not go gentle into that good night,
15
+ * Old age should burn and rave at close of day;
16
+ * Rage, rage against the dying of the light.
17
+ *
18
+ * Though wise men at their end know dark is right,
19
+ * Because their words had forked no lightning they
20
+ * Do not go gentle into that good night.`;
21
+ *
22
+ * // tokenize input with given token transforms
23
+ * // collect tokens into array
24
+ * const tokens = [...ta.tokenize(
25
+ * input,
26
+ * [
27
+ * ta.lowercase,
28
+ * ta.removeNonAlphaNum,
29
+ * ta.removeStopWords()
30
+ * ]
31
+ * )];
32
+ *
33
+ * console.log(tokens);
34
+ * // [
35
+ * // "do", "not", "go", "gentle", "good", "night", "old", "age",
36
+ * // "burn", "rave", "close", "day", "rage", "rage", "dying", "light",
37
+ * // ...
38
+ * // ]
39
+ *
40
+ * console.log(
41
+ * [...ta.tokenize(input, [ta.ngrams(2)])]
42
+ * );
43
+ * ```
44
+ *
45
+ * @param src
46
+ * @param xforms
47
+ * @param delim
48
+ * @param includeDelim
49
+ */
50
+ export declare const tokenize: (src: string, xforms?: Transducer<string, string>[], delim?: RegExp | string, includeDelim?: boolean) => IterableIterator<string>;
51
+ //# sourceMappingURL=tokenize.d.ts.map
package/tokenize.js ADDED
@@ -0,0 +1,10 @@
1
+ import { split } from "@thi.ng/strings";
2
+ import { comp } from "@thi.ng/transducers/comp";
3
+ import { iterator } from "@thi.ng/transducers/iterator";
4
/**
 * Customizable string tokenizer with optional transducer-based token
 * transformation(s). Yields an iterator of tokens.
 *
 * @param src
 * @param xforms - optional token transducers, applied in sequence
 * @param delim - token delimiter regexp or string
 * @param includeDelim - true, if delimiters are to be kept as tokens
 */
const tokenize = (src, xforms, delim = /[ \t\n\r,;:/?!()\[\]]+/g, includeDelim = false) => {
	const tokens = split(src, delim, includeDelim);
	if (!xforms) return tokens;
	return iterator(comp(...xforms), tokens);
};
8
+ export {
9
+ tokenize
10
+ };
package/vocab.d.ts ADDED
@@ -0,0 +1,166 @@
1
+ import { type BidirIndex, type SerializedBidirIndex } from "@thi.ng/bidir-index";
2
+ export type Vocab = BidirIndex<string>;
3
+ export type SerializedVocab = SerializedBidirIndex<string>;
4
+ /**
5
+ * Creates a bi-directional index storing unique tokens from given `src`
6
+ * iterable, optionally using custom `start` ID offset (default: 0). This index
7
+ * can then be used with {@link vectorize}, {@link vectorizeSparse}.
8
+ *
9
+ * @remarks
10
+ * This function is syntax sugar for
11
+ * [thi.ng/bidir-index](https://thi.ng/bidir-index).
12
+ *
13
+ * The vocab/index can be serialized to JSON and then re-created via
14
+ * `defVocab()`.
15
+ *
16
+ * @example
17
+ * ```ts tangle:../export/def-vocab.ts
18
+ * import { defVocab, tokenize } from "@thi.ng/text-analysis";
19
+ *
20
+ * const vocab = defVocab(
21
+ * tokenize("the quick brown fox jumps over the lazy dog")
22
+ * );
23
+ *
24
+ * console.log([...vocab.entries()]);
25
+ * // [
26
+ * // [ "the", 0 ],
27
+ * // [ "quick", 1 ],
28
+ * // [ "brown", 2 ],
29
+ * // [ "fox", 3 ],
30
+ * // [ "jumps", 4 ],
31
+ * // [ "over", 5 ],
32
+ * // [ "lazy", 6 ],
33
+ * // [ "dog", 7 ]
34
+ * // ]
35
+ *
36
+ * console.log(vocab.get("fox"))
37
+ * // 3
38
+ *
39
+ * console.log(vocab.getID(3))
40
+ * // "fox"
41
+ * ```
42
+ *
43
+ * @param src
44
+ * @param start
45
+ */
46
+ export declare function defVocab(src: Iterable<string>, start?: number): Vocab;
47
+ /**
48
+ * (Re)creates bi-directional vocab index from previous serialized state (e.g.
49
+ * via `vocab.toJSON()`).
50
+ *
51
+ * @param vocab
52
+ */
53
+ export declare function defVocab(vocab: SerializedVocab): Vocab;
54
+ /**
55
+ * Encodes the given `src` tokens into a dense multi-hot vector using provided
56
+ * `vocab` (created via {@link defVocab}). The vector size is the number of
57
+ * items in the vocab.
58
+ *
59
+ * @remarks
60
+ * Also see {@link encodeSparse}.
61
+ *
62
+ * @example
63
+ * ```ts tangle:../export/encode-dense.ts
64
+ * import { defVocab, encodeDense, tokenize } from "@thi.ng/text-analysis";
65
+ *
66
+ * const vocab = defVocab(
67
+ * tokenize("the quick brown fox jumps over the lazy dog")
68
+ * );
69
+ *
70
+ * console.log(encodeDense(vocab, tokenize("the brown dog jumps")));
71
+ * // [ 1, 0, 1, 0, 1, 0, 0, 1 ]
72
+ *
73
+ * console.log(encodeDense(vocab, tokenize("the lazy fox")));
74
+ * // [ 1, 0, 0, 1, 0, 0, 1, 0 ]
75
+ * ```
76
+ *
77
+ * @param vocab
78
+ * @param src
79
+ */
80
+ export declare const encodeDense: (vocab: Vocab, src: Iterable<string>) => any[];
81
+ /**
82
+ * Encodes the given `src` tokens into a sparse vector using provided `vocab`
83
+ * (created via {@link defVocab}). Only the IDs of matched tokens are stored.
84
+ * The returned vector size depends on the number of used/matched tokens, at
85
+ * most `vocab.size` (if entire vocab is used by `src`).
86
+ *
87
+ * @remarks
88
+ * Also see {@link encodeDense} for alternative encoding.
89
+ *
90
+ * @example
91
+ * ```ts tangle:../export/encode-sparse.ts
92
+ * import { defVocab, encodeSparse, tokenize } from "@thi.ng/text-analysis";
93
+ *
94
+ * const vocab = defVocab(
95
+ * tokenize("the quick brown fox jumps over the lazy dog")
96
+ * );
97
+ *
98
+ * console.log(encodeSparse(vocab, tokenize("the brown dog jumps")));
99
+ * // [ 0, 2, 4, 7 ]
100
+ *
101
+ * console.log(encodeSparse(vocab, tokenize("the lazy fox")));
102
+ * // [ 0, 3, 6 ]
103
+ * ```
104
+ *
105
+ * @param vocab
106
+ * @param src
107
+ */
108
+ export declare const encodeSparse: (vocab: Vocab, src: Iterable<string>) => number[];
109
+ /**
110
+ * Reverse op of {@link encodeDense}. Decodes dense multi-hot vector to extract
111
+ * tokens from provided `vocab` (created via {@link defVocab}). The returned
112
+ * array only contains the corresponding tokens of the vector's non-zero
113
+ * components.
114
+ *
115
+ * @remarks
116
+ * Also see {@link decodeSparse}.
117
+ *
118
+ * @example
119
+ * ```ts tangle:../export/decode-dense.ts
120
+ * import { defVocab, decodeDense, tokenize } from "@thi.ng/text-analysis";
121
+ *
122
+ * const vocab = defVocab(
123
+ * tokenize("the quick brown fox jumps over the lazy dog")
124
+ * );
125
+ *
126
+ * console.log(decodeDense(vocab, [1, 0, 1, 0, 1, 0, 0, 1]));
127
+ * // [ "the", "brown", "jumps", "dog" ]
128
+ *
129
+ * console.log(decodeDense(vocab, [1, 0, 0, 1, 0, 0, 1, 0]));
130
+ * // [ "the", "fox", "lazy" ]
131
+ * ```
132
+ *
133
+ * @param vocab
134
+ * @param src
135
+ * @param sort
136
+ */
137
+ export declare const decodeDense: (vocab: Vocab, vec: Iterable<number>) => string[];
138
+ /**
139
+ * Reverse op of {@link encodeSparse}. Decodes sparse vector (created via
140
+ * {@link encodeSparse} to extract tokens from provided `vocab` (created via
141
+ * {@link defVocab}).
142
+ *
143
+ * @remarks
144
+ * Also see {@link decodeDense}.
145
+ *
146
+ * @example
147
+ * ```ts tangle:../export/decode-sparse.ts
148
+ * import { defVocab, decodeSparse, tokenize } from "@thi.ng/text-analysis";
149
+ *
150
+ * const vocab = defVocab(
151
+ * tokenize("the quick brown fox jumps over the lazy dog")
152
+ * );
153
+ *
154
+ * console.log(decodeSparse(vocab, [0, 2, 4, 7]));
155
+ * // [ "the", "brown", "jumps", "dog" ]
156
+ *
157
+ * console.log(decodeSparse(vocab, [0, 3, 6]));
158
+ * // [ "the", "fox", "lazy" ]
159
+ * ```
160
+ *
161
+ * @param vocab
162
+ * @param src
163
+ * @param sort
164
+ */
165
+ export declare const decodeSparse: (vocab: Vocab, vec: Iterable<number>) => string[];
166
+ //# sourceMappingURL=vocab.d.ts.map
package/vocab.js ADDED
@@ -0,0 +1,31 @@
1
+ import {
2
+ bidirIndexFromJSON,
3
+ defBidirIndex
4
+ } from "@thi.ng/bidir-index";
5
+ import { isIterable } from "@thi.ng/checks/is-iterable";
6
/**
 * Creates a bi-directional index of unique tokens from given iterable (with
 * optional `start` ID offset), or — if given previously serialized state —
 * re-creates the index from it.
 *
 * @param src
 * @param start
 */
function defVocab(src, start) {
	if (isIterable(src)) {
		return defBidirIndex(src, { start });
	}
	return bidirIndexFromJSON(src);
}
9
+ const encodeDense = (vocab, src) => {
10
+ const vec = new Array(vocab.size).fill(0);
11
+ for (let i of vocab.getAll(src)) vec[i] = 1;
12
+ return vec;
13
+ };
14
+ const encodeSparse = (vocab, src) => [...vocab.getAllUnique(src)].sort((a, b) => a - b);
15
+ const decodeDense = (vocab, vec) => {
16
+ const res = [];
17
+ let i = 0;
18
+ for (let x of vec) {
19
+ if (x) res.push(vocab.getID(i));
20
+ i++;
21
+ }
22
+ return res;
23
+ };
24
/**
 * Reverse op of `encodeSparse`. Delegates to `vocab.getAllIDs()` to look up
 * the tokens for given IDs.
 *
 * @param vocab
 * @param vec
 */
const decodeSparse = (vocab, vec) => vocab.getAllIDs(vec);
25
+ export {
26
+ decodeDense,
27
+ decodeSparse,
28
+ defVocab,
29
+ encodeDense,
30
+ encodeSparse
31
+ };
package/xform.d.ts ADDED
@@ -0,0 +1,42 @@
1
+ import type { Vocab } from "./vocab.js";
2
+ /**
3
+ * Transducer to produce lowercase string.
4
+ */
5
+ export declare const lowercase: import("@thi.ng/transducers").Transducer<string, string>;
6
+ /**
7
+ * Transducer which collapses multiple whitespace chars into one.
8
+ */
9
+ export declare const collapseWS: import("@thi.ng/transducers").Transducer<string, string>;
10
+ /**
11
+ * Transducer which removes empty or whitespace-only strings/tokens.
12
+ */
13
+ export declare const removeEmpty: import("@thi.ng/transducers").Transducer<string, string>;
14
+ /**
15
+ * Transducer which removes non-alphabetic chars from input, using
16
+ * {@link RE_NON_ALPHA}.
17
+ */
18
+ export declare const removeNonAlpha: import("@thi.ng/transducers").Transducer<string, string>;
19
+ /**
20
+ * Transducer which removes non-alphabetic chars from input, using
21
+ * {@link RE_NON_ALPHANUM}.
22
+ */
23
+ export declare const removeNonAlphaNum: import("@thi.ng/transducers").Transducer<string, string>;
24
+ /**
25
+ * Transducer which removes tokens with their length outside the configured
26
+ * `[min,max]` range.
27
+ *
28
+ * @param min
29
+ * @param max
30
+ */
31
+ export declare const minMaxLength: (min: number, max: number) => import("@thi.ng/transducers").Transducer<string, string>;
32
+ /**
33
+ * Transducer version of {@link stemWord}.
34
+ */
35
+ export declare const stemOnly: import("@thi.ng/transducers").Transducer<string, string>;
36
+ /**
37
+ * Transducer which removes tokens which are not present in given `vocab`.
38
+ *
39
+ * @param vocab
40
+ */
41
+ export declare const vocabOnly: (vocab: Vocab) => import("@thi.ng/transducers").Transducer<string, string>;
42
+ //# sourceMappingURL=xform.d.ts.map