@thi.ng/text-analysis 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Change Log
2
2
 
3
- - **Last updated**: 2025-06-09T17:24:08Z
3
+ - **Last updated**: 2025-06-15T12:37:24Z
4
4
  - **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
5
5
 
6
6
  All notable changes to this project will be documented in this file.
@@ -11,6 +11,26 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
11
11
  **Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
12
12
  and/or version bumps of transitive dependencies.
13
13
 
14
+ ## [0.3.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.0) (2025-06-15)
15
+
16
+ #### 🚀 Features
17
+
18
+ - update kmeansDense ([d35b6bd](https://github.com/thi-ng/umbrella/commit/d35b6bd))
19
+ - update results to include original `docs` for each cluster
20
+
21
+ ## [0.2.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.2.0) (2025-06-14)
22
+
23
+ #### 🚀 Features
24
+
25
+ - add/migrate refactored tf-idf functions ([d311acc](https://github.com/thi-ng/umbrella/commit/d311acc))
26
+ - add/update vocab & vector encoding helpers, restructure ([9e4f60c](https://github.com/thi-ng/umbrella/commit/9e4f60c))
27
+ - add filterDocsIDF() ([f682b58](https://github.com/thi-ng/umbrella/commit/f682b58))
28
+ - add k-mean clustering fns ([3533843](https://github.com/thi-ng/umbrella/commit/3533843))
29
+
30
+ #### ♻️ Refactoring
31
+
32
+ - update imports/exports ([a44be87](https://github.com/thi-ng/umbrella/commit/a44be87))
33
+
14
34
  ## [0.1.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.1.0) (2025-06-09)
15
35
 
16
36
  #### 🚀 Features
package/README.md CHANGED
@@ -58,13 +58,16 @@ For Node.js REPL:
58
58
  const ta = await import("@thi.ng/text-analysis");
59
59
  ```
60
60
 
61
- Package sizes (brotli'd, pre-treeshake): ESM: 2.44 KB
61
+ Package sizes (brotli'd, pre-treeshake): ESM: 3.30 KB
62
62
 
63
63
  ## Dependencies
64
64
 
65
65
  - [@thi.ng/api](https://github.com/thi-ng/umbrella/tree/develop/packages/api)
66
+ - [@thi.ng/arrays](https://github.com/thi-ng/umbrella/tree/develop/packages/arrays)
66
67
  - [@thi.ng/bidir-index](https://github.com/thi-ng/umbrella/tree/develop/packages/bidir-index)
67
68
  - [@thi.ng/checks](https://github.com/thi-ng/umbrella/tree/develop/packages/checks)
69
+ - [@thi.ng/distance](https://github.com/thi-ng/umbrella/tree/develop/packages/distance)
70
+ - [@thi.ng/k-means](https://github.com/thi-ng/umbrella/tree/develop/packages/k-means)
68
71
  - [@thi.ng/strings](https://github.com/thi-ng/umbrella/tree/develop/packages/strings)
69
72
  - [@thi.ng/transducers](https://github.com/thi-ng/umbrella/tree/develop/packages/transducers)
70
73
  - [@thi.ng/vectors](https://github.com/thi-ng/umbrella/tree/develop/packages/vectors)
package/api.d.ts ADDED
@@ -0,0 +1,4 @@
1
+ import type { BidirIndex, SerializedBidirIndex } from "@thi.ng/bidir-index";
2
+ export type Vocab = BidirIndex<string>;
3
+ export type SerializedVocab = SerializedBidirIndex<string>;
4
+ //# sourceMappingURL=api.d.ts.map
package/api.js ADDED
File without changes
package/cluster.d.ts ADDED
@@ -0,0 +1,103 @@
1
+ import { Untransformed } from "@thi.ng/distance/untransformed";
2
+ import { type KMeansOpts } from "@thi.ng/k-means";
3
+ import type { ReadonlyVec } from "@thi.ng/vectors";
4
+ import type { Vocab } from "./api.js";
5
+ /**
6
+ * Jaccard distance metric wrapper for {@link kmeansDense}
7
+ */
8
+ export declare const JACCARD_DIST_DENSE: Untransformed<ReadonlyVec>;
9
+ /**
10
+ * k-means clustering for dense multi-hot vectors. Uses thi.ng/k-means for
11
+ * actual clustering and squared L2 as default distance metric.
12
+ * Default max. iterations = 100.
13
+ *
14
+ * @remarks
15
+ * Use {@link JACCARD_DIST_DENSE} for alternative distance metric.
16
+ *
17
+ * @param k
18
+ * @param docs
19
+ * @param opts
20
+ */
21
+ export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partial<KMeansOpts>) => {
22
+ docs: ReadonlyVec[];
23
+ id: number;
24
+ centroid: ReadonlyVec;
25
+ items: number[];
26
+ }[];
27
+ /**
28
+ * k-means clustering for sparse multi-hot vectors. First converts vectors into
29
+ * dense versions (using {@link toDense}), then calls {@link kmeansDense} to
30
+ * perform the clustering.
31
+ *
32
+ * @remarks
33
+ * Since sparse vector sizes vary, the number of dimensions used (aka the
34
+ * vocabulary size) MUST be given via `opts`.
35
+ *
36
+ * @param k
37
+ * @param docs
38
+ * @param opts
39
+ */
40
+ export declare const kmeansSparse: (k: number, docs: ReadonlyVec[], opts: Partial<KMeansOpts> & {
41
+ dim: number;
42
+ }) => {
43
+ docs: ReadonlyVec[];
44
+ id: number;
45
+ centroid: ReadonlyVec;
46
+ items: number[];
47
+ }[];
48
+ export declare function clusterBounds(docs: ReadonlyVec[]): {
49
+ centroid: ReadonlyVec;
50
+ radius: number;
51
+ };
52
+ export declare function clusterBounds(docs: ReadonlyVec[], ids: number[]): {
53
+ centroid: ReadonlyVec;
54
+ radius: number;
55
+ };
56
+ /**
57
+ * Takes a vocab and array of docs encoded as dense multi-hot vectors. Computes
58
+ * centroid of given docs and then calls {@link centralTermsVec} to return the
59
+ * `k`-most central terms (or less if there're insufficient non-zero vector
60
+ * components).
61
+ *
62
+ * @example
63
+ * ```ts tangle:../export/central-terms.ts
64
+ * import { centralTerms, encodeAllDense } from "@thi.ng/text-analysis";
65
+ *
66
+ * const inputs = [
67
+ * ["a", "b", "c"],
68
+ * ["a", "b", "d", "e"],
69
+ * ["b", "f", "g"],
70
+ * ["a", "b", "c", "f"],
71
+ * ["a", "g", "h"]
72
+ * ];
73
+ *
74
+ * // create vocab & encode documents into multi-hot vectors
75
+ * const { vocab, docs } = encodeAllDense(inputs);
76
+ *
77
+ * // extract top-4 common terms
78
+ * console.log(centralTerms(vocab, 4, docs));
79
+ * // [ "b", "a", "g", "f" ]
80
+ * ```
81
+ *
82
+ * @param vocab
83
+ * @param k
84
+ * @param docs
85
+ */
86
+ export declare const centralTerms: (vocab: Vocab, k: number, docs: ReadonlyVec[]) => string[];
87
+ /**
88
+ * Takes a vocab and dense vector representing a point in the n-dimensional
89
+ * space of the given vocab. Returns an array of terms corresponding to the `k`
90
+ * largest non-zero components of the vector (or less if there're insufficient
91
+ * non-zero vector components).
92
+ *
93
+ * @remarks
94
+ * Also see {@link centralTerms} (incl. code example).
95
+ *
96
+ * @param vocab
97
+ * @param k
98
+ * @param centroid
99
+ */
100
+ export declare const centralTermsVec: (vocab: Vocab, k: number, centroid: ReadonlyVec) => string[];
101
+ export declare const knearest: (query: ReadonlyVec, k: number, r?: number, dist?: Untransformed<ReadonlyVec>, sorted?: boolean) => import("@thi.ng/distance").KNearest<ReadonlyVec, unknown>;
102
+ export declare const knearestDocs: (query: ReadonlyVec, k: number, docs: ReadonlyVec[], r?: number, dist?: Untransformed<ReadonlyVec>, sorted?: boolean) => [ReadonlyVec, number][];
103
+ //# sourceMappingURL=cluster.d.ts.map
package/cluster.js ADDED
@@ -0,0 +1,54 @@
1
+ import { argSort } from "@thi.ng/arrays/arg-sort";
2
+ import { lookup } from "@thi.ng/arrays/lookup";
3
+ import { knearest as $knearest } from "@thi.ng/distance/knearest";
4
+ import { Untransformed } from "@thi.ng/distance/untransformed";
5
+ import { kmeans } from "@thi.ng/k-means";
6
+ import { map } from "@thi.ng/transducers/map";
7
+ import { max } from "@thi.ng/transducers/max";
8
+ import { transduce } from "@thi.ng/transducers/transduce";
9
+ import { distJaccard } from "@thi.ng/vectors/dist-jaccard";
10
+ import { distSq } from "@thi.ng/vectors/distsq";
11
+ import { mean } from "@thi.ng/vectors/mean";
12
+ import { toDense } from "./vec.js";
13
+ const JACCARD_DIST_DENSE = new Untransformed(distJaccard);
14
+ const kmeansDense = (k, docs, opts) => kmeans(k, docs, { maxIter: 100, ...opts }).map((cluster) => ({
15
+ ...cluster,
16
+ docs: lookup(docs, cluster.items)
17
+ }));
18
+ const kmeansSparse = (k, docs, opts) => kmeansDense(
19
+ k,
20
+ docs.map((x) => toDense(opts.dim, x)),
21
+ opts
22
+ );
23
+ function clusterBounds(docs, ids) {
24
+ if (ids) docs = lookup(docs, ids);
25
+ const centroid = mean([], docs);
26
+ return {
27
+ centroid,
28
+ radius: transduce(
29
+ map((x) => distSq(centroid, x)),
30
+ max(),
31
+ docs
32
+ )
33
+ };
34
+ }
35
+ const centralTerms = (vocab, k, docs) => centralTermsVec(vocab, k, mean([], docs));
36
+ const centralTermsVec = (vocab, k, centroid) => vocab.getAllIDs(
37
+ argSort(centroid, (a, b) => b - a).slice(0, k).filter(Boolean)
38
+ );
39
+ const knearest = (query, k, r = Infinity, dist = JACCARD_DIST_DENSE, sorted = false) => $knearest(query, k, r, dist, sorted);
40
+ const knearestDocs = (query, k, docs, r = Infinity, dist = JACCARD_DIST_DENSE, sorted = false) => {
41
+ const neighborhood = $knearest(query, k, r, dist, sorted);
42
+ for (let i = 0; i < docs.length; i++) neighborhood.consider(docs[i], i);
43
+ return neighborhood.deref().map((n) => [docs[n[1]], n[0]]);
44
+ };
45
+ export {
46
+ JACCARD_DIST_DENSE,
47
+ centralTerms,
48
+ centralTermsVec,
49
+ clusterBounds,
50
+ kmeansDense,
51
+ kmeansSparse,
52
+ knearest,
53
+ knearestDocs
54
+ };
package/index.d.ts CHANGED
@@ -1,10 +1,13 @@
1
+ export * from "./cluster.js";
1
2
  export * from "./frequencies.js";
2
3
  export * from "./ngrams.js";
3
4
  export * from "./replace.js";
4
5
  export * from "./similarity.js";
5
6
  export * from "./stem.js";
6
7
  export * from "./stop-words.js";
8
+ export * from "./tf-idf.js";
7
9
  export * from "./tokenize.js";
10
+ export * from "./vec.js";
8
11
  export * from "./vocab.js";
9
12
  export * from "./xform.js";
10
13
  //# sourceMappingURL=index.d.ts.map
package/index.js CHANGED
@@ -1,9 +1,12 @@
1
+ export * from "./cluster.js";
1
2
  export * from "./frequencies.js";
2
3
  export * from "./ngrams.js";
3
4
  export * from "./replace.js";
4
5
  export * from "./similarity.js";
5
6
  export * from "./stem.js";
6
7
  export * from "./stop-words.js";
8
+ export * from "./tf-idf.js";
7
9
  export * from "./tokenize.js";
10
+ export * from "./vec.js";
8
11
  export * from "./vocab.js";
9
12
  export * from "./xform.js";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@thi.ng/text-analysis",
3
- "version": "0.1.0",
3
+ "version": "0.3.0",
4
4
  "description": "Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities",
5
5
  "type": "module",
6
6
  "module": "./index.js",
@@ -40,10 +40,13 @@
40
40
  },
41
41
  "dependencies": {
42
42
  "@thi.ng/api": "^8.11.29",
43
+ "@thi.ng/arrays": "^2.12.0",
43
44
  "@thi.ng/bidir-index": "^1.3.0",
44
45
  "@thi.ng/checks": "^3.7.9",
46
+ "@thi.ng/distance": "^3.0.0",
47
+ "@thi.ng/k-means": "^1.1.0",
45
48
  "@thi.ng/strings": "^3.9.14",
46
- "@thi.ng/transducers": "^9.4.0",
49
+ "@thi.ng/transducers": "^9.4.1",
47
50
  "@thi.ng/vectors": "^8.3.0"
48
51
  },
49
52
  "devDependencies": {
@@ -52,17 +55,28 @@
52
55
  "typescript": "^5.8.3"
53
56
  },
54
57
  "keywords": [
58
+ "analysis",
59
+ "centroid",
60
+ "cluster",
55
61
  "composition",
56
62
  "decode",
63
+ "dense",
57
64
  "encode",
65
+ "frequency",
58
66
  "functional",
67
+ "histogram",
68
+ "k-means",
59
69
  "ngram",
60
70
  "pipeline",
61
71
  "similarity",
72
+ "sparse",
62
73
  "stem",
63
74
  "text",
75
+ "tf-idf",
64
76
  "tokenizer",
77
+ "transducer",
65
78
  "typescript",
79
+ "vocabulary",
66
80
  "vector"
67
81
  ],
68
82
  "publishConfig": {
@@ -83,6 +97,12 @@
83
97
  ".": {
84
98
  "default": "./index.js"
85
99
  },
100
+ "./api": {
101
+ "default": "./api.js"
102
+ },
103
+ "./cluster": {
104
+ "default": "./cluster.js"
105
+ },
86
106
  "./frequencies": {
87
107
  "default": "./frequencies.js"
88
108
  },
@@ -101,9 +121,15 @@
101
121
  "./stop-words": {
102
122
  "default": "./stop-words.js"
103
123
  },
124
+ "./tf-idf": {
125
+ "default": "./tf-idf.js"
126
+ },
104
127
  "./tokenize": {
105
128
  "default": "./tokenize.js"
106
129
  },
130
+ "./vec": {
131
+ "default": "./vec.js"
132
+ },
107
133
  "./vocab": {
108
134
  "default": "./vocab.js"
109
135
  },
@@ -115,5 +141,5 @@
115
141
  "status": "alpha",
116
142
  "year": 2021
117
143
  },
118
- "gitHead": "93cdcd8db4d4669561a7f0ebc47697bdbfd04214\n"
144
+ "gitHead": "4635a24acc2623894887ca31189fdffda87ff9d3\n"
119
145
  }
package/similarity.d.ts CHANGED
@@ -8,7 +8,7 @@
8
8
  * @param a
9
9
  * @param b
10
10
  */
11
- export declare const cosineSimilarityDense: import("@thi.ng/vectors/api").DistanceFn;
11
+ export declare const cosineSimilarityDense: import("@thi.ng/vectors").DistanceFn;
12
12
  /**
13
13
  * Computes cosine similarity of given sparse multi-hot vectors.
14
14
  *
@@ -26,7 +26,7 @@ export declare const cosineSimilaritySparse: (a: ArrayLike<number>, b: ArrayLike
26
26
  * @param a
27
27
  * @param b
28
28
  */
29
- export declare const jaccardSimilarityDense: import("@thi.ng/vectors/api").DistanceFn;
29
+ export declare const jaccardSimilarityDense: import("@thi.ng/vectors").DistanceFn;
30
30
  /**
31
31
  * Computes Jaccard similarity of given sparse multi-hot vectors.
32
32
  *
@@ -44,7 +44,7 @@ export declare const jaccardSimilaritySparse: (a: ArrayLike<number>, b: ArrayLik
44
44
  * @param a
45
45
  * @param b
46
46
  */
47
- export declare const dotProductDense: import("@thi.ng/vectors/api").MultiVecOpRoVV<number>;
47
+ export declare const dotProductDense: import("@thi.ng/vectors").MultiVecOpRoVV<number>;
48
48
  /**
49
49
  * Computes dot product of the given sparse multi-hot vectors.
50
50
  *
@@ -62,7 +62,7 @@ export declare const dotProductSparse: (a: ArrayLike<number>, b: ArrayLike<numbe
62
62
  * @param a
63
63
  * @param b
64
64
  */
65
- export declare const distSqDense: import("@thi.ng/vectors/api").MultiVecOpRoVV<number>;
65
+ export declare const distSqDense: import("@thi.ng/vectors").MultiVecOpRoVV<number>;
66
66
  /**
67
67
  * Computes the squared L2 distance of the given sparse multi-hot vectors.
68
68
  *
package/tf-idf.d.ts ADDED
@@ -0,0 +1,101 @@
1
+ import type { Fn2 } from "@thi.ng/api";
2
+ import type { Vocab } from "./api.js";
3
+ export declare const tfCount: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
4
+ export declare const tfNormalized: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
5
+ export declare const tfLog: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
6
+ export declare const defIDF: (fnIDF: (count: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
7
+ export declare const idfClassic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
8
+ export declare const idfSmooth: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
9
+ export declare const idfProbabilistic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
10
+ /**
11
+ * Higher-order customizable tf-idf implementation, using provided fns for term
12
+ * frequency and inverse document frequency.
13
+ *
14
+ * @remarks
15
+ * See {@link tfidf} for default impl.
16
+ *
17
+ * Also see:
18
+ *
19
+ * - {@link tfCount}, {@link tfNormalized}, {@link tfLog}
20
+ * - {@link idfClassic}, {@link idfSmooth}, {@link idfProbabilistic}.
21
+ *
22
+ * References:
23
+ *
24
+ * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
25
+ *
26
+ *
27
+ * @param fnTF
28
+ * @param fnIDF
29
+ */
30
+ export declare const defTFIDF: (fnTF: Fn2<Vocab, string[], Map<string, number>>, fnIDF: Fn2<Vocab, string[][], Map<string, number>>) => (vocab: Vocab, tokenizedDocs: string[][]) => {
31
+ doc: string[];
32
+ tf: Map<string, number>;
33
+ idf: Map<string, number>;
34
+ tfidf: Map<string, number>;
35
+ }[];
36
+ /**
37
+ * Default tf-idf implementation, using {@link tfNormalized} for term
38
+ * frequency and {@link idfClassic} for inverse document frequency
39
+ * calculation.
40
+ *
41
+ * @remarks
42
+ * References:
43
+ *
44
+ * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
45
+ *
46
+ * Also see {@link defTFIDF}
47
+ *
48
+ * @param vocab
49
+ * @param tokenizedDocs
50
+ */
51
+ export declare const tfidf: (vocab: Vocab, tokenizedDocs: string[][]) => {
52
+ doc: string[];
53
+ tf: Map<string, number>;
54
+ idf: Map<string, number>;
55
+ tfidf: Map<string, number>;
56
+ }[];
57
+ /**
58
+ * Takes a vocab, an array of tokenized documents and a predicate function.
59
+ * Computes the IDF (Inverse Document Frequency, default: {@link idfClassic})
60
+ * and then filters each document using supplied predicate, which is called with
61
+ * a single word/token and its computed IDF. Only words are kept for which the
62
+ * predicate succeeds.
63
+ *
64
+ * @remarks
65
+ * The IDF for common words is close to zero. This function can be used as a
66
+ * pre-processing step for improved and more efficient vocabulary construction,
67
+ * vector encoding (e.g. via {@link encodeDense}), clustering etc. by pre-excluding
68
+ * tokens which do not contribute much information.
69
+ *
70
+ * @example
71
+ * ```ts tangle:../export/filter-docs-idf.ts
72
+ * import { filterDocsIDF } from "@thi.ng/text-analysis";
73
+ *
74
+ * const docs = [
75
+ * ["a", "b", "c"],
76
+ * ["a", "b", "d", "e"],
77
+ * ["b", "f", "g"],
78
+ * ["a", "b", "c", "f"],
79
+ * ["a", "g", "h"]
80
+ * ];
81
+ *
82
+ * // remove common words, i.e. those with an IDF below given threshold
83
+ * const filtered = filterDocsIDF(docs, (_, x) => x > 0.3);
84
+ *
85
+ * // show before & after
86
+ * for(let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]);
87
+ *
88
+ * // [ "a", "b", "c" ] => [ "c" ]
89
+ * // [ "a", "b", "d", "e" ] => [ "d", "e" ]
90
+ * // [ "b", "f", "g" ] => [ "f", "g" ]
91
+ * // [ "a", "b", "c", "f" ] => [ "c", "f" ]
92
+ * // [ "a", "g", "h" ] => [ "g", "h" ]
93
+ * ```
94
+ *
95
+ * @param docs
96
+ * @param pred
97
+ * @param vocab
98
+ * @param fnIDF
99
+ */
100
+ export declare const filterDocsIDF: (docs: string[][], pred: Fn2<string, number, boolean>, vocab?: Vocab, fnIDF?: Fn2<Vocab, string[][], Map<string, number>>) => string[][];
101
+ //# sourceMappingURL=tf-idf.d.ts.map
package/tf-idf.js ADDED
@@ -0,0 +1,63 @@
1
+ import { transduce } from "@thi.ng/transducers/transduce";
2
+ import { frequencies, normFrequencies } from "./frequencies.js";
3
+ import { vocabOnly } from "./xform.js";
4
+ import { defVocab } from "./vocab.js";
5
+ const { log10 } = Math;
6
+ const tfCount = (vocab, docTokens) => transduce(vocabOnly(vocab), frequencies(), docTokens);
7
+ const tfNormalized = (vocab, docTokens) => transduce(vocabOnly(vocab), normFrequencies(), docTokens);
8
+ const tfLog = (vocab, docTokens) => {
9
+ const res = transduce(vocabOnly(vocab), frequencies(), docTokens);
10
+ for (const [word, count] of res) res.set(word, log10(1 + count));
11
+ return res;
12
+ };
13
+ const defIDF = (fnIDF) => (vocab, tokenizedDocs) => {
14
+ const acc = /* @__PURE__ */ new Map();
15
+ for (const word of vocab.keys()) {
16
+ let n = 0;
17
+ for (const doc of tokenizedDocs) {
18
+ if (doc.includes(word)) n++;
19
+ }
20
+ acc.set(word, fnIDF(n, tokenizedDocs.length));
21
+ }
22
+ return acc;
23
+ };
24
+ const idfClassic = defIDF(
25
+ (count, numDocs) => Math.log10(numDocs / count)
26
+ );
27
+ const idfSmooth = defIDF(
28
+ (count, numDocs) => 1 + log10(numDocs / (1 + count))
29
+ );
30
+ const idfProbabilistic = defIDF(
31
+ (count, numDocs) => log10((numDocs - count) / count)
32
+ );
33
+ const defTFIDF = (fnTF, fnIDF) => (vocab, tokenizedDocs) => {
34
+ const idf = fnIDF(vocab, tokenizedDocs);
35
+ return tokenizedDocs.map((doc) => {
36
+ const tf = fnTF(vocab, doc);
37
+ const acc = /* @__PURE__ */ new Map();
38
+ for (const [word, f] of tf) {
39
+ acc.set(word, f * idf.get(word));
40
+ }
41
+ return { doc, tf, idf, tfidf: acc };
42
+ });
43
+ };
44
+ const tfidf = defTFIDF(tfNormalized, idfClassic);
45
+ const filterDocsIDF = (docs, pred, vocab, fnIDF = idfClassic) => {
46
+ if (!vocab) vocab = defVocab(docs);
47
+ const idf = fnIDF(vocab, docs);
48
+ return docs.map(
49
+ (doc) => doc.filter((word) => vocab.has(word) && pred(word, idf.get(word)))
50
+ );
51
+ };
52
+ export {
53
+ defIDF,
54
+ defTFIDF,
55
+ filterDocsIDF,
56
+ idfClassic,
57
+ idfProbabilistic,
58
+ idfSmooth,
59
+ tfCount,
60
+ tfLog,
61
+ tfNormalized,
62
+ tfidf
63
+ };
package/vec.d.ts ADDED
@@ -0,0 +1,150 @@
1
+ import type { ReadonlyVec } from "@thi.ng/vectors";
2
+ import type { Vocab } from "./api.js";
3
+ /**
4
+ * Encodes the given `doc` tokens into a dense multi-hot vector using provided
5
+ * `vocab` (e.g. created via {@link defVocab}). The vector size is the number of
6
+ * items in the vocab.
7
+ *
8
+ * @remarks
9
+ * Also see {@link encodeSparse}.
10
+ *
11
+ * @example
12
+ * ```ts tangle:../export/encode-dense.ts
13
+ * import { defVocab, encodeDense, tokenize } from "@thi.ng/text-analysis";
14
+ *
15
+ * const vocab = defVocab(
16
+ * tokenize("the quick brown fox jumps over the lazy dog")
17
+ * );
18
+ *
19
+ * console.log(encodeDense(vocab, tokenize("the brown dog jumps")));
20
+ * // [ 1, 0, 1, 0, 1, 0, 0, 1 ]
21
+ *
22
+ * console.log(encodeDense(vocab, tokenize("the lazy fox")));
23
+ * // [ 1, 0, 0, 1, 0, 0, 1, 0 ]
24
+ * ```
25
+ *
26
+ * @param vocab
27
+ * @param doc
28
+ */
29
+ export declare const encodeDense: (vocab: Vocab, doc: Iterable<string>) => ReadonlyVec;
30
+ /**
31
+ * Convenience function to create a vocabulary from given docs and encode each
32
+ * doc into a dense multi-hot vector (using {@link encodeDense}).
33
+ *
34
+ * @param docs
35
+ */
36
+ export declare const encodeAllDense: (docs: string[][]) => {
37
+ vocab: Vocab;
38
+ docs: ReadonlyVec[];
39
+ };
40
+ /**
41
+ * Encodes the given `src` tokens into a sparse vector using provided `vocab`
42
+ * (created via {@link defVocab}). Only the IDs of matched tokens are stored.
43
+ * The returned vector size depends on the number of used/matched tokens, at
44
+ * most `vocab.size` (if entire vocab is used by `src`).
45
+ *
46
+ * @remarks
47
+ * Also see {@link encodeDense} for alternative encoding.
48
+ *
49
+ * @example
50
+ * ```ts tangle:../export/encode-sparse.ts
51
+ * import { defVocab, encodeSparse, tokenize } from "@thi.ng/text-analysis";
52
+ *
53
+ * const vocab = defVocab(
54
+ * tokenize("the quick brown fox jumps over the lazy dog")
55
+ * );
56
+ *
57
+ * console.log(encodeSparse(vocab, tokenize("the brown dog jumps")));
58
+ * // [ 0, 2, 4, 7 ]
59
+ *
60
+ * console.log(encodeSparse(vocab, tokenize("the lazy fox")));
61
+ * // [ 0, 3, 6 ]
62
+ * ```
63
+ *
64
+ * @param vocab
65
+ * @param src
66
+ */
67
+ export declare const encodeSparse: (vocab: Vocab, src: Iterable<string>) => ReadonlyVec;
68
+ /**
69
+ * Convenience function to create a vocabulary from given docs and encode each
70
+ * doc into a sparse multi-hot vector (using {@link encodeSparse}).
71
+ *
72
+ * @param docs
73
+ */
74
+ export declare const encodeAllSparse: (docs: string[][]) => {
75
+ vocab: Vocab;
76
+ docs: ReadonlyVec[];
77
+ };
78
+ /**
79
+ * Reverse op of {@link encodeDense}. Decodes dense multi-hot vector to extract
80
+ * tokens from provided `vocab` (created via {@link defVocab}). The returned
81
+ * array only contains the corresponding tokens of the vector's non-zero
82
+ * components.
83
+ *
84
+ * @remarks
85
+ * Also see {@link decodeSparse}.
86
+ *
87
+ * @example
88
+ * ```ts tangle:../export/decode-dense.ts
89
+ * import { defVocab, decodeDense, tokenize } from "@thi.ng/text-analysis";
90
+ *
91
+ * const vocab = defVocab(
92
+ * tokenize("the quick brown fox jumps over the lazy dog")
93
+ * );
94
+ *
95
+ * console.log(decodeDense(vocab, [1, 0, 1, 0, 1, 0, 0, 1]));
96
+ * // [ "the", "brown", "jumps", "dog" ]
97
+ *
98
+ * console.log(decodeDense(vocab, [1, 0, 0, 1, 0, 0, 1, 0]));
99
+ * // [ "the", "fox", "lazy" ]
100
+ * ```
101
+ *
102
+ * @param vocab
103
+ * @param src
104
+ * @param sort
105
+ */
106
+ export declare const decodeDense: (vocab: Vocab, vec: Iterable<number>) => string[];
107
+ /**
108
+ * Reverse op of {@link encodeSparse}. Decodes sparse vector (created via
109
+ * {@link encodeSparse} to extract tokens from provided `vocab` (created via
110
+ * {@link defVocab}).
111
+ *
112
+ * @remarks
113
+ * Also see {@link decodeDense}.
114
+ *
115
+ * @example
116
+ * ```ts tangle:../export/decode-sparse.ts
117
+ * import { defVocab, decodeSparse, tokenize } from "@thi.ng/text-analysis";
118
+ *
119
+ * const vocab = defVocab(
120
+ * tokenize("the quick brown fox jumps over the lazy dog")
121
+ * );
122
+ *
123
+ * console.log(decodeSparse(vocab, [0, 2, 4, 7]));
124
+ * // [ "the", "brown", "jumps", "dog" ]
125
+ *
126
+ * console.log(decodeSparse(vocab, [0, 3, 6]));
127
+ * // [ "the", "fox", "lazy" ]
128
+ * ```
129
+ *
130
+ * @param vocab
131
+ * @param src
132
+ * @param sort
133
+ */
134
+ export declare const decodeSparse: (vocab: Vocab, vec: Iterable<number>) => string[];
135
+ /**
136
+ * Converts given multi-hot sparse vector (e.g. created via {@link encodeSparse})
137
+ * into a dense representation.
138
+ *
139
+ * @param dim
140
+ * @param sparse
141
+ */
142
+ export declare const toDense: (dim: number, sparse: ReadonlyVec) => ReadonlyVec;
143
+ /**
144
+ * Converts given multi-hot dense vector (e.g. created via {@link encodeDense})
145
+ * into a sparse representation.
146
+ *
147
+ * @param dense
148
+ */
149
+ export declare const toSparse: (dense: ReadonlyVec) => ReadonlyVec;
150
+ //# sourceMappingURL=vec.d.ts.map
package/vec.js ADDED
@@ -0,0 +1,49 @@
1
+ import { defVocab } from "./vocab.js";
2
+ const encodeDense = (vocab, doc) => toDense(vocab.size, vocab.getAll(doc));
3
+ const encodeAllDense = (docs) => {
4
+ const vocab = defVocab(docs);
5
+ return {
6
+ vocab,
7
+ docs: docs.map((x) => encodeDense(vocab, x))
8
+ };
9
+ };
10
+ const encodeSparse = (vocab, src) => [...vocab.getAllUnique(src)].sort((a, b) => a - b);
11
+ const encodeAllSparse = (docs) => {
12
+ const vocab = defVocab(docs);
13
+ return {
14
+ vocab,
15
+ docs: docs.map((x) => encodeSparse(vocab, x))
16
+ };
17
+ };
18
+ const decodeDense = (vocab, vec) => {
19
+ const res = [];
20
+ let i = 0;
21
+ for (const x of vec) {
22
+ if (x) res.push(vocab.getID(i));
23
+ i++;
24
+ }
25
+ return res;
26
+ };
27
+ const decodeSparse = (vocab, vec) => vocab.getAllIDs(vec);
28
+ const toDense = (dim, sparse) => {
29
+ const res = new Array(dim).fill(0);
30
+ for (const i of sparse) res[i] = 1;
31
+ return res;
32
+ };
33
+ const toSparse = (dense) => {
34
+ const res = [];
35
+ for (let i = 0, n = dense.length; i < n; i++) {
36
+ if (dense[i]) res.push(i);
37
+ }
38
+ return res;
39
+ };
40
+ export {
41
+ decodeDense,
42
+ decodeSparse,
43
+ encodeAllDense,
44
+ encodeAllSparse,
45
+ encodeDense,
46
+ encodeSparse,
47
+ toDense,
48
+ toSparse
49
+ };
package/vocab.d.ts CHANGED
@@ -1,10 +1,9 @@
1
- import { type BidirIndex, type SerializedBidirIndex } from "@thi.ng/bidir-index";
2
- export type Vocab = BidirIndex<string>;
3
- export type SerializedVocab = SerializedBidirIndex<string>;
1
+ import type { SerializedVocab, Vocab } from "./api.js";
4
2
  /**
5
3
  * Creates a bi-directional index storing unique tokens from given `src`
6
4
  * iterable, optionally using custom `start` ID offset (default: 0). This index
7
- * can then be used with {@link vectorize}, {@link vectorizeSparse}.
5
+ * can then be used with {@link encodeDense}, {@link encodeSparse} and related
6
+ * functions.
8
7
  *
9
8
  * @remarks
10
9
  * This function is syntax sugar for
@@ -43,7 +42,7 @@ export type SerializedVocab = SerializedBidirIndex<string>;
43
42
  * @param src
44
43
  * @param start
45
44
  */
46
- export declare function defVocab(src: Iterable<string>, start?: number): Vocab;
45
+ export declare function defVocab(src: Iterable<string> | Iterable<string>[], start?: number): Vocab;
47
46
  /**
48
47
  * (Re)creates bi-directional vocab index from previous serialized state (e.g.
49
48
  * via `vocab.toJSON()`).
@@ -51,116 +50,4 @@ export declare function defVocab(src: Iterable<string>, start?: number): Vocab;
51
50
  * @param vocab
52
51
  */
53
52
  export declare function defVocab(vocab: SerializedVocab): Vocab;
54
- /**
55
- * Encodes the given `src` tokens into a dense multi-hot vector using provided
56
- * `vocab` (created via {@link defVocab}). The vector size is the number of
57
- * items in the vocab.
58
- *
59
- * @remarks
60
- * Also see {@link encodeSparse}.
61
- *
62
- * @example
63
- * ```ts tangle:../export/encode-dense.ts
64
- * import { defVocab, encodeDense, tokenize } from "@thi.ng/text-analysis";
65
- *
66
- * const vocab = defVocab(
67
- * tokenize("the quick brown fox jumps over the lazy dog")
68
- * );
69
- *
70
- * console.log(encodeDense(vocab, tokenize("the brown dog jumps")));
71
- * // [ 1, 0, 1, 0, 1, 0, 0, 1 ]
72
- *
73
- * console.log(encodeDense(vocab, tokenize("the lazy fox")));
74
- * // [ 1, 0, 0, 1, 0, 0, 1, 0 ]
75
- * ```
76
- *
77
- * @param vocab
78
- * @param src
79
- */
80
- export declare const encodeDense: (vocab: Vocab, src: Iterable<string>) => any[];
81
- /**
82
- * Encodes the given `src` tokens into a sparse vector using provided `vocab`
83
- * (created via {@link defVocab}). Only the IDs of matched tokens are stored.
84
- * The returned vector size depends on the number of used/matched tokens, at
85
- * most `vocab.size` (if entire vocab is used by `src`).
86
- *
87
- * @remarks
88
- * Also see {@link encodeDense} for alternative encoding.
89
- *
90
- * @example
91
- * ```ts tangle:../export/encode-sparse.ts
92
- * import { defVocab, encodeSparse, tokenize } from "@thi.ng/text-analysis";
93
- *
94
- * const vocab = defVocab(
95
- * tokenize("the quick brown fox jumps over the lazy dog")
96
- * );
97
- *
98
- * console.log(encodeSparse(vocab, tokenize("the brown dog jumps")));
99
- * // [ 0, 2, 4, 7 ]
100
- *
101
- * console.log(encodeSparse(vocab, tokenize("the lazy fox")));
102
- * // [ 0, 3, 6 ]
103
- * ```
104
- *
105
- * @param vocab
106
- * @param src
107
- */
108
- export declare const encodeSparse: (vocab: Vocab, src: Iterable<string>) => number[];
109
- /**
110
- * Reverse op of {@link encodeDense}. Decodes dense multi-hot vector to extract
111
- * tokens from provided `vocab` (created via {@link defVocab}). The returned
112
- * array only contains the corresponding tokens of the vector's non-zero
113
- * components.
114
- *
115
- * @remarks
116
- * Also see {@link decodeSparse}.
117
- *
118
- * @example
119
- * ```ts tangle:../export/decode-dense.ts
120
- * import { defVocab, decodeDense, tokenize } from "@thi.ng/text-analysis";
121
- *
122
- * const vocab = defVocab(
123
- * tokenize("the quick brown fox jumps over the lazy dog")
124
- * );
125
- *
126
- * console.log(decodeDense(vocab, [1, 0, 1, 0, 1, 0, 0, 1]));
127
- * // [ "the", "brown", "jumps", "dog" ]
128
- *
129
- * console.log(decodeDense(vocab, [1, 0, 0, 1, 0, 0, 1, 0]));
130
- * // [ "the", "fox", "lazy" ]
131
- * ```
132
- *
133
- * @param vocab
134
- * @param src
135
- * @param sort
136
- */
137
- export declare const decodeDense: (vocab: Vocab, vec: Iterable<number>) => string[];
138
- /**
139
- * Reverse op of {@link encodeSparse}. Decodes sparse vector (created via
140
- * {@link encodeSparse} to extract tokens from provided `vocab` (created via
141
- * {@link defVocab}).
142
- *
143
- * @remarks
144
- * Also see {@link decodeDense}.
145
- *
146
- * @example
147
- * ```ts tangle:../export/decode-sparse.ts
148
- * import { defVocab, decodeSparse, tokenize } from "@thi.ng/text-analysis";
149
- *
150
- * const vocab = defVocab(
151
- * tokenize("the quick brown fox jumps over the lazy dog")
152
- * );
153
- *
154
- * console.log(decodeSparse(vocab, [0, 2, 4, 7]));
155
- * // [ "the", "brown", "jumps", "dog" ]
156
- *
157
- * console.log(decodeSparse(vocab, [0, 3, 6]));
158
- * // [ "the", "fox", "lazy" ]
159
- * ```
160
- *
161
- * @param vocab
162
- * @param src
163
- * @param sort
164
- */
165
- export declare const decodeSparse: (vocab: Vocab, vec: Iterable<number>) => string[];
166
53
  //# sourceMappingURL=vocab.d.ts.map
package/vocab.js CHANGED
@@ -1,31 +1,13 @@
1
- import {
2
- bidirIndexFromJSON,
3
- defBidirIndex
4
- } from "@thi.ng/bidir-index";
1
+ import { bidirIndexFromJSON, defBidirIndex } from "@thi.ng/bidir-index";
5
2
  import { isIterable } from "@thi.ng/checks/is-iterable";
3
+ import { isString } from "@thi.ng/checks/is-string";
4
+ import { mapcat } from "@thi.ng/transducers/mapcat";
6
5
  function defVocab(src, start) {
7
- return isIterable(src) ? defBidirIndex(src, { start }) : bidirIndexFromJSON(src);
6
+ return isIterable(src) ? defBidirIndex(
7
+ mapcat((x) => isString(x) ? [x] : x, src),
8
+ { start }
9
+ ) : bidirIndexFromJSON(src);
8
10
  }
9
- const encodeDense = (vocab, src) => {
10
- const vec = new Array(vocab.size).fill(0);
11
- for (let i of vocab.getAll(src)) vec[i] = 1;
12
- return vec;
13
- };
14
- const encodeSparse = (vocab, src) => [...vocab.getAllUnique(src)].sort((a, b) => a - b);
15
- const decodeDense = (vocab, vec) => {
16
- const res = [];
17
- let i = 0;
18
- for (let x of vec) {
19
- if (x) res.push(vocab.getID(i));
20
- i++;
21
- }
22
- return res;
23
- };
24
- const decodeSparse = (vocab, vec) => vocab.getAllIDs(vec);
25
11
  export {
26
- decodeDense,
27
- decodeSparse,
28
- defVocab,
29
- encodeDense,
30
- encodeSparse
12
+ defVocab
31
13
  };
package/xform.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import type { Vocab } from "./vocab.js";
1
+ import type { Vocab } from "./api.js";
2
2
  /**
3
3
  * Transducer to produce lowercase string.
4
4
  */