@thi.ng/text-analysis 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -1
- package/README.md +4 -1
- package/api.d.ts +4 -0
- package/api.js +0 -0
- package/cluster.d.ts +93 -0
- package/cluster.js +51 -0
- package/index.d.ts +3 -0
- package/index.js +3 -0
- package/package.json +29 -3
- package/similarity.d.ts +4 -4
- package/tf-idf.d.ts +101 -0
- package/tf-idf.js +63 -0
- package/vec.d.ts +150 -0
- package/vec.js +49 -0
- package/vocab.d.ts +4 -117
- package/vocab.js +8 -26
- package/xform.d.ts +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Change Log
|
|
2
2
|
|
|
3
|
-
- **Last updated**: 2025-06-
|
|
3
|
+
- **Last updated**: 2025-06-14T20:56:27Z
|
|
4
4
|
- **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
|
|
5
5
|
|
|
6
6
|
All notable changes to this project will be documented in this file.
|
|
@@ -11,6 +11,19 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
|
|
|
11
11
|
**Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
|
|
12
12
|
and/or version bumps of transitive dependencies.
|
|
13
13
|
|
|
14
|
+
## [0.2.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.2.0) (2025-06-14)
|
|
15
|
+
|
|
16
|
+
#### 🚀 Features
|
|
17
|
+
|
|
18
|
+
- add/migrate refactored tf-idf functions ([d311acc](https://github.com/thi-ng/umbrella/commit/d311acc))
|
|
19
|
+
- add/update vocab & vector encoding helpers, restructure ([9e4f60c](https://github.com/thi-ng/umbrella/commit/9e4f60c))
|
|
20
|
+
- add filterDocsIDF() ([f682b58](https://github.com/thi-ng/umbrella/commit/f682b58))
|
|
21
|
+
- add k-mean clustering fns ([3533843](https://github.com/thi-ng/umbrella/commit/3533843))
|
|
22
|
+
|
|
23
|
+
#### ♻️ Refactoring
|
|
24
|
+
|
|
25
|
+
- update imports/exports ([a44be87](https://github.com/thi-ng/umbrella/commit/a44be87))
|
|
26
|
+
|
|
14
27
|
## [0.1.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.1.0) (2025-06-09)
|
|
15
28
|
|
|
16
29
|
#### 🚀 Features
|
package/README.md
CHANGED
|
@@ -58,13 +58,16 @@ For Node.js REPL:
|
|
|
58
58
|
const ta = await import("@thi.ng/text-analysis");
|
|
59
59
|
```
|
|
60
60
|
|
|
61
|
-
Package sizes (brotli'd, pre-treeshake): ESM:
|
|
61
|
+
Package sizes (brotli'd, pre-treeshake): ESM: 3.30 KB
|
|
62
62
|
|
|
63
63
|
## Dependencies
|
|
64
64
|
|
|
65
65
|
- [@thi.ng/api](https://github.com/thi-ng/umbrella/tree/develop/packages/api)
|
|
66
|
+
- [@thi.ng/arrays](https://github.com/thi-ng/umbrella/tree/develop/packages/arrays)
|
|
66
67
|
- [@thi.ng/bidir-index](https://github.com/thi-ng/umbrella/tree/develop/packages/bidir-index)
|
|
67
68
|
- [@thi.ng/checks](https://github.com/thi-ng/umbrella/tree/develop/packages/checks)
|
|
69
|
+
- [@thi.ng/distance](https://github.com/thi-ng/umbrella/tree/develop/packages/distance)
|
|
70
|
+
- [@thi.ng/k-means](https://github.com/thi-ng/umbrella/tree/develop/packages/k-means)
|
|
68
71
|
- [@thi.ng/strings](https://github.com/thi-ng/umbrella/tree/develop/packages/strings)
|
|
69
72
|
- [@thi.ng/transducers](https://github.com/thi-ng/umbrella/tree/develop/packages/transducers)
|
|
70
73
|
- [@thi.ng/vectors](https://github.com/thi-ng/umbrella/tree/develop/packages/vectors)
|
package/api.d.ts
ADDED
package/api.js
ADDED
|
File without changes
|
package/cluster.d.ts
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { Untransformed } from "@thi.ng/distance/untransformed";
|
|
2
|
+
import { type KMeansOpts } from "@thi.ng/k-means";
|
|
3
|
+
import type { ReadonlyVec } from "@thi.ng/vectors";
|
|
4
|
+
import type { Vocab } from "./api.js";
|
|
5
|
+
/**
|
|
6
|
+
* Jaccard distance metric wrapper for {@link kmeansDense}
|
|
7
|
+
*/
|
|
8
|
+
export declare const JACCARD_DIST_DENSE: Untransformed<ReadonlyVec>;
|
|
9
|
+
/**
|
|
10
|
+
* k-means clustering for dense multi-hot vectors. Uses thi.ng/k-means for
|
|
11
|
+
* actual clustering and squared L2 as default distance metric.
|
|
12
|
+
* Default max. iterations = 100.
|
|
13
|
+
*
|
|
14
|
+
* @remarks
|
|
15
|
+
* Use {@link JACCARD_DIST_DENSE} for alternative distance metric.
|
|
16
|
+
*
|
|
17
|
+
* @param k
|
|
18
|
+
* @param docs
|
|
19
|
+
* @param opts
|
|
20
|
+
*/
|
|
21
|
+
export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partial<KMeansOpts>) => import("@thi.ng/k-means").Cluster[];
|
|
22
|
+
/**
|
|
23
|
+
* k-means clustering for sparse multi-hot vectors. First converts vectors into
|
|
24
|
+
* dense versions (using {@link toDense}), then calls {@link kmeansDense} to
|
|
25
|
+
* perform the clustering.
|
|
26
|
+
*
|
|
27
|
+
* @remarks
|
|
28
|
+
* Since sparse vector sizes vary, the number of dimensions used (aka the
|
|
29
|
+
* vocabulary size) MUST be given via `opts`.
|
|
30
|
+
*
|
|
31
|
+
* @param k
|
|
32
|
+
* @param docs
|
|
33
|
+
* @param opts
|
|
34
|
+
*/
|
|
35
|
+
export declare const kmeansSparse: (k: number, docs: ReadonlyVec[], opts: Partial<KMeansOpts> & {
|
|
36
|
+
dim: number;
|
|
37
|
+
}) => import("@thi.ng/k-means").Cluster[];
|
|
38
|
+
export declare function clusterBounds(docs: ReadonlyVec[]): {
|
|
39
|
+
centroid: ReadonlyVec;
|
|
40
|
+
radius: number;
|
|
41
|
+
};
|
|
42
|
+
export declare function clusterBounds(docs: ReadonlyVec[], ids: number[]): {
|
|
43
|
+
centroid: ReadonlyVec;
|
|
44
|
+
radius: number;
|
|
45
|
+
};
|
|
46
|
+
/**
|
|
47
|
+
* Takes a vocab and array of docs encoded as dense multi-hot vectors. Computes
|
|
48
|
+
* centroid of given docs and then calls {@link centralTermsVec} to return the
|
|
49
|
+
* `k`-most central terms (or fewer if there are insufficient non-zero vector
|
|
50
|
+
* components).
|
|
51
|
+
*
|
|
52
|
+
* @example
|
|
53
|
+
* ```ts tangle:../export/central-terms.ts
|
|
54
|
+
* import { centralTerms, encodeAllDense } from "@thi.ng/text-analysis";
|
|
55
|
+
*
|
|
56
|
+
* const inputs = [
|
|
57
|
+
* ["a", "b", "c"],
|
|
58
|
+
* ["a", "b", "d", "e"],
|
|
59
|
+
* ["b", "f", "g"],
|
|
60
|
+
* ["a", "b", "c", "f"],
|
|
61
|
+
* ["a", "g", "h"]
|
|
62
|
+
* ];
|
|
63
|
+
*
|
|
64
|
+
* // create vocab & encode documents into multi-hot vectors
|
|
65
|
+
* const { vocab, docs } = encodeAllDense(inputs);
|
|
66
|
+
*
|
|
67
|
+
* // extract top-4 common terms
|
|
68
|
+
* console.log(centralTerms(vocab, 4, docs));
|
|
69
|
+
* // [ "b", "a", "g", "f" ]
|
|
70
|
+
* ```
|
|
71
|
+
*
|
|
72
|
+
* @param vocab
|
|
73
|
+
* @param k
|
|
74
|
+
* @param docs
|
|
75
|
+
*/
|
|
76
|
+
export declare const centralTerms: (vocab: Vocab, k: number, docs: ReadonlyVec[]) => string[];
|
|
77
|
+
/**
|
|
78
|
+
* Takes a vocab and dense vector representing a point in the n-dimensional
|
|
79
|
+
* space of the given vocab. Returns an array of terms corresponding to the `k`
|
|
80
|
+
* largest non-zero components of the vector (or fewer if there are insufficient
|
|
81
|
+
* non-zero vector components).
|
|
82
|
+
*
|
|
83
|
+
* @remarks
|
|
84
|
+
* Also see {@link centralTerms} (incl. code example).
|
|
85
|
+
*
|
|
86
|
+
* @param vocab
|
|
87
|
+
* @param k
|
|
88
|
+
* @param centroid
|
|
89
|
+
*/
|
|
90
|
+
export declare const centralTermsVec: (vocab: Vocab, k: number, centroid: ReadonlyVec) => string[];
|
|
91
|
+
export declare const knearest: (query: ReadonlyVec, k: number, r?: number, dist?: Untransformed<ReadonlyVec>, sorted?: boolean) => import("@thi.ng/distance").KNearest<ReadonlyVec, unknown>;
|
|
92
|
+
export declare const knearestDocs: (query: ReadonlyVec, k: number, docs: ReadonlyVec[], r?: number, dist?: Untransformed<ReadonlyVec>, sorted?: boolean) => [ReadonlyVec, number][];
|
|
93
|
+
//# sourceMappingURL=cluster.d.ts.map
|
package/cluster.js
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { argSort } from "@thi.ng/arrays/arg-sort";
|
|
2
|
+
import { lookup } from "@thi.ng/arrays/lookup";
|
|
3
|
+
import { knearest as $knearest } from "@thi.ng/distance/knearest";
|
|
4
|
+
import { Untransformed } from "@thi.ng/distance/untransformed";
|
|
5
|
+
import { kmeans } from "@thi.ng/k-means";
|
|
6
|
+
import { map } from "@thi.ng/transducers/map";
|
|
7
|
+
import { max } from "@thi.ng/transducers/max";
|
|
8
|
+
import { transduce } from "@thi.ng/transducers/transduce";
|
|
9
|
+
import { distJaccard } from "@thi.ng/vectors/dist-jaccard";
|
|
10
|
+
import { distSq } from "@thi.ng/vectors/distsq";
|
|
11
|
+
import { mean } from "@thi.ng/vectors/mean";
|
|
12
|
+
import { toDense } from "./vec.js";
|
|
13
|
+
const JACCARD_DIST_DENSE = new Untransformed(distJaccard);
|
|
14
|
+
/**
 * Clusters dense document vectors by delegating to `kmeans` from
 * `@thi.ng/k-means`.
 *
 * `maxIter` defaults to 100; every field present in `opts` (including
 * `maxIter`) overrides that default.
 *
 * @param k - number of clusters, passed through to `kmeans`
 * @param docs - dense document vectors
 * @param opts - optional k-means options
 * @returns whatever `kmeans` returns for the given inputs
 */
const kmeansDense = (k, docs, opts) => {
  const config = { maxIter: 100, ...opts };
  return kmeans(k, docs, config);
};
|
|
15
|
+
/**
 * Clusters sparse document vectors: each one is first expanded via
 * `toDense(opts.dim, doc)`, then the dense vectors are handed to
 * `kmeansDense`.
 *
 * @param k - number of clusters
 * @param docs - sparse document vectors
 * @param opts - k-means options; `opts.dim` is read for every document and is
 * used as the size of the dense vectors. The whole object (incl. `dim`) is
 * forwarded to `kmeansDense`.
 */
const kmeansSparse = (k, docs, opts) => {
  // `opts.dim` is deliberately read inside the callback (once per doc)
  const densify = (sparse) => toDense(opts.dim, sparse);
  return kmeansDense(k, docs.map(densify), opts);
};
|
|
20
|
+
/**
 * Computes the centroid of the given docs and the largest
 * `distSq(centroid, doc)` over those docs.
 *
 * @param docs - document vectors
 * @param ids - optional indices; if given, only `lookup(docs, ids)` is used
 * @returns object with `centroid` (result of `mean([], docs)`) and `radius`.
 * NOTE(review): `radius` is the maximum of `distSq`, i.e. presumably a
 * *squared* distance — confirm against `@thi.ng/vectors/distsq`.
 */
function clusterBounds(docs, ids) {
  const members = ids ? lookup(docs, ids) : docs;
  const centroid = mean([], members);
  const toCentroid = map((doc) => distSq(centroid, doc));
  const radius = transduce(toCentroid, max(), members);
  return { centroid, radius };
}
|
|
32
|
+
/**
 * Computes the centroid of `docs` via `mean([], docs)` and returns the result
 * of `centralTermsVec` for that centroid.
 *
 * @param vocab - vocabulary used to map vector components to terms
 * @param k - max. number of terms
 * @param docs - dense document vectors
 */
const centralTerms = (vocab, k, docs) => {
  const centroid = mean([], docs);
  return centralTermsVec(vocab, k, centroid);
};
|
|
33
|
+
const centralTermsVec = (vocab, k, centroid) => vocab.getAllIDs(
|
|
34
|
+
argSort(centroid, (a, b) => b - a).slice(0, k).filter(Boolean)
|
|
35
|
+
);
|
|
36
|
+
/**
 * Wrapper for `knearest` from `@thi.ng/distance` (imported as `$knearest`)
 * which only supplies defaults: `r = Infinity`, `dist = JACCARD_DIST_DENSE`,
 * `sorted = false`. All arguments are forwarded unchanged.
 *
 * @param query - query vector
 * @param k - max. number of neighbors
 * @param r - forwarded as third argument
 * @param dist - distance metric
 * @param sorted - forwarded as fifth argument
 */
const knearest = (query, k, r = Infinity, dist = JACCARD_DIST_DENSE, sorted = false) => {
  return $knearest(query, k, r, dist, sorted);
};
|
|
37
|
+
/**
 * Creates a neighborhood via `$knearest(query, k, r, dist, sorted)`, offers it
 * every doc together with its array index, then maps the neighborhood's
 * result tuples back to the docs.
 *
 * @param query - query vector
 * @param k - max. number of neighbors
 * @param docs - candidate document vectors
 * @param r - forwarded to `$knearest` (default: Infinity)
 * @param dist - distance metric (default: `JACCARD_DIST_DENSE`)
 * @param sorted - forwarded to `$knearest` (default: false)
 * @returns array of `[doc, x]` pairs, where `x` is the first item of each
 * tuple returned by `deref()` (presumably the distance — confirm against the
 * `@thi.ng/distance` docs)
 */
const knearestDocs = (query, k, docs, r = Infinity, dist = JACCARD_DIST_DENSE, sorted = false) => {
  const acc = $knearest(query, k, r, dist, sorted);
  for (const [idx, doc] of docs.entries()) {
    // store the index (not the doc) as value, so results can be mapped back
    acc.consider(doc, idx);
  }
  return acc.deref().map(([first, idx]) => [docs[idx], first]);
};
|
|
42
|
+
export {
|
|
43
|
+
JACCARD_DIST_DENSE,
|
|
44
|
+
centralTerms,
|
|
45
|
+
centralTermsVec,
|
|
46
|
+
clusterBounds,
|
|
47
|
+
kmeansDense,
|
|
48
|
+
kmeansSparse,
|
|
49
|
+
knearest,
|
|
50
|
+
knearestDocs
|
|
51
|
+
};
|
package/index.d.ts
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
|
+
export * from "./cluster.js";
|
|
1
2
|
export * from "./frequencies.js";
|
|
2
3
|
export * from "./ngrams.js";
|
|
3
4
|
export * from "./replace.js";
|
|
4
5
|
export * from "./similarity.js";
|
|
5
6
|
export * from "./stem.js";
|
|
6
7
|
export * from "./stop-words.js";
|
|
8
|
+
export * from "./tf-idf.js";
|
|
7
9
|
export * from "./tokenize.js";
|
|
10
|
+
export * from "./vec.js";
|
|
8
11
|
export * from "./vocab.js";
|
|
9
12
|
export * from "./xform.js";
|
|
10
13
|
//# sourceMappingURL=index.d.ts.map
|
package/index.js
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
|
+
export * from "./cluster.js";
|
|
1
2
|
export * from "./frequencies.js";
|
|
2
3
|
export * from "./ngrams.js";
|
|
3
4
|
export * from "./replace.js";
|
|
4
5
|
export * from "./similarity.js";
|
|
5
6
|
export * from "./stem.js";
|
|
6
7
|
export * from "./stop-words.js";
|
|
8
|
+
export * from "./tf-idf.js";
|
|
7
9
|
export * from "./tokenize.js";
|
|
10
|
+
export * from "./vec.js";
|
|
8
11
|
export * from "./vocab.js";
|
|
9
12
|
export * from "./xform.js";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@thi.ng/text-analysis",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"module": "./index.js",
|
|
@@ -40,10 +40,13 @@
|
|
|
40
40
|
},
|
|
41
41
|
"dependencies": {
|
|
42
42
|
"@thi.ng/api": "^8.11.29",
|
|
43
|
+
"@thi.ng/arrays": "^2.12.0",
|
|
43
44
|
"@thi.ng/bidir-index": "^1.3.0",
|
|
44
45
|
"@thi.ng/checks": "^3.7.9",
|
|
46
|
+
"@thi.ng/distance": "^3.0.0",
|
|
47
|
+
"@thi.ng/k-means": "^1.1.0",
|
|
45
48
|
"@thi.ng/strings": "^3.9.14",
|
|
46
|
-
"@thi.ng/transducers": "^9.4.
|
|
49
|
+
"@thi.ng/transducers": "^9.4.1",
|
|
47
50
|
"@thi.ng/vectors": "^8.3.0"
|
|
48
51
|
},
|
|
49
52
|
"devDependencies": {
|
|
@@ -52,17 +55,28 @@
|
|
|
52
55
|
"typescript": "^5.8.3"
|
|
53
56
|
},
|
|
54
57
|
"keywords": [
|
|
58
|
+
"analysis",
|
|
59
|
+
"centroid",
|
|
60
|
+
"cluster",
|
|
55
61
|
"composition",
|
|
56
62
|
"decode",
|
|
63
|
+
"dense",
|
|
57
64
|
"encode",
|
|
65
|
+
"frequency",
|
|
58
66
|
"functional",
|
|
67
|
+
"histogram",
|
|
68
|
+
"k-means",
|
|
59
69
|
"ngram",
|
|
60
70
|
"pipeline",
|
|
61
71
|
"similarity",
|
|
72
|
+
"sparse",
|
|
62
73
|
"stem",
|
|
63
74
|
"text",
|
|
75
|
+
"tf-idf",
|
|
64
76
|
"tokenizer",
|
|
77
|
+
"transducer",
|
|
65
78
|
"typescript",
|
|
79
|
+
"vocabulary",
|
|
66
80
|
"vector"
|
|
67
81
|
],
|
|
68
82
|
"publishConfig": {
|
|
@@ -83,6 +97,12 @@
|
|
|
83
97
|
".": {
|
|
84
98
|
"default": "./index.js"
|
|
85
99
|
},
|
|
100
|
+
"./api": {
|
|
101
|
+
"default": "./api.js"
|
|
102
|
+
},
|
|
103
|
+
"./cluster": {
|
|
104
|
+
"default": "./cluster.js"
|
|
105
|
+
},
|
|
86
106
|
"./frequencies": {
|
|
87
107
|
"default": "./frequencies.js"
|
|
88
108
|
},
|
|
@@ -101,9 +121,15 @@
|
|
|
101
121
|
"./stop-words": {
|
|
102
122
|
"default": "./stop-words.js"
|
|
103
123
|
},
|
|
124
|
+
"./tf-idf": {
|
|
125
|
+
"default": "./tf-idf.js"
|
|
126
|
+
},
|
|
104
127
|
"./tokenize": {
|
|
105
128
|
"default": "./tokenize.js"
|
|
106
129
|
},
|
|
130
|
+
"./vec": {
|
|
131
|
+
"default": "./vec.js"
|
|
132
|
+
},
|
|
107
133
|
"./vocab": {
|
|
108
134
|
"default": "./vocab.js"
|
|
109
135
|
},
|
|
@@ -115,5 +141,5 @@
|
|
|
115
141
|
"status": "alpha",
|
|
116
142
|
"year": 2021
|
|
117
143
|
},
|
|
118
|
-
"gitHead": "
|
|
144
|
+
"gitHead": "14e994e531d32053e948768998324d443436a542\n"
|
|
119
145
|
}
|
package/similarity.d.ts
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* @param a
|
|
9
9
|
* @param b
|
|
10
10
|
*/
|
|
11
|
-
export declare const cosineSimilarityDense: import("@thi.ng/vectors
|
|
11
|
+
export declare const cosineSimilarityDense: import("@thi.ng/vectors").DistanceFn;
|
|
12
12
|
/**
|
|
13
13
|
* Computes cosine similarity of given sparse multi-hot vectors.
|
|
14
14
|
*
|
|
@@ -26,7 +26,7 @@ export declare const cosineSimilaritySparse: (a: ArrayLike<number>, b: ArrayLike
|
|
|
26
26
|
* @param a
|
|
27
27
|
* @param b
|
|
28
28
|
*/
|
|
29
|
-
export declare const jaccardSimilarityDense: import("@thi.ng/vectors
|
|
29
|
+
export declare const jaccardSimilarityDense: import("@thi.ng/vectors").DistanceFn;
|
|
30
30
|
/**
|
|
31
31
|
* Computes Jaccard similarity of given sparse multi-hot vectors.
|
|
32
32
|
*
|
|
@@ -44,7 +44,7 @@ export declare const jaccardSimilaritySparse: (a: ArrayLike<number>, b: ArrayLik
|
|
|
44
44
|
* @param a
|
|
45
45
|
* @param b
|
|
46
46
|
*/
|
|
47
|
-
export declare const dotProductDense: import("@thi.ng/vectors
|
|
47
|
+
export declare const dotProductDense: import("@thi.ng/vectors").MultiVecOpRoVV<number>;
|
|
48
48
|
/**
|
|
49
49
|
* Computes dot product of the given sparse multi-hot vectors.
|
|
50
50
|
*
|
|
@@ -62,7 +62,7 @@ export declare const dotProductSparse: (a: ArrayLike<number>, b: ArrayLike<numbe
|
|
|
62
62
|
* @param a
|
|
63
63
|
* @param b
|
|
64
64
|
*/
|
|
65
|
-
export declare const distSqDense: import("@thi.ng/vectors
|
|
65
|
+
export declare const distSqDense: import("@thi.ng/vectors").MultiVecOpRoVV<number>;
|
|
66
66
|
/**
|
|
67
67
|
* Computes the squared L2 distance of the given sparse multi-hot vectors.
|
|
68
68
|
*
|
package/tf-idf.d.ts
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import type { Fn2 } from "@thi.ng/api";
|
|
2
|
+
import type { Vocab } from "./api.js";
|
|
3
|
+
export declare const tfCount: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
4
|
+
export declare const tfNormalized: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
5
|
+
export declare const tfLog: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
6
|
+
export declare const defIDF: (fnIDF: (count: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
7
|
+
export declare const idfClassic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
8
|
+
export declare const idfSmooth: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
9
|
+
export declare const idfProbabilistic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
10
|
+
/**
|
|
11
|
+
* Higher-order customizable tf-idf implementation, using provided fns for term
|
|
12
|
+
* frequency and inverse document frequency.
|
|
13
|
+
*
|
|
14
|
+
* @remarks
|
|
15
|
+
* See {@link tfidf} for default impl.
|
|
16
|
+
*
|
|
17
|
+
* Also see:
|
|
18
|
+
*
|
|
19
|
+
* - {@link tfCount}, {@link tfNormalized}, {@link tfLog}
|
|
20
|
+
* - {@link idfClassic}, {@link idfSmooth}, {@link idfProbabilistic}.
|
|
21
|
+
*
|
|
22
|
+
* References:
|
|
23
|
+
*
|
|
24
|
+
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
|
|
25
|
+
*
|
|
26
|
+
*
|
|
27
|
+
* @param fnTF
|
|
28
|
+
* @param fnIDF
|
|
29
|
+
*/
|
|
30
|
+
export declare const defTFIDF: (fnTF: Fn2<Vocab, string[], Map<string, number>>, fnIDF: Fn2<Vocab, string[][], Map<string, number>>) => (vocab: Vocab, tokenizedDocs: string[][]) => {
|
|
31
|
+
doc: string[];
|
|
32
|
+
tf: Map<string, number>;
|
|
33
|
+
idf: Map<string, number>;
|
|
34
|
+
tfidf: Map<string, number>;
|
|
35
|
+
}[];
|
|
36
|
+
/**
|
|
37
|
+
* Default tf-idf implementation, using {@link tfNormalized} for term
|
|
38
|
+
* frequency and {@link idfClassic} for inverse document frequency
|
|
39
|
+
* calculation.
|
|
40
|
+
*
|
|
41
|
+
* @remarks
|
|
42
|
+
* References:
|
|
43
|
+
*
|
|
44
|
+
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
|
|
45
|
+
*
|
|
46
|
+
* Also see {@link defTFIDF}
|
|
47
|
+
*
|
|
48
|
+
* @param vocab
|
|
49
|
+
* @param tokenizedDocs
|
|
50
|
+
*/
|
|
51
|
+
export declare const tfidf: (vocab: Vocab, tokenizedDocs: string[][]) => {
|
|
52
|
+
doc: string[];
|
|
53
|
+
tf: Map<string, number>;
|
|
54
|
+
idf: Map<string, number>;
|
|
55
|
+
tfidf: Map<string, number>;
|
|
56
|
+
}[];
|
|
57
|
+
/**
|
|
58
|
+
* Takes a vocab, an array of tokenized documents and a predicate function.
|
|
59
|
+
* Computes the IDF (Inverse Document Frequency, default: {@link idfClassic})
|
|
60
|
+
* and then filters each document using supplied predicate, which is called with
|
|
61
|
+
* a single word/token and its computed IDF. Only words are kept for which the
|
|
62
|
+
* predicate succeeds.
|
|
63
|
+
*
|
|
64
|
+
* @remarks
|
|
65
|
+
* The IDF for common words is close to zero. This function can be used as a
|
|
66
|
+
* pre-processing step for improved and more efficient vocabulary construction,
|
|
67
|
+
* vector encoding (e.g. via {@link encodeDense}), clustering etc. by pre-excluding
|
|
68
|
+
* tokens which do not contribute much information.
|
|
69
|
+
*
|
|
70
|
+
* @example
|
|
71
|
+
* ```ts tangle:../export/filter-docs-idf.ts
|
|
72
|
+
* import { filterDocsIDF } from "@thi.ng/text-analysis";
|
|
73
|
+
*
|
|
74
|
+
* const docs = [
|
|
75
|
+
* ["a", "b", "c"],
|
|
76
|
+
* ["a", "b", "d", "e"],
|
|
77
|
+
* ["b", "f", "g"],
|
|
78
|
+
* ["a", "b", "c", "f"],
|
|
79
|
+
* ["a", "g", "h"]
|
|
80
|
+
* ];
|
|
81
|
+
*
|
|
82
|
+
* // remove common words, i.e. those with an IDF below given threshold
|
|
83
|
+
* const filtered = filterDocsIDF(docs, (_, x) => x > 0.3);
|
|
84
|
+
*
|
|
85
|
+
* // show before & after
|
|
86
|
+
* for(let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]);
|
|
87
|
+
*
|
|
88
|
+
* // [ "a", "b", "c" ] => [ "c" ]
|
|
89
|
+
* // [ "a", "b", "d", "e" ] => [ "d", "e" ]
|
|
90
|
+
* // [ "b", "f", "g" ] => [ "f", "g" ]
|
|
91
|
+
* // [ "a", "b", "c", "f" ] => [ "c", "f" ]
|
|
92
|
+
* // [ "a", "g", "h" ] => [ "g", "h" ]
|
|
93
|
+
* ```
|
|
94
|
+
*
|
|
95
|
+
* @param docs
|
|
96
|
+
* @param pred
|
|
97
|
+
* @param vocab
|
|
98
|
+
* @param fnIDF
|
|
99
|
+
*/
|
|
100
|
+
export declare const filterDocsIDF: (docs: string[][], pred: Fn2<string, number, boolean>, vocab?: Vocab, fnIDF?: Fn2<Vocab, string[][], Map<string, number>>) => string[][];
|
|
101
|
+
//# sourceMappingURL=tf-idf.d.ts.map
|
package/tf-idf.js
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { transduce } from "@thi.ng/transducers/transduce";
|
|
2
|
+
import { frequencies, normFrequencies } from "./frequencies.js";
|
|
3
|
+
import { vocabOnly } from "./xform.js";
|
|
4
|
+
import { defVocab } from "./vocab.js";
|
|
5
|
+
const { log10 } = Math;
|
|
6
|
+
/**
 * Runs `docTokens` through the `vocabOnly(vocab)` transducer and reduces the
 * remaining tokens with `frequencies()`.
 *
 * @param vocab - vocabulary handed to `vocabOnly`
 * @param docTokens - tokens of a single document
 */
const tfCount = (vocab, docTokens) => {
  const knownOnly = vocabOnly(vocab);
  return transduce(knownOnly, frequencies(), docTokens);
};
|
|
7
|
+
/**
 * Same pipeline as `tfCount`, but reduces with `normFrequencies()` instead of
 * `frequencies()`.
 *
 * @param vocab - vocabulary handed to `vocabOnly`
 * @param docTokens - tokens of a single document
 */
const tfNormalized = (vocab, docTokens) => {
  const knownOnly = vocabOnly(vocab);
  return transduce(knownOnly, normFrequencies(), docTokens);
};
|
|
8
|
+
/**
 * Computes token frequencies like `tfCount`, then replaces every value in the
 * result with `log10(1 + value)`.
 *
 * @param vocab - vocabulary handed to `vocabOnly`
 * @param docTokens - tokens of a single document
 * @returns the (in-place updated) frequency map
 */
const tfLog = (vocab, docTokens) => {
  const counts = transduce(vocabOnly(vocab), frequencies(), docTokens);
  // only existing keys are overwritten, so updating while iterating is safe
  counts.forEach((count, word) => counts.set(word, log10(1 + count)));
  return counts;
};
|
|
13
|
+
const defIDF = (fnIDF) => (vocab, tokenizedDocs) => {
|
|
14
|
+
const acc = /* @__PURE__ */ new Map();
|
|
15
|
+
for (const word of vocab.keys()) {
|
|
16
|
+
let n = 0;
|
|
17
|
+
for (const doc of tokenizedDocs) {
|
|
18
|
+
if (doc.includes(word)) n++;
|
|
19
|
+
}
|
|
20
|
+
acc.set(word, fnIDF(n, tokenizedDocs.length));
|
|
21
|
+
}
|
|
22
|
+
return acc;
|
|
23
|
+
};
|
|
24
|
+
/**
 * IDF function built via `defIDF`: `log10(numDocs / count)`.
 *
 * A word contained in no document (`count` = 0) leads to a division by zero,
 * i.e. a non-finite result.
 */
const idfClassic = defIDF((count, numDocs) => {
  const ratio = numDocs / count;
  return log10(ratio);
});
|
|
27
|
+
/**
 * IDF function built via `defIDF`: `1 + log10(numDocs / (1 + count))`.
 * The `1 + count` denominator avoids the division by zero for words which
 * occur in no document.
 */
const idfSmooth = defIDF((count, numDocs) => {
  const ratio = numDocs / (1 + count);
  return 1 + log10(ratio);
});
|
|
30
|
+
/**
 * IDF function built via `defIDF`: `log10((numDocs - count) / count)`.
 *
 * The result is not finite if `count` is 0 (division by zero) or if the word
 * occurs in every document (`log10(0)`).
 */
const idfProbabilistic = defIDF((count, numDocs) => {
  const absent = numDocs - count;
  return log10(absent / count);
});
|
|
33
|
+
const defTFIDF = (fnTF, fnIDF) => (vocab, tokenizedDocs) => {
|
|
34
|
+
const idf = fnIDF(vocab, tokenizedDocs);
|
|
35
|
+
return tokenizedDocs.map((doc) => {
|
|
36
|
+
const tf = fnTF(vocab, doc);
|
|
37
|
+
const acc = /* @__PURE__ */ new Map();
|
|
38
|
+
for (const [word, f] of tf) {
|
|
39
|
+
acc.set(word, f * idf.get(word));
|
|
40
|
+
}
|
|
41
|
+
return { doc, tf, idf, tfidf: acc };
|
|
42
|
+
});
|
|
43
|
+
};
|
|
44
|
+
const tfidf = defTFIDF(tfNormalized, idfClassic);
|
|
45
|
+
/**
 * Filters the words of each document using their IDF.
 *
 * If no `vocab` is given, one is created via `defVocab(docs)`. The IDF map is
 * computed once via `fnIDF(vocab, docs)`. A word is kept only if
 * `vocab.has(word)` and `pred(word, idf)` are both truthy.
 *
 * @param docs - tokenized documents
 * @param pred - predicate, called with a word and its IDF
 * @param vocab - optional vocabulary
 * @param fnIDF - IDF function (default: `idfClassic`)
 * @returns new array of filtered documents (same order as `docs`)
 */
const filterDocsIDF = (docs, pred, vocab, fnIDF = idfClassic) => {
  const index = vocab || defVocab(docs);
  const idf = fnIDF(index, docs);
  const keep = (word) => index.has(word) && pred(word, idf.get(word));
  return docs.map((doc) => doc.filter((word) => keep(word)));
};
|
|
52
|
+
export {
|
|
53
|
+
defIDF,
|
|
54
|
+
defTFIDF,
|
|
55
|
+
filterDocsIDF,
|
|
56
|
+
idfClassic,
|
|
57
|
+
idfProbabilistic,
|
|
58
|
+
idfSmooth,
|
|
59
|
+
tfCount,
|
|
60
|
+
tfLog,
|
|
61
|
+
tfNormalized,
|
|
62
|
+
tfidf
|
|
63
|
+
};
|
package/vec.d.ts
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import type { ReadonlyVec } from "@thi.ng/vectors";
|
|
2
|
+
import type { Vocab } from "./api.js";
|
|
3
|
+
/**
|
|
4
|
+
* Encodes the given `doc` tokens into a dense multi-hot vector using provided
|
|
5
|
+
* `vocab` (e.g. created via {@link defVocab}). The vector size is the number of
|
|
6
|
+
* items in the vocab.
|
|
7
|
+
*
|
|
8
|
+
* @remarks
|
|
9
|
+
* Also see {@link encodeSparse}.
|
|
10
|
+
*
|
|
11
|
+
* @example
|
|
12
|
+
* ```ts tangle:../export/encode-dense.ts
|
|
13
|
+
* import { defVocab, encodeDense, tokenize } from "@thi.ng/text-analysis";
|
|
14
|
+
*
|
|
15
|
+
* const vocab = defVocab(
|
|
16
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
17
|
+
* );
|
|
18
|
+
*
|
|
19
|
+
* console.log(encodeDense(vocab, tokenize("the brown dog jumps")));
|
|
20
|
+
* // [ 1, 0, 1, 0, 1, 0, 0, 1 ]
|
|
21
|
+
*
|
|
22
|
+
* console.log(encodeDense(vocab, tokenize("the lazy fox")));
|
|
23
|
+
* // [ 1, 0, 0, 1, 0, 0, 1, 0 ]
|
|
24
|
+
* ```
|
|
25
|
+
*
|
|
26
|
+
* @param vocab
|
|
27
|
+
* @param doc
|
|
28
|
+
*/
|
|
29
|
+
export declare const encodeDense: (vocab: Vocab, doc: Iterable<string>) => ReadonlyVec;
|
|
30
|
+
/**
|
|
31
|
+
* Convenience function to create a vocabulary from given docs and encode each
|
|
32
|
+
* doc into a dense multi-hot vector (using {@link encodeDense}).
|
|
33
|
+
*
|
|
34
|
+
* @param docs
|
|
35
|
+
*/
|
|
36
|
+
export declare const encodeAllDense: (docs: string[][]) => {
|
|
37
|
+
vocab: Vocab;
|
|
38
|
+
docs: ReadonlyVec[];
|
|
39
|
+
};
|
|
40
|
+
/**
|
|
41
|
+
* Encodes the given `src` tokens into a sparse vector using provided `vocab`
|
|
42
|
+
* (created via {@link defVocab}). Only the IDs of matched tokens are stored.
|
|
43
|
+
* The returned vector size depends on the number of used/matched tokens, at
|
|
44
|
+
* most `vocab.size` (if entire vocab is used by `src`).
|
|
45
|
+
*
|
|
46
|
+
* @remarks
|
|
47
|
+
* Also see {@link encodeDense} for alternative encoding.
|
|
48
|
+
*
|
|
49
|
+
* @example
|
|
50
|
+
* ```ts tangle:../export/encode-sparse.ts
|
|
51
|
+
* import { defVocab, encodeSparse, tokenize } from "@thi.ng/text-analysis";
|
|
52
|
+
*
|
|
53
|
+
* const vocab = defVocab(
|
|
54
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
55
|
+
* );
|
|
56
|
+
*
|
|
57
|
+
* console.log(encodeSparse(vocab, tokenize("the brown dog jumps")));
|
|
58
|
+
* // [ 0, 2, 4, 7 ]
|
|
59
|
+
*
|
|
60
|
+
* console.log(encodeSparse(vocab, tokenize("the lazy fox")));
|
|
61
|
+
* // [ 0, 3, 6 ]
|
|
62
|
+
* ```
|
|
63
|
+
*
|
|
64
|
+
* @param vocab
|
|
65
|
+
* @param src
|
|
66
|
+
*/
|
|
67
|
+
export declare const encodeSparse: (vocab: Vocab, src: Iterable<string>) => ReadonlyVec;
|
|
68
|
+
/**
|
|
69
|
+
* Convenience function to create a vocabulary from given docs and encode each
|
|
70
|
+
* doc into a sparse multi-hot vector (using {@link encodeSparse}).
|
|
71
|
+
*
|
|
72
|
+
* @param docs
|
|
73
|
+
*/
|
|
74
|
+
export declare const encodeAllSparse: (docs: string[][]) => {
|
|
75
|
+
vocab: Vocab;
|
|
76
|
+
docs: ReadonlyVec[];
|
|
77
|
+
};
|
|
78
|
+
/**
|
|
79
|
+
* Reverse op of {@link encodeDense}. Decodes dense multi-hot vector to extract
|
|
80
|
+
* tokens from provided `vocab` (created via {@link defVocab}). The returned
|
|
81
|
+
* array only contains the corresponding tokens of the vector's non-zero
|
|
82
|
+
* components.
|
|
83
|
+
*
|
|
84
|
+
* @remarks
|
|
85
|
+
* Also see {@link decodeSparse}.
|
|
86
|
+
*
|
|
87
|
+
* @example
|
|
88
|
+
* ```ts tangle:../export/decode-dense.ts
|
|
89
|
+
* import { defVocab, decodeDense, tokenize } from "@thi.ng/text-analysis";
|
|
90
|
+
*
|
|
91
|
+
* const vocab = defVocab(
|
|
92
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
93
|
+
* );
|
|
94
|
+
*
|
|
95
|
+
* console.log(decodeDense(vocab, [1, 0, 1, 0, 1, 0, 0, 1]));
|
|
96
|
+
* // [ "the", "brown", "jumps", "dog" ]
|
|
97
|
+
*
|
|
98
|
+
* console.log(decodeDense(vocab, [1, 0, 0, 1, 0, 0, 1, 0]));
|
|
99
|
+
* // [ "the", "fox", "lazy" ]
|
|
100
|
+
* ```
|
|
101
|
+
*
|
|
102
|
+
* @param vocab
|
|
103
|
+
* @param vec
|
|
104
|
+
* @param sort
|
|
105
|
+
*/
|
|
106
|
+
export declare const decodeDense: (vocab: Vocab, vec: Iterable<number>) => string[];
|
|
107
|
+
/**
|
|
108
|
+
* Reverse op of {@link encodeSparse}. Decodes sparse vector (created via
|
|
109
|
+
* {@link encodeSparse}) to extract tokens from provided `vocab` (created via
|
|
110
|
+
* {@link defVocab}).
|
|
111
|
+
*
|
|
112
|
+
* @remarks
|
|
113
|
+
* Also see {@link decodeDense}.
|
|
114
|
+
*
|
|
115
|
+
* @example
|
|
116
|
+
* ```ts tangle:../export/decode-sparse.ts
|
|
117
|
+
* import { defVocab, decodeSparse, tokenize } from "@thi.ng/text-analysis";
|
|
118
|
+
*
|
|
119
|
+
* const vocab = defVocab(
|
|
120
|
+
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
121
|
+
* );
|
|
122
|
+
*
|
|
123
|
+
* console.log(decodeSparse(vocab, [0, 2, 4, 7]));
|
|
124
|
+
* // [ "the", "brown", "jumps", "dog" ]
|
|
125
|
+
*
|
|
126
|
+
* console.log(decodeSparse(vocab, [0, 3, 6]));
|
|
127
|
+
* // [ "the", "fox", "lazy" ]
|
|
128
|
+
* ```
|
|
129
|
+
*
|
|
130
|
+
* @param vocab
|
|
131
|
+
* @param vec
|
|
132
|
+
* @param sort
|
|
133
|
+
*/
|
|
134
|
+
export declare const decodeSparse: (vocab: Vocab, vec: Iterable<number>) => string[];
|
|
135
|
+
/**
|
|
136
|
+
* Converts given multi-hot sparse vector (e.g. created via {@link encodeSparse})
|
|
137
|
+
* into a dense representation.
|
|
138
|
+
*
|
|
139
|
+
* @param dim
|
|
140
|
+
* @param sparse
|
|
141
|
+
*/
|
|
142
|
+
export declare const toDense: (dim: number, sparse: ReadonlyVec) => ReadonlyVec;
|
|
143
|
+
/**
|
|
144
|
+
* Converts given multi-hot dense vector (e.g. created via {@link encodeDense})
|
|
145
|
+
* into a sparse representation.
|
|
146
|
+
*
|
|
147
|
+
* @param dense
|
|
148
|
+
*/
|
|
149
|
+
export declare const toSparse: (dense: ReadonlyVec) => ReadonlyVec;
|
|
150
|
+
//# sourceMappingURL=vec.d.ts.map
|
package/vec.js
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { defVocab } from "./vocab.js";
|
|
2
|
+
/**
 * Encodes `doc` as dense vector: the IDs returned by `vocab.getAll(doc)` are
 * expanded via `toDense`, using `vocab.size` as vector size.
 *
 * @param vocab - vocabulary
 * @param doc - document tokens
 */
const encodeDense = (vocab, doc) => {
  const dim = vocab.size;
  const ids = vocab.getAll(doc);
  return toDense(dim, ids);
};
|
|
3
|
+
/**
 * Builds a vocabulary from all given docs (via `defVocab`) and encodes each
 * doc with `encodeDense`.
 *
 * @param docs - tokenized documents
 * @returns object with the created `vocab` and the encoded `docs`
 */
const encodeAllDense = (docs) => {
  const vocab = defVocab(docs);
  const encoded = docs.map((doc) => encodeDense(vocab, doc));
  return { vocab, docs: encoded };
};
|
|
10
|
+
const encodeSparse = (vocab, src) => [...vocab.getAllUnique(src)].sort((a, b) => a - b);
|
|
11
|
+
/**
 * Builds a vocabulary from all given docs (via `defVocab`) and encodes each
 * doc with `encodeSparse`.
 *
 * @param docs - tokenized documents
 * @returns object with the created `vocab` and the encoded `docs`
 */
const encodeAllSparse = (docs) => {
  const vocab = defVocab(docs);
  const encoded = docs.map((doc) => encodeSparse(vocab, doc));
  return { vocab, docs: encoded };
};
|
|
18
|
+
const decodeDense = (vocab, vec) => {
|
|
19
|
+
const res = [];
|
|
20
|
+
let i = 0;
|
|
21
|
+
for (const x of vec) {
|
|
22
|
+
if (x) res.push(vocab.getID(i));
|
|
23
|
+
i++;
|
|
24
|
+
}
|
|
25
|
+
return res;
|
|
26
|
+
};
|
|
27
|
+
const decodeSparse = (vocab, vec) => vocab.getAllIDs(vec);
|
|
28
|
+
const toDense = (dim, sparse) => {
|
|
29
|
+
const res = new Array(dim).fill(0);
|
|
30
|
+
for (const i of sparse) res[i] = 1;
|
|
31
|
+
return res;
|
|
32
|
+
};
|
|
33
|
+
/**
 * Converts a dense vector into its sparse form: the ascending list of indices
 * whose components are truthy.
 *
 * @param dense - dense vector (array-like with `length`)
 */
const toSparse = (dense) => {
  const ids = [];
  const n = dense.length;
  let i = 0;
  while (i < n) {
    if (dense[i]) ids.push(i);
    i++;
  }
  return ids;
};
|
|
40
|
+
export {
|
|
41
|
+
decodeDense,
|
|
42
|
+
decodeSparse,
|
|
43
|
+
encodeAllDense,
|
|
44
|
+
encodeAllSparse,
|
|
45
|
+
encodeDense,
|
|
46
|
+
encodeSparse,
|
|
47
|
+
toDense,
|
|
48
|
+
toSparse
|
|
49
|
+
};
|
package/vocab.d.ts
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export type Vocab = BidirIndex<string>;
|
|
3
|
-
export type SerializedVocab = SerializedBidirIndex<string>;
|
|
1
|
+
import type { SerializedVocab, Vocab } from "./api.js";
|
|
4
2
|
/**
|
|
5
3
|
* Creates a bi-directional index storing unique tokens from given `src`
|
|
6
4
|
* iterable, optionally using custom `start` ID offset (default: 0). This index
|
|
7
|
-
* can then be used with {@link
|
|
5
|
+
* can then be used with {@link encodeDense}, {@link encodeSparse} and related
|
|
6
|
+
* functions.
|
|
8
7
|
*
|
|
9
8
|
* @remarks
|
|
10
9
|
* This function is syntax sugar for
|
|
@@ -43,7 +42,7 @@ export type SerializedVocab = SerializedBidirIndex<string>;
|
|
|
43
42
|
* @param src
|
|
44
43
|
* @param start
|
|
45
44
|
*/
|
|
46
|
-
export declare function defVocab(src: Iterable<string
|
|
45
|
+
export declare function defVocab(src: Iterable<string> | Iterable<string>[], start?: number): Vocab;
|
|
47
46
|
/**
|
|
48
47
|
* (Re)creates bi-directional vocab index from previous serialized state (e.g.
|
|
49
48
|
* via `vocab.toJSON()`).
|
|
@@ -51,116 +50,4 @@ export declare function defVocab(src: Iterable<string>, start?: number): Vocab;
|
|
|
51
50
|
* @param vocab
|
|
52
51
|
*/
|
|
53
52
|
export declare function defVocab(vocab: SerializedVocab): Vocab;
|
|
54
|
-
/**
|
|
55
|
-
* Encodes the given `src` tokens into a dense multi-hot vector using provided
|
|
56
|
-
* `vocab` (created via {@link defVocab}). The vector size is the number of
|
|
57
|
-
* items in the vocab.
|
|
58
|
-
*
|
|
59
|
-
* @remarks
|
|
60
|
-
* Also see {@link encodeSparse}.
|
|
61
|
-
*
|
|
62
|
-
* @example
|
|
63
|
-
* ```ts tangle:../export/encode-dense.ts
|
|
64
|
-
* import { defVocab, encodeDense, tokenize } from "@thi.ng/text-analysis";
|
|
65
|
-
*
|
|
66
|
-
* const vocab = defVocab(
|
|
67
|
-
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
68
|
-
* );
|
|
69
|
-
*
|
|
70
|
-
* console.log(encodeDense(vocab, tokenize("the brown dog jumps")));
|
|
71
|
-
* // [ 1, 0, 1, 0, 1, 0, 0, 1 ]
|
|
72
|
-
*
|
|
73
|
-
* console.log(encodeDense(vocab, tokenize("the lazy fox")));
|
|
74
|
-
* // [ 1, 0, 0, 1, 0, 0, 1, 0 ]
|
|
75
|
-
* ```
|
|
76
|
-
*
|
|
77
|
-
* @param vocab
|
|
78
|
-
* @param src
|
|
79
|
-
*/
|
|
80
|
-
export declare const encodeDense: (vocab: Vocab, src: Iterable<string>) => any[];
|
|
81
|
-
/**
|
|
82
|
-
* Encodes the given `src` tokens into a sparse vector using provided `vocab`
|
|
83
|
-
* (created via {@link defVocab}). Only the IDs of matched tokens are stored.
|
|
84
|
-
* The returned vector size depends on the number of used/matched tokens, at
|
|
85
|
-
* most `vocab.size` (if entire vocab is used by `src`).
|
|
86
|
-
*
|
|
87
|
-
* @remarks
|
|
88
|
-
* Also see {@link encodeDense} for alternative encoding.
|
|
89
|
-
*
|
|
90
|
-
* @example
|
|
91
|
-
* ```ts tangle:../export/encode-sparse.ts
|
|
92
|
-
* import { defVocab, encodeSparse, tokenize } from "@thi.ng/text-analysis";
|
|
93
|
-
*
|
|
94
|
-
* const vocab = defVocab(
|
|
95
|
-
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
96
|
-
* );
|
|
97
|
-
*
|
|
98
|
-
* console.log(encodeSparse(vocab, tokenize("the brown dog jumps")));
|
|
99
|
-
* // [ 0, 2, 4, 7 ]
|
|
100
|
-
*
|
|
101
|
-
* console.log(encodeSparse(vocab, tokenize("the lazy fox")));
|
|
102
|
-
* // [ 0, 3, 6 ]
|
|
103
|
-
* ```
|
|
104
|
-
*
|
|
105
|
-
* @param vocab
|
|
106
|
-
* @param src
|
|
107
|
-
*/
|
|
108
|
-
export declare const encodeSparse: (vocab: Vocab, src: Iterable<string>) => number[];
|
|
109
|
-
/**
|
|
110
|
-
* Reverse op of {@link encodeDense}. Decodes dense multi-hot vector to extract
|
|
111
|
-
* tokens from provided `vocab` (created via {@link defVocab}). The returned
|
|
112
|
-
* array only contains the corresponding tokens of the vector's non-zero
|
|
113
|
-
* components.
|
|
114
|
-
*
|
|
115
|
-
* @remarks
|
|
116
|
-
* Also see {@link decodeSparse}.
|
|
117
|
-
*
|
|
118
|
-
* @example
|
|
119
|
-
* ```ts tangle:../export/decode-dense.ts
|
|
120
|
-
* import { defVocab, decodeDense, tokenize } from "@thi.ng/text-analysis";
|
|
121
|
-
*
|
|
122
|
-
* const vocab = defVocab(
|
|
123
|
-
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
124
|
-
* );
|
|
125
|
-
*
|
|
126
|
-
* console.log(decodeDense(vocab, [1, 0, 1, 0, 1, 0, 0, 1]));
|
|
127
|
-
* // [ "the", "brown", "jumps", "dog" ]
|
|
128
|
-
*
|
|
129
|
-
* console.log(decodeDense(vocab, [1, 0, 0, 1, 0, 0, 1, 0]));
|
|
130
|
-
* // [ "the", "fox", "lazy" ]
|
|
131
|
-
* ```
|
|
132
|
-
*
|
|
133
|
-
* @param vocab
|
|
134
|
-
* @param src
|
|
135
|
-
* @param sort
|
|
136
|
-
*/
|
|
137
|
-
export declare const decodeDense: (vocab: Vocab, vec: Iterable<number>) => string[];
|
|
138
|
-
/**
|
|
139
|
-
* Reverse op of {@link encodeSparse}. Decodes sparse vector (created via
|
|
140
|
-
* {@link encodeSparse} to extract tokens from provided `vocab` (created via
|
|
141
|
-
* {@link defVocab}).
|
|
142
|
-
*
|
|
143
|
-
* @remarks
|
|
144
|
-
* Also see {@link decodeDense}.
|
|
145
|
-
*
|
|
146
|
-
* @example
|
|
147
|
-
* ```ts tangle:../export/decode-sparse.ts
|
|
148
|
-
* import { defVocab, decodeSparse, tokenize } from "@thi.ng/text-analysis";
|
|
149
|
-
*
|
|
150
|
-
* const vocab = defVocab(
|
|
151
|
-
* tokenize("the quick brown fox jumps over the lazy dog")
|
|
152
|
-
* );
|
|
153
|
-
*
|
|
154
|
-
* console.log(decodeSparse(vocab, [0, 2, 4, 7]));
|
|
155
|
-
* // [ "the", "brown", "jumps", "dog" ]
|
|
156
|
-
*
|
|
157
|
-
* console.log(decodeSparse(vocab, [0, 3, 6]));
|
|
158
|
-
* // [ "the", "fox", "lazy" ]
|
|
159
|
-
* ```
|
|
160
|
-
*
|
|
161
|
-
* @param vocab
|
|
162
|
-
* @param src
|
|
163
|
-
* @param sort
|
|
164
|
-
*/
|
|
165
|
-
export declare const decodeSparse: (vocab: Vocab, vec: Iterable<number>) => string[];
|
|
166
53
|
//# sourceMappingURL=vocab.d.ts.map
|
package/vocab.js
CHANGED
|
@@ -1,31 +1,13 @@
|
|
|
1
|
-
import {
|
|
2
|
-
bidirIndexFromJSON,
|
|
3
|
-
defBidirIndex
|
|
4
|
-
} from "@thi.ng/bidir-index";
|
|
1
|
+
import { bidirIndexFromJSON, defBidirIndex } from "@thi.ng/bidir-index";
|
|
5
2
|
import { isIterable } from "@thi.ng/checks/is-iterable";
|
|
3
|
+
import { isString } from "@thi.ng/checks/is-string";
|
|
4
|
+
import { mapcat } from "@thi.ng/transducers/mapcat";
|
|
6
5
|
function defVocab(src, start) {
|
|
7
|
-
return isIterable(src) ? defBidirIndex(
|
|
6
|
+
return isIterable(src) ? defBidirIndex(
|
|
7
|
+
mapcat((x) => isString(x) ? [x] : x, src),
|
|
8
|
+
{ start }
|
|
9
|
+
) : bidirIndexFromJSON(src);
|
|
8
10
|
}
|
|
9
|
-
const encodeDense = (vocab, src) => {
|
|
10
|
-
const vec = new Array(vocab.size).fill(0);
|
|
11
|
-
for (let i of vocab.getAll(src)) vec[i] = 1;
|
|
12
|
-
return vec;
|
|
13
|
-
};
|
|
14
|
-
const encodeSparse = (vocab, src) => [...vocab.getAllUnique(src)].sort((a, b) => a - b);
|
|
15
|
-
const decodeDense = (vocab, vec) => {
|
|
16
|
-
const res = [];
|
|
17
|
-
let i = 0;
|
|
18
|
-
for (let x of vec) {
|
|
19
|
-
if (x) res.push(vocab.getID(i));
|
|
20
|
-
i++;
|
|
21
|
-
}
|
|
22
|
-
return res;
|
|
23
|
-
};
|
|
24
|
-
const decodeSparse = (vocab, vec) => vocab.getAllIDs(vec);
|
|
25
11
|
export {
|
|
26
|
-
decodeDense,
|
|
27
|
-
decodeSparse,
|
|
28
|
-
defVocab,
|
|
29
|
-
encodeDense,
|
|
30
|
-
encodeSparse
|
|
12
|
+
defVocab
|
|
31
13
|
};
|
package/xform.d.ts
CHANGED