@thi.ng/text-analysis 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -1
- package/README.md +85 -2
- package/cluster.d.ts +12 -2
- package/cluster.js +4 -1
- package/index.d.ts +1 -0
- package/index.js +1 -0
- package/package.json +2 -2
- package/tf-idf.d.ts +47 -2
- package/tf-idf.js +6 -6
package/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Change Log
|
|
2
2
|
|
|
3
|
-
- **Last updated**: 2025-06-
|
|
3
|
+
- **Last updated**: 2025-06-15T18:41:17Z
|
|
4
4
|
- **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
|
|
5
5
|
|
|
6
6
|
All notable changes to this project will be documented in this file.
|
|
@@ -11,6 +11,19 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
|
|
|
11
11
|
**Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
|
|
12
12
|
and/or version bumps of transitive dependencies.
|
|
13
13
|
|
|
14
|
+
### [0.3.1](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.1) (2025-06-15)
|
|
15
|
+
|
|
16
|
+
#### 🩹 Bug fixes
|
|
17
|
+
|
|
18
|
+
- update pkg exports ([ea72b9f](https://github.com/thi-ng/umbrella/commit/ea72b9f))
|
|
19
|
+
|
|
20
|
+
## [0.3.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.0) (2025-06-15)
|
|
21
|
+
|
|
22
|
+
#### 🚀 Features
|
|
23
|
+
|
|
24
|
+
- update kmeansDense ([d35b6bd](https://github.com/thi-ng/umbrella/commit/d35b6bd))
|
|
25
|
+
- update results to include original `docs` for each cluster
|
|
26
|
+
|
|
14
27
|
## [0.2.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.2.0) (2025-06-14)
|
|
15
28
|
|
|
16
29
|
#### 🚀 Features
|
package/README.md
CHANGED
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
- [Installation](#installation)
|
|
20
20
|
- [Dependencies](#dependencies)
|
|
21
21
|
- [API](#api)
|
|
22
|
+
- [Code example](#code-example)
|
|
22
23
|
- [Authors](#authors)
|
|
23
24
|
- [License](#license)
|
|
24
25
|
|
|
@@ -58,7 +59,7 @@ For Node.js REPL:
|
|
|
58
59
|
const ta = await import("@thi.ng/text-analysis");
|
|
59
60
|
```
|
|
60
61
|
|
|
61
|
-
Package sizes (brotli'd, pre-treeshake): ESM: 3.
|
|
62
|
+
Package sizes (brotli'd, pre-treeshake): ESM: 3.33 KB
|
|
62
63
|
|
|
63
64
|
## Dependencies
|
|
64
65
|
|
|
@@ -78,7 +79,89 @@ Note: @thi.ng/api is in _most_ cases a type-only import (not used at runtime)
|
|
|
78
79
|
|
|
79
80
|
[Generated API docs](https://docs.thi.ng/umbrella/text-analysis/)
|
|
80
81
|
|
|
81
|
-
|
|
82
|
+
### Code example
|
|
83
|
+
|
|
84
|
+
```ts tangle:export/readme-1.ts
|
|
85
|
+
import { files, readJSON } from "@thi.ng/file-io";
|
|
86
|
+
import {
|
|
87
|
+
centralTerms,
|
|
88
|
+
encodeAllDense,
|
|
89
|
+
filterDocsIDF,
|
|
90
|
+
JACCARD_DIST_DENSE,
|
|
91
|
+
kmeansDense,
|
|
92
|
+
sortedFrequencies,
|
|
93
|
+
} from "@thi.ng/text-analysis";
|
|
94
|
+
|
|
95
|
+
// read package files of all ~210 umbrella libraries
|
|
96
|
+
const packages = [...files("packages", "package.json")].map((file) => {
|
|
97
|
+
const { name, keywords = [] } = readJSON(file);
|
|
98
|
+
return { id: name, tags: keywords };
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
// remove tags from each package which are too common and don't contribute
|
|
102
|
+
// meaningful information (using inverse document frequency)
|
|
103
|
+
const filteredTags = filterDocsIDF(
|
|
104
|
+
packages.map((x) => x.tags),
|
|
105
|
+
// filter predicate using arbitrary threshold
|
|
106
|
+
(_, idf) => idf > 1
|
|
107
|
+
);
|
|
108
|
+
|
|
109
|
+
// create an index of all remaining unique tags (vocab) and use this index to
|
|
110
|
+
// encode each package's tags as dense multi-hot vectors
|
|
111
|
+
const { vocab: allTags, docs: encodedPkgs } = encodeAllDense(filteredTags);
|
|
112
|
+
|
|
113
|
+
// show index/vocab size. all document vectors have this size/dimensionality
|
|
114
|
+
console.log("unique tags", allTags.size);
|
|
115
|
+
// unique tags 747
|
|
116
|
+
|
|
117
|
+
// show the top 10 tags used across all packages
|
|
118
|
+
console.log("top 10 tags:", centralTerms(allTags, 10, encodedPkgs));
|
|
119
|
+
// top 10 tags: [
|
|
120
|
+
// "iterator", "canvas", "typedarray", "hiccup", "tree",
|
|
121
|
+
// "graph", "parser", "codegen", "random", "vector"
|
|
122
|
+
// ]
|
|
123
|
+
|
|
124
|
+
// alternative approach (using a reducer) to extract top 10 tags with counts
|
|
125
|
+
console.log(
|
|
126
|
+
"sorted freq:",
|
|
127
|
+
sortedFrequencies(filteredTags.flat()).slice(0, 10)
|
|
128
|
+
);
|
|
129
|
+
// sorted freq: [
|
|
130
|
+
// ["iterator", 20], ["canvas", 20], ["typedarray", 19], ["tree", 18], ["hiccup", 18],
|
|
131
|
+
// ["graph", 17], ["parser", 16], ["codegen", 16], ["vector", 15], ["random", 15]
|
|
132
|
+
// ]
|
|
133
|
+
|
|
134
|
+
// cluster packages using k-means with Jaccard distance metric
|
|
135
|
+
const clusters = kmeansDense(20, encodedPkgs, { dist: JACCARD_DIST_DENSE });
|
|
136
|
+
|
|
137
|
+
// display cluster info
|
|
138
|
+
for (let { id, docs, items } of clusters) {
|
|
139
|
+
console.log(`cluster #${id} size: ${docs.length}`);
|
|
140
|
+
console.log(`top 5 tags:`, centralTerms(allTags, 5, docs));
|
|
141
|
+
console.log(`pkgs:`, items.map((i) => packages[i].id).join(", "));
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// cluster #0 size: 10
|
|
145
|
+
// top 5 tags: [ "color", "image", "rgb", "palette", "css" ]
|
|
146
|
+
// pkgs: @thi.ng/blurhash, @thi.ng/color, @thi.ng/color-palettes, @thi.ng/hdiff, @thi.ng/imago,
|
|
147
|
+
// @thi.ng/meta-css, @thi.ng/pixel, @thi.ng/pixel-analysis, @thi.ng/pixel-dominant-colors
|
|
148
|
+
// @thi.ng/porter-duff
|
|
149
|
+
//
|
|
150
|
+
// cluster #1 size: 10
|
|
151
|
+
// top 5 tags: [ "vector", "simulation", "time", "physics", "interpolation" ]
|
|
152
|
+
// pkgs: @thi.ng/boids, @thi.ng/cellular, @thi.ng/dlogic, @thi.ng/dual-algebra,
|
|
153
|
+
// @thi.ng/pixel-flow, @thi.ng/text-analysis, @thi.ng/timestep, @thi.ng/vclock,
|
|
154
|
+
// @thi.ng/vectors, @thi.ng/wasm-api-schedule
|
|
155
|
+
//
|
|
156
|
+
// cluster #2 size: 19
|
|
157
|
+
// top 5 tags: [ "canvas", "shader", "webgl", "shader-ast", "codegen" ]
|
|
158
|
+
// pkgs: @thi.ng/canvas, @thi.ng/dl-asset, @thi.ng/hdom-canvas, @thi.ng/hiccup-css,
|
|
159
|
+
// @thi.ng/hiccup-html-parse, @thi.ng/imgui, @thi.ng/layout, @thi.ng/mime,
|
|
160
|
+
// @thi.ng/rdom-canvas, @thi.ng/scenegraph, @thi.ng/shader-ast, @thi.ng/shader-ast-glsl,
|
|
161
|
+
// @thi.ng/shader-ast-js, @thi.ng/shader-ast-optimize, @thi.ng/wasm-api-canvas,
|
|
162
|
+
// @thi.ng/wasm-api-webgl, @thi.ng/webgl, @thi.ng/webgl-msdf, @thi.ng/webgl-shadertoy
|
|
163
|
+
// ...
|
|
164
|
+
```
|
|
82
165
|
|
|
83
166
|
## Authors
|
|
84
167
|
|
package/cluster.d.ts
CHANGED
|
@@ -18,7 +18,12 @@ export declare const JACCARD_DIST_DENSE: Untransformed<ReadonlyVec>;
|
|
|
18
18
|
* @param docs
|
|
19
19
|
* @param opts
|
|
20
20
|
*/
|
|
21
|
-
export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partial<KMeansOpts>) =>
|
|
21
|
+
export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partial<KMeansOpts>) => {
|
|
22
|
+
docs: ReadonlyVec[];
|
|
23
|
+
id: number;
|
|
24
|
+
centroid: ReadonlyVec;
|
|
25
|
+
items: number[];
|
|
26
|
+
}[];
|
|
22
27
|
/**
|
|
23
28
|
* k-means clustering for sparse multi-hot vectors. First converts vectors into
|
|
24
29
|
* dense versions (using {@link toDense}), then calls {@link kmeansDense} to
|
|
@@ -34,7 +39,12 @@ export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partia
|
|
|
34
39
|
*/
|
|
35
40
|
export declare const kmeansSparse: (k: number, docs: ReadonlyVec[], opts: Partial<KMeansOpts> & {
|
|
36
41
|
dim: number;
|
|
37
|
-
}) =>
|
|
42
|
+
}) => {
|
|
43
|
+
docs: ReadonlyVec[];
|
|
44
|
+
id: number;
|
|
45
|
+
centroid: ReadonlyVec;
|
|
46
|
+
items: number[];
|
|
47
|
+
}[];
|
|
38
48
|
export declare function clusterBounds(docs: ReadonlyVec[]): {
|
|
39
49
|
centroid: ReadonlyVec;
|
|
40
50
|
radius: number;
|
package/cluster.js
CHANGED
|
@@ -11,7 +11,10 @@ import { distSq } from "@thi.ng/vectors/distsq";
|
|
|
11
11
|
import { mean } from "@thi.ng/vectors/mean";
|
|
12
12
|
import { toDense } from "./vec.js";
|
|
13
13
|
const JACCARD_DIST_DENSE = new Untransformed(distJaccard);
|
|
14
|
-
const kmeansDense = (k, docs, opts) => kmeans(k, docs, { maxIter: 100, ...opts });
|
|
14
|
+
const kmeansDense = (k, docs, opts) => kmeans(k, docs, { maxIter: 100, ...opts }).map((cluster) => ({
|
|
15
|
+
...cluster,
|
|
16
|
+
docs: lookup(docs, cluster.items)
|
|
17
|
+
}));
|
|
15
18
|
const kmeansSparse = (k, docs, opts) => kmeansDense(
|
|
16
19
|
k,
|
|
17
20
|
docs.map((x) => toDense(opts.dim, x)),
|
package/index.d.ts
CHANGED
package/index.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@thi.ng/text-analysis",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"module": "./index.js",
|
|
@@ -141,5 +141,5 @@
|
|
|
141
141
|
"status": "alpha",
|
|
142
142
|
"year": 2021
|
|
143
143
|
},
|
|
144
|
-
"gitHead": "
|
|
144
|
+
"gitHead": "2e3adc4a5b737e21da697d6e935150c0855844dc\n"
|
|
145
145
|
}
|
package/tf-idf.d.ts
CHANGED
|
@@ -1,11 +1,56 @@
|
|
|
1
1
|
import type { Fn2 } from "@thi.ng/api";
|
|
2
2
|
import type { Vocab } from "./api.js";
|
|
3
|
+
/**
|
|
4
|
+
* TF weighting function for {@link defTFIDF}. Computes {@link frequencies} for
|
|
5
|
+
* given words/tokens (only includes those defined in `vocab`).
|
|
6
|
+
*/
|
|
3
7
|
export declare const tfCount: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
8
|
+
/**
|
|
9
|
+
* TF weighting function for {@link defTFIDF}. Computes {@link normFrequencies}
|
|
10
|
+
* for given words/tokens (only includes those defined in `vocab`).
|
|
11
|
+
*/
|
|
4
12
|
export declare const tfNormalized: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
13
|
+
/**
|
|
14
|
+
* TF weighting function for {@link defTFIDF}. First computes
|
|
15
|
+
* {@link frequencies} for given words/tokens (only includes those defined in
|
|
16
|
+
* `vocab`), then transforms each value via `log10(1 + count)`.
|
|
17
|
+
*/
|
|
5
18
|
export declare const tfLog: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
6
|
-
|
|
19
|
+
/**
|
|
20
|
+
* Higher order Inverse Document Frequency, using provided weighting strategy
|
|
21
|
+
* function.
|
|
22
|
+
*
|
|
23
|
+
* @remarks
|
|
24
|
+
* Also see {@link defTFIDF} for full tf-idf implementation.
|
|
25
|
+
*
|
|
26
|
+
* References:
|
|
27
|
+
*
|
|
28
|
+
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
|
|
29
|
+
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency
|
|
30
|
+
*
|
|
31
|
+
* Provided IDF impls for use with this function:
|
|
32
|
+
*
|
|
33
|
+
* - {@link idfClassic}
|
|
34
|
+
* - {@link idfSmooth}
|
|
35
|
+
* - {@link idfProbabilistic}
|
|
36
|
+
*
|
|
37
|
+
* @param fnIDF
|
|
38
|
+
*/
|
|
39
|
+
export declare const defIDF: (fnIDF: (docsWithTerm: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
40
|
+
/**
|
|
41
|
+
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
|
|
42
|
+
* `log10(numDocs / docsWithTerm)`
|
|
43
|
+
*/
|
|
7
44
|
export declare const idfClassic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
45
|
+
/**
|
|
46
|
+
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
|
|
47
|
+
* `1 + log10(numDocs / (1 + docsWithTerm))`
|
|
48
|
+
*/
|
|
8
49
|
export declare const idfSmooth: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
50
|
+
/**
|
|
51
|
+
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
|
|
52
|
+
* `log10((numDocs - docsWithTerm) / docsWithTerm)`
|
|
53
|
+
*/
|
|
9
54
|
export declare const idfProbabilistic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
10
55
|
/**
|
|
11
56
|
* Higher-order customizable tf-idf implementation, using provided fns for term
|
|
@@ -43,7 +88,7 @@ export declare const defTFIDF: (fnTF: Fn2<Vocab, string[], Map<string, number>>,
|
|
|
43
88
|
*
|
|
44
89
|
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
|
|
45
90
|
*
|
|
46
|
-
* Also see {@link defTFIDF}
|
|
91
|
+
* Also see {@link defTFIDF}, {@link defIDF}.
|
|
47
92
|
*
|
|
48
93
|
* @param vocab
|
|
49
94
|
* @param tokenizedDocs
|
package/tf-idf.js
CHANGED
|
@@ -13,22 +13,22 @@ const tfLog = (vocab, docTokens) => {
|
|
|
13
13
|
const defIDF = (fnIDF) => (vocab, tokenizedDocs) => {
|
|
14
14
|
const acc = /* @__PURE__ */ new Map();
|
|
15
15
|
for (const word of vocab.keys()) {
|
|
16
|
-
let
|
|
16
|
+
let count = 0;
|
|
17
17
|
for (const doc of tokenizedDocs) {
|
|
18
|
-
if (doc.includes(word))
|
|
18
|
+
if (doc.includes(word)) count++;
|
|
19
19
|
}
|
|
20
|
-
acc.set(word, fnIDF(
|
|
20
|
+
acc.set(word, fnIDF(count, tokenizedDocs.length));
|
|
21
21
|
}
|
|
22
22
|
return acc;
|
|
23
23
|
};
|
|
24
24
|
const idfClassic = defIDF(
|
|
25
|
-
(
|
|
25
|
+
(docsWithTerm, numDocs) => log10(numDocs / docsWithTerm)
|
|
26
26
|
);
|
|
27
27
|
const idfSmooth = defIDF(
|
|
28
|
-
(
|
|
28
|
+
(docsWithTerm, numDocs) => 1 + log10(numDocs / (1 + docsWithTerm))
|
|
29
29
|
);
|
|
30
30
|
const idfProbabilistic = defIDF(
|
|
31
|
-
(
|
|
31
|
+
(docsWithTerm, numDocs) => log10((numDocs - docsWithTerm) / docsWithTerm)
|
|
32
32
|
);
|
|
33
33
|
const defTFIDF = (fnTF, fnIDF) => (vocab, tokenizedDocs) => {
|
|
34
34
|
const idf = fnIDF(vocab, tokenizedDocs);
|