@thi.ng/text-analysis 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -1
- package/README.md +85 -2
- package/frequencies.d.ts +40 -0
- package/frequencies.js +10 -0
- package/index.d.ts +1 -0
- package/index.js +1 -0
- package/package.json +8 -8
- package/tf-idf.d.ts +47 -2
- package/tf-idf.js +6 -6
package/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Change Log
|
|
2
2
|
|
|
3
|
-
- **Last updated**: 2025-06-
|
|
3
|
+
- **Last updated**: 2025-06-18T12:01:21Z
|
|
4
4
|
- **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
|
|
5
5
|
|
|
6
6
|
All notable changes to this project will be documented in this file.
|
|
@@ -11,6 +11,23 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
|
|
|
11
11
|
**Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
|
|
12
12
|
and/or version bumps of transitive dependencies.
|
|
13
13
|
|
|
14
|
+
## [0.4.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.4.0) (2025-06-18)
|
|
15
|
+
|
|
16
|
+
#### 🚀 Features
|
|
17
|
+
|
|
18
|
+
- add `filterDocsFrequency()` ([6ac1f90](https://github.com/thi-ng/umbrella/commit/6ac1f90))
|
|
19
|
+
|
|
20
|
+
#### ⏱ Performance improvements
|
|
21
|
+
|
|
22
|
+
- minor update kmeansDense() ([ebd5618](https://github.com/thi-ng/umbrella/commit/ebd5618))
|
|
23
|
+
- internal use `lookupUnsafe()`
|
|
24
|
+
|
|
25
|
+
### [0.3.1](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.1) (2025-06-15)
|
|
26
|
+
|
|
27
|
+
#### 🩹 Bug fixes
|
|
28
|
+
|
|
29
|
+
- update pkg exports ([ea72b9f](https://github.com/thi-ng/umbrella/commit/ea72b9f))
|
|
30
|
+
|
|
14
31
|
## [0.3.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.0) (2025-06-15)
|
|
15
32
|
|
|
16
33
|
#### 🚀 Features
|
package/README.md
CHANGED
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
- [Installation](#installation)
|
|
20
20
|
- [Dependencies](#dependencies)
|
|
21
21
|
- [API](#api)
|
|
22
|
+
- [Code example](#code-example)
|
|
22
23
|
- [Authors](#authors)
|
|
23
24
|
- [License](#license)
|
|
24
25
|
|
|
@@ -58,7 +59,7 @@ For Node.js REPL:
|
|
|
58
59
|
const ta = await import("@thi.ng/text-analysis");
|
|
59
60
|
```
|
|
60
61
|
|
|
61
|
-
Package sizes (brotli'd, pre-treeshake): ESM: 3.
|
|
62
|
+
Package sizes (brotli'd, pre-treeshake): ESM: 3.37 KB
|
|
62
63
|
|
|
63
64
|
## Dependencies
|
|
64
65
|
|
|
@@ -78,7 +79,89 @@ Note: @thi.ng/api is in _most_ cases a type-only import (not used at runtime)
|
|
|
78
79
|
|
|
79
80
|
[Generated API docs](https://docs.thi.ng/umbrella/text-analysis/)
|
|
80
81
|
|
|
81
|
-
|
|
82
|
+
### Code example
|
|
83
|
+
|
|
84
|
+
```ts tangle:export/readme-1.ts
|
|
85
|
+
import { files, readJSON } from "@thi.ng/file-io";
|
|
86
|
+
import {
|
|
87
|
+
centralTerms,
|
|
88
|
+
encodeAllDense,
|
|
89
|
+
filterDocsIDF,
|
|
90
|
+
JACCARD_DIST_DENSE,
|
|
91
|
+
kmeansDense,
|
|
92
|
+
sortedFrequencies,
|
|
93
|
+
} from "@thi.ng/text-analysis";
|
|
94
|
+
|
|
95
|
+
// read package files of all ~210 umbrella libraries
|
|
96
|
+
const packages = [...files("packages", "package.json")].map((file) => {
|
|
97
|
+
const { name, keywords = [] } = readJSON(file);
|
|
98
|
+
return { id: name, tags: keywords };
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
// remove tags from each package which are too common and don't contribute
|
|
102
|
+
// meaningful information (using inverse document frequency)
|
|
103
|
+
const filteredTags = filterDocsIDF(
|
|
104
|
+
packages.map((x) => x.tags),
|
|
105
|
+
// filter predicate using arbitrary threshold
|
|
106
|
+
(_, idf) => idf > 1
|
|
107
|
+
);
|
|
108
|
+
|
|
109
|
+
// create an index of all remaining unique tags (vocab) and use this index to
|
|
110
|
+
// encode each package's tags as dense multi-hot vectors
|
|
111
|
+
const { vocab: allTags, docs: encodedPkgs } = encodeAllDense(filteredTags);
|
|
112
|
+
|
|
113
|
+
// show index/vocab size. all document vectors have this size/dimensionality
|
|
114
|
+
console.log("unique tags", allTags.size);
|
|
115
|
+
// unique tags 747
|
|
116
|
+
|
|
117
|
+
// show the top 10 tags used across all packages
|
|
118
|
+
console.log("top 10 tags:", centralTerms(allTags, 10, encodedPkgs));
|
|
119
|
+
// top 10 tags: [
|
|
120
|
+
// "iterator", "canvas", "typedarray", "hiccup", "tree",
|
|
121
|
+
// "graph", "parser", "codegen", "random", "vector"
|
|
122
|
+
// ]
|
|
123
|
+
|
|
124
|
+
// alternative approach (using a reducer) to extract top 10 tags with counts
|
|
125
|
+
console.log(
|
|
126
|
+
"sorted freq:",
|
|
127
|
+
sortedFrequencies(filteredTags.flat()).slice(0, 10)
|
|
128
|
+
);
|
|
129
|
+
// sorted freq: [
|
|
130
|
+
// ["iterator", 20], ["canvas", 20], ["typedarray", 19], ["tree", 18], ["hiccup", 18],
|
|
131
|
+
// ["graph", 17], ["parser", 16], ["codegen", 16], ["vector", 15], ["random", 15]
|
|
132
|
+
// ]
|
|
133
|
+
|
|
134
|
+
// cluster packages using k-means with Jaccard distance metric
|
|
135
|
+
const clusters = kmeansDense(20, encodedPkgs, { dist: JACCARD_DIST_DENSE });
|
|
136
|
+
|
|
137
|
+
// display cluster info
|
|
138
|
+
for (let { id, docs, items } of clusters) {
|
|
139
|
+
console.log(`cluster #${id} size: ${docs.length}`);
|
|
140
|
+
console.log(`top 5 tags:`, centralTerms(allTags, 5, docs));
|
|
141
|
+
console.log(`pkgs:`, items.map((i) => packages[i].id).join(", "));
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// cluster #0 size: 10
|
|
145
|
+
// top 5 tags: [ "color", "image", "rgb", "palette", "css" ]
|
|
146
|
+
// pkgs: @thi.ng/blurhash, @thi.ng/color, @thi.ng/color-palettes, @thi.ng/hdiff, @thi.ng/imago,
|
|
147
|
+
//       @thi.ng/meta-css, @thi.ng/pixel, @thi.ng/pixel-analysis, @thi.ng/pixel-dominant-colors,
|
|
148
|
+
// @thi.ng/porter-duff
|
|
149
|
+
//
|
|
150
|
+
// cluster #1 size: 10
|
|
151
|
+
// top 5 tags: [ "vector", "simulation", "time", "physics", "interpolation" ]
|
|
152
|
+
// pkgs: @thi.ng/boids, @thi.ng/cellular, @thi.ng/dlogic, @thi.ng/dual-algebra,
|
|
153
|
+
// @thi.ng/pixel-flow, @thi.ng/text-analysis, @thi.ng/timestep, @thi.ng/vclock,
|
|
154
|
+
// @thi.ng/vectors, @thi.ng/wasm-api-schedule
|
|
155
|
+
//
|
|
156
|
+
// cluster #2 size: 19
|
|
157
|
+
// top 5 tags: [ "canvas", "shader", "webgl", "shader-ast", "codegen" ]
|
|
158
|
+
// pkgs: @thi.ng/canvas, @thi.ng/dl-asset, @thi.ng/hdom-canvas, @thi.ng/hiccup-css,
|
|
159
|
+
// @thi.ng/hiccup-html-parse, @thi.ng/imgui, @thi.ng/layout, @thi.ng/mime,
|
|
160
|
+
// @thi.ng/rdom-canvas, @thi.ng/scenegraph, @thi.ng/shader-ast, @thi.ng/shader-ast-glsl,
|
|
161
|
+
// @thi.ng/shader-ast-js, @thi.ng/shader-ast-optimize, @thi.ng/wasm-api-canvas,
|
|
162
|
+
// @thi.ng/wasm-api-webgl, @thi.ng/webgl, @thi.ng/webgl-msdf, @thi.ng/webgl-shadertoy
|
|
163
|
+
// ...
|
|
164
|
+
```
|
|
82
165
|
|
|
83
166
|
## Authors
|
|
84
167
|
|
package/frequencies.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { Fn, Fn2 } from "@thi.ng/api";
|
|
1
2
|
import { frequencies as $freq } from "@thi.ng/transducers/frequencies";
|
|
2
3
|
import { normFrequenciesAuto as $norm } from "@thi.ng/transducers/norm-frequencies-auto";
|
|
3
4
|
import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequencies";
|
|
@@ -59,4 +60,43 @@ export declare const normFrequencies: typeof $norm;
|
|
|
59
60
|
* ```
|
|
60
61
|
*/
|
|
61
62
|
export declare const sortedFrequencies: typeof $sorted;
|
|
63
|
+
/**
|
|
64
|
+
* Takes an array of tokenized documents, a histogram function (`frequencies`)
|
|
65
|
+
* and a predicate function (`pred`). First computes the combined histogram of
|
|
66
|
+
* terms/words in all given docs using `frequencies`, then filters each document
|
|
67
|
+
* using supplied predicate, which is called with a single word/token and its
|
|
68
|
+
* computed frequency. Only words are kept for which the predicate succeeds.
|
|
69
|
+
*
|
|
70
|
+
* @remarks
|
|
71
|
+
* See {@link frequencies} and {@link normFrequencies} for histogram creation.
|
|
72
|
+
*
|
|
73
|
+
* @example
|
|
74
|
+
* ```ts tangle:../export/filter-docs-frequency.ts
|
|
75
|
+
* import { filterDocsFrequency, frequencies } from "@thi.ng/text-analysis";
|
|
76
|
+
*
|
|
77
|
+
* const docs = [
|
|
78
|
+
* ["a", "b", "c"],
|
|
79
|
+
* ["a", "b", "d", "e"],
|
|
80
|
+
* ["b", "f", "g"],
|
|
81
|
+
* ["a", "b", "c", "f"],
|
|
82
|
+
* ["a", "g", "h"]
|
|
83
|
+
* ];
|
|
84
|
+
*
|
|
85
|
+
* // only keep words which occur more than once
|
|
86
|
+
* const filtered = filterDocsFrequency(docs, frequencies, (_, x) => x > 1);
|
|
87
|
+
*
|
|
88
|
+
* // show before & after
|
|
89
|
+
* for(let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]);
|
|
90
|
+
* // [ "a", "b", "c" ] => [ "a", "b", "c" ]
|
|
91
|
+
* // [ "a", "b", "d", "e" ] => [ "a", "b" ]
|
|
92
|
+
* // [ "b", "f", "g" ] => [ "b", "f", "g" ]
|
|
93
|
+
* // [ "a", "b", "c", "f" ] => [ "a", "b", "c", "f" ]
|
|
94
|
+
* // [ "a", "g", "h" ] => [ "a", "g" ]
|
|
95
|
+
* ```
|
|
96
|
+
*
|
|
97
|
+
* @param docs
|
|
98
|
+
* @param frequencies
|
|
99
|
+
* @param pred
|
|
100
|
+
*/
|
|
101
|
+
export declare const filterDocsFrequency: (docs: string[][], frequencies: Fn<Iterable<string>, Map<string, number>>, pred: Fn2<string, number, boolean>) => string[][];
|
|
62
102
|
//# sourceMappingURL=frequencies.d.ts.map
|
package/frequencies.js
CHANGED
|
@@ -4,7 +4,17 @@ import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequen
|
|
|
4
4
|
const frequencies = $freq;
|
|
5
5
|
const normFrequencies = $norm;
|
|
6
6
|
const sortedFrequencies = $sorted;
|
|
7
|
+
const filterDocsFrequency = (docs, frequencies2, pred) => {
|
|
8
|
+
const histogram = frequencies2(docs.flat());
|
|
9
|
+
return docs.map(
|
|
10
|
+
(doc) => doc.filter((word) => {
|
|
11
|
+
const freq = histogram.get(word);
|
|
12
|
+
return freq !== void 0 && pred(word, freq);
|
|
13
|
+
})
|
|
14
|
+
);
|
|
15
|
+
};
|
|
7
16
|
export {
|
|
17
|
+
filterDocsFrequency,
|
|
8
18
|
frequencies,
|
|
9
19
|
normFrequencies,
|
|
10
20
|
sortedFrequencies
|
package/index.d.ts
CHANGED
package/index.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@thi.ng/text-analysis",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"module": "./index.js",
|
|
@@ -40,14 +40,14 @@
|
|
|
40
40
|
},
|
|
41
41
|
"dependencies": {
|
|
42
42
|
"@thi.ng/api": "^8.11.29",
|
|
43
|
-
"@thi.ng/arrays": "^2.
|
|
43
|
+
"@thi.ng/arrays": "^2.13.0",
|
|
44
44
|
"@thi.ng/bidir-index": "^1.3.0",
|
|
45
45
|
"@thi.ng/checks": "^3.7.9",
|
|
46
|
-
"@thi.ng/distance": "^3.0.
|
|
47
|
-
"@thi.ng/k-means": "^1.1.
|
|
48
|
-
"@thi.ng/strings": "^3.9.
|
|
49
|
-
"@thi.ng/transducers": "^9.4.
|
|
50
|
-
"@thi.ng/vectors": "^8.3.
|
|
46
|
+
"@thi.ng/distance": "^3.0.1",
|
|
47
|
+
"@thi.ng/k-means": "^1.1.1",
|
|
48
|
+
"@thi.ng/strings": "^3.9.15",
|
|
49
|
+
"@thi.ng/transducers": "^9.4.2",
|
|
50
|
+
"@thi.ng/vectors": "^8.3.1"
|
|
51
51
|
},
|
|
52
52
|
"devDependencies": {
|
|
53
53
|
"esbuild": "^0.25.5",
|
|
@@ -141,5 +141,5 @@
|
|
|
141
141
|
"status": "alpha",
|
|
142
142
|
"year": 2021
|
|
143
143
|
},
|
|
144
|
-
"gitHead": "
|
|
144
|
+
"gitHead": "b076434a497b291ad33e81b1a15f6a71e2c82cc2\n"
|
|
145
145
|
}
|
package/tf-idf.d.ts
CHANGED
|
@@ -1,11 +1,56 @@
|
|
|
1
1
|
import type { Fn2 } from "@thi.ng/api";
|
|
2
2
|
import type { Vocab } from "./api.js";
|
|
3
|
+
/**
|
|
4
|
+
* TF weighting function for {@link defTFIDF}. Computes {@link frequencies} for
|
|
5
|
+
* given words/tokens (only includes those defined in `vocab`).
|
|
6
|
+
*/
|
|
3
7
|
export declare const tfCount: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
8
|
+
/**
|
|
9
|
+
* TF weighting function for {@link defTFIDF}. Computes {@link normFrequencies}
|
|
10
|
+
* for given words/tokens (only includes those defined in `vocab`).
|
|
11
|
+
*/
|
|
4
12
|
export declare const tfNormalized: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
13
|
+
/**
|
|
14
|
+
* TF weighting function for {@link defTFIDF}. First computes
|
|
15
|
+
* {@link frequencies} for given words/tokens (only includes those defined in
|
|
16
|
+
* `vocab`), then transforms each value via `log10(1 + count)`.
|
|
17
|
+
*/
|
|
5
18
|
export declare const tfLog: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
|
|
6
|
-
|
|
19
|
+
/**
|
|
20
|
+
* Higher-order Inverse Document Frequency, using provided weighting strategy
|
|
21
|
+
* function.
|
|
22
|
+
*
|
|
23
|
+
* @remarks
|
|
24
|
+
* Also see {@link defTFIDF} for full tf-idf implementation.
|
|
25
|
+
*
|
|
26
|
+
* References:
|
|
27
|
+
*
|
|
28
|
+
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
|
|
29
|
+
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency
|
|
30
|
+
*
|
|
31
|
+
* Provided IDF impls created via this function:
|
|
32
|
+
*
|
|
33
|
+
* - {@link idfClassic}
|
|
34
|
+
* - {@link idfSmooth}
|
|
35
|
+
* - {@link idfProbabilistic}
|
|
36
|
+
*
|
|
37
|
+
* @param fnIDF
|
|
38
|
+
*/
|
|
39
|
+
export declare const defIDF: (fnIDF: (docsWithTerm: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
40
|
+
/**
|
|
41
|
+
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
|
|
42
|
+
* `log10(numDocs / docsWithTerm)`
|
|
43
|
+
*/
|
|
7
44
|
export declare const idfClassic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
45
|
+
/**
|
|
46
|
+
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
|
|
47
|
+
* `1 + log10(numDocs / (1 + docsWithTerm))`
|
|
48
|
+
*/
|
|
8
49
|
export declare const idfSmooth: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
50
|
+
/**
|
|
51
|
+
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
|
|
52
|
+
* `log10((numDocs - docsWithTerm) / docsWithTerm)`
|
|
53
|
+
*/
|
|
9
54
|
export declare const idfProbabilistic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
|
|
10
55
|
/**
|
|
11
56
|
* Higher-order customizable tf-idf implementation, using provided fns for term
|
|
@@ -43,7 +88,7 @@ export declare const defTFIDF: (fnTF: Fn2<Vocab, string[], Map<string, number>>,
|
|
|
43
88
|
*
|
|
44
89
|
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
|
|
45
90
|
*
|
|
46
|
-
* Also see {@link defTFIDF}
|
|
91
|
+
* Also see {@link defTFIDF}, {@link defIDF}.
|
|
47
92
|
*
|
|
48
93
|
* @param vocab
|
|
49
94
|
* @param tokenizedDocs
|
package/tf-idf.js
CHANGED
|
@@ -13,22 +13,22 @@ const tfLog = (vocab, docTokens) => {
|
|
|
13
13
|
const defIDF = (fnIDF) => (vocab, tokenizedDocs) => {
|
|
14
14
|
const acc = /* @__PURE__ */ new Map();
|
|
15
15
|
for (const word of vocab.keys()) {
|
|
16
|
-
let
|
|
16
|
+
let count = 0;
|
|
17
17
|
for (const doc of tokenizedDocs) {
|
|
18
|
-
if (doc.includes(word))
|
|
18
|
+
if (doc.includes(word)) count++;
|
|
19
19
|
}
|
|
20
|
-
acc.set(word, fnIDF(
|
|
20
|
+
acc.set(word, fnIDF(count, tokenizedDocs.length));
|
|
21
21
|
}
|
|
22
22
|
return acc;
|
|
23
23
|
};
|
|
24
24
|
const idfClassic = defIDF(
|
|
25
|
-
(
|
|
25
|
+
(docsWithTerm, numDocs) => log10(numDocs / docsWithTerm)
|
|
26
26
|
);
|
|
27
27
|
const idfSmooth = defIDF(
|
|
28
|
-
(
|
|
28
|
+
(docsWithTerm, numDocs) => 1 + log10(numDocs / (1 + docsWithTerm))
|
|
29
29
|
);
|
|
30
30
|
const idfProbabilistic = defIDF(
|
|
31
|
-
(
|
|
31
|
+
(docsWithTerm, numDocs) => log10((numDocs - docsWithTerm) / docsWithTerm)
|
|
32
32
|
);
|
|
33
33
|
const defTFIDF = (fnTF, fnIDF) => (vocab, tokenizedDocs) => {
|
|
34
34
|
const idf = fnIDF(vocab, tokenizedDocs);
|