@thi.ng/text-analysis 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Change Log
2
2
 
3
- - **Last updated**: 2025-06-15T12:37:24Z
3
+ - **Last updated**: 2025-06-18T12:01:21Z
4
4
  - **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
5
5
 
6
6
  All notable changes to this project will be documented in this file.
@@ -11,6 +11,23 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
11
11
  **Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
12
12
  and/or version bumps of transitive dependencies.
13
13
 
14
+ ## [0.4.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.4.0) (2025-06-18)
15
+
16
+ #### 🚀 Features
17
+
18
+ - add `filterDocsFrequency()` ([6ac1f90](https://github.com/thi-ng/umbrella/commit/6ac1f90))
19
+
20
+ #### ⏱ Performance improvements
21
+
22
+ - minor update kmeansDense() ([ebd5618](https://github.com/thi-ng/umbrella/commit/ebd5618))
23
+ - internal use `lookupUnsafe()`
24
+
25
+ ### [0.3.1](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.1) (2025-06-15)
26
+
27
+ #### 🩹 Bug fixes
28
+
29
+ - update pkg exports ([ea72b9f](https://github.com/thi-ng/umbrella/commit/ea72b9f))
30
+
14
31
  ## [0.3.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.0) (2025-06-15)
15
32
 
16
33
  #### 🚀 Features
package/README.md CHANGED
@@ -19,6 +19,7 @@
19
19
  - [Installation](#installation)
20
20
  - [Dependencies](#dependencies)
21
21
  - [API](#api)
22
+ - [Code example](#code-example)
22
23
  - [Authors](#authors)
23
24
  - [License](#license)
24
25
 
@@ -58,7 +59,7 @@ For Node.js REPL:
58
59
  const ta = await import("@thi.ng/text-analysis");
59
60
  ```
60
61
 
61
- Package sizes (brotli'd, pre-treeshake): ESM: 3.30 KB
62
+ Package sizes (brotli'd, pre-treeshake): ESM: 3.37 KB
62
63
 
63
64
  ## Dependencies
64
65
 
@@ -78,7 +79,89 @@ Note: @thi.ng/api is in _most_ cases a type-only import (not used at runtime)
78
79
 
79
80
  [Generated API docs](https://docs.thi.ng/umbrella/text-analysis/)
80
81
 
81
- TODO
82
+ ### Code example
83
+
84
+ ```ts tangle:export/readme-1.ts
85
+ import { files, readJSON } from "@thi.ng/file-io";
86
+ import {
87
+ centralTerms,
88
+ encodeAllDense,
89
+ filterDocsIDF,
90
+ JACCARD_DIST_DENSE,
91
+ kmeansDense,
92
+ sortedFrequencies,
93
+ } from "@thi.ng/text-analysis";
94
+
95
+ // read package files of all ~210 umbrella libraries
96
+ const packages = [...files("packages", "package.json")].map((file) => {
97
+ const { name, keywords = [] } = readJSON(file);
98
+ return { id: name, tags: keywords };
99
+ });
100
+
101
+ // remove tags from each package which are too common and don't contribute
102
+ // meaningful information (using inverse document frequency)
103
+ const filteredTags = filterDocsIDF(
104
+ packages.map((x) => x.tags),
105
+ // filter predicate using arbitrary threshold
106
+ (_, idf) => idf > 1
107
+ );
108
+
109
+ // create an index of all remaining unique tags (vocab) and use this index to
110
+ // encode each package's tags as dense multi-hot vectors
111
+ const { vocab: allTags, docs: encodedPkgs } = encodeAllDense(filteredTags);
112
+
113
+ // show index/vocab size. all document vectors have this size/dimensionality
114
+ console.log("unique tags", allTags.size);
115
+ // unique tags 747
116
+
117
+ // show the top 10 tags used across all packages
118
+ console.log("top 10 tags:", centralTerms(allTags, 10, encodedPkgs));
119
+ // top 10 tags: [
120
+ // "iterator", "canvas", "typedarray", "hiccup", "tree",
121
+ // "graph", "parser", "codegen", "random", "vector"
122
+ // ]
123
+
124
+ // alternative approach (using a reducer) to extract top 10 tags with counts
125
+ console.log(
126
+ "sorted freq:",
127
+ sortedFrequencies(filteredTags.flat()).slice(0, 10)
128
+ );
129
+ // sorted freq: [
130
+ // ["iterator", 20], ["canvas", 20], ["typedarray", 19], ["tree", 18], ["hiccup", 18],
131
+ // ["graph", 17], ["parser", 16], ["codegen", 16], ["vector", 15], ["random", 15]
132
+ // ]
133
+
134
+ // cluster packages using k-means with Jaccard distance metric
135
+ const clusters = kmeansDense(20, encodedPkgs, { dist: JACCARD_DIST_DENSE });
136
+
137
+ // display cluster info
138
+ for (let { id, docs, items } of clusters) {
139
+ console.log(`cluster #${id} size: ${docs.length}`);
140
+ console.log(`top 5 tags:`, centralTerms(allTags, 5, docs));
141
+ console.log(`pkgs:`, items.map((i) => packages[i].id).join(", "));
142
+ }
143
+
144
+ // cluster #0 size: 10
145
+ // top 5 tags: [ "color", "image", "rgb", "palette", "css" ]
146
+ // pkgs: @thi.ng/blurhash, @thi.ng/color, @thi.ng/color-palettes, @thi.ng/hdiff, @thi.ng/imago,
147
+ // @thi.ng/meta-css, @thi.ng/pixel, @thi.ng/pixel-analysis, @thi.ng/pixel-dominant-colors
148
+ // @thi.ng/porter-duff
149
+ //
150
+ // cluster #1 size: 10
151
+ // top 5 tags: [ "vector", "simulation", "time", "physics", "interpolation" ]
152
+ // pkgs: @thi.ng/boids, @thi.ng/cellular, @thi.ng/dlogic, @thi.ng/dual-algebra,
153
+ // @thi.ng/pixel-flow, @thi.ng/text-analysis, @thi.ng/timestep, @thi.ng/vclock,
154
+ // @thi.ng/vectors, @thi.ng/wasm-api-schedule
155
+ //
156
+ // cluster #2 size: 19
157
+ // top 5 tags: [ "canvas", "shader", "webgl", "shader-ast", "codegen" ]
158
+ // pkgs: @thi.ng/canvas, @thi.ng/dl-asset, @thi.ng/hdom-canvas, @thi.ng/hiccup-css,
159
+ // @thi.ng/hiccup-html-parse, @thi.ng/imgui, @thi.ng/layout, @thi.ng/mime,
160
+ // @thi.ng/rdom-canvas, @thi.ng/scenegraph, @thi.ng/shader-ast, @thi.ng/shader-ast-glsl,
161
+ // @thi.ng/shader-ast-js, @thi.ng/shader-ast-optimize, @thi.ng/wasm-api-canvas,
162
+ // @thi.ng/wasm-api-webgl, @thi.ng/webgl, @thi.ng/webgl-msdf, @thi.ng/webgl-shadertoy
163
+ // ...
164
+ ```
82
165
 
83
166
  ## Authors
84
167
 
package/frequencies.d.ts CHANGED
@@ -1,3 +1,4 @@
1
+ import type { Fn, Fn2 } from "@thi.ng/api";
1
2
  import { frequencies as $freq } from "@thi.ng/transducers/frequencies";
2
3
  import { normFrequenciesAuto as $norm } from "@thi.ng/transducers/norm-frequencies-auto";
3
4
  import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequencies";
@@ -59,4 +60,43 @@ export declare const normFrequencies: typeof $norm;
59
60
  * ```
60
61
  */
61
62
  export declare const sortedFrequencies: typeof $sorted;
63
+ /**
64
+ * Takes an array of tokenized documents, a histogram function (`frequencies`)
65
+ * and a predicate function (`pred`). First computes the combined histogram of
66
+ * terms/words in all given docs using `frequencies`, then filters each document
67
+ * using the supplied predicate, which is called with a single word/token and its
68
+ * computed frequency. Only words are kept for which the predicate succeeds.
69
+ *
70
+ * @remarks
71
+ * See {@link frequencies} and {@link normFrequencies} for histogram creation.
72
+ *
73
+ * @example
74
+ * ```ts tangle:../export/filter-docs-frequency.ts
75
+ * import { filterDocsFrequency, frequencies } from "@thi.ng/text-analysis";
76
+ *
77
+ * const docs = [
78
+ * ["a", "b", "c"],
79
+ * ["a", "b", "d", "e"],
80
+ * ["b", "f", "g"],
81
+ * ["a", "b", "c", "f"],
82
+ * ["a", "g", "h"]
83
+ * ];
84
+ *
85
+ * // only keep words which occur more than once
86
+ * const filtered = filterDocsFrequency(docs, frequencies, (_, x) => x > 1);
87
+ *
88
+ * // show before & after
89
+ * for(let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]);
90
+ * // [ "a", "b", "c" ] => [ "a", "b", "c" ]
91
+ * // [ "a", "b", "d", "e" ] => [ "a", "b" ]
92
+ * // [ "b", "f", "g" ] => [ "b", "f", "g" ]
93
+ * // [ "a", "b", "c", "f" ] => [ "a", "b", "c", "f" ]
94
+ * // [ "a", "g", "h" ] => [ "a", "g" ]
95
+ * ```
96
+ *
97
+ * @param docs
98
+ * @param frequencies
99
+ * @param pred
100
+ */
101
+ export declare const filterDocsFrequency: (docs: string[][], frequencies: Fn<Iterable<string>, Map<string, number>>, pred: Fn2<string, number, boolean>) => string[][];
62
102
  //# sourceMappingURL=frequencies.d.ts.map
package/frequencies.js CHANGED
@@ -4,7 +4,17 @@ import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequen
4
4
  const frequencies = $freq;
5
5
  const normFrequencies = $norm;
6
6
  const sortedFrequencies = $sorted;
7
+ const filterDocsFrequency = (docs, frequencies2, pred) => {
8
+ const histogram = frequencies2(docs.flat());
9
+ return docs.map(
10
+ (doc) => doc.filter((word) => {
11
+ const freq = histogram.get(word);
12
+ return freq !== void 0 && pred(word, freq);
13
+ })
14
+ );
15
+ };
7
16
  export {
17
+ filterDocsFrequency,
8
18
  frequencies,
9
19
  normFrequencies,
10
20
  sortedFrequencies
package/index.d.ts CHANGED
@@ -1,3 +1,4 @@
1
+ export * from "./api.js";
1
2
  export * from "./cluster.js";
2
3
  export * from "./frequencies.js";
3
4
  export * from "./ngrams.js";
package/index.js CHANGED
@@ -1,3 +1,4 @@
1
+ export * from "./api.js";
1
2
  export * from "./cluster.js";
2
3
  export * from "./frequencies.js";
3
4
  export * from "./ngrams.js";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@thi.ng/text-analysis",
3
- "version": "0.3.0",
3
+ "version": "0.4.0",
4
4
  "description": "Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities",
5
5
  "type": "module",
6
6
  "module": "./index.js",
@@ -40,14 +40,14 @@
40
40
  },
41
41
  "dependencies": {
42
42
  "@thi.ng/api": "^8.11.29",
43
- "@thi.ng/arrays": "^2.12.0",
43
+ "@thi.ng/arrays": "^2.13.0",
44
44
  "@thi.ng/bidir-index": "^1.3.0",
45
45
  "@thi.ng/checks": "^3.7.9",
46
- "@thi.ng/distance": "^3.0.0",
47
- "@thi.ng/k-means": "^1.1.0",
48
- "@thi.ng/strings": "^3.9.14",
49
- "@thi.ng/transducers": "^9.4.1",
50
- "@thi.ng/vectors": "^8.3.0"
46
+ "@thi.ng/distance": "^3.0.1",
47
+ "@thi.ng/k-means": "^1.1.1",
48
+ "@thi.ng/strings": "^3.9.15",
49
+ "@thi.ng/transducers": "^9.4.2",
50
+ "@thi.ng/vectors": "^8.3.1"
51
51
  },
52
52
  "devDependencies": {
53
53
  "esbuild": "^0.25.5",
@@ -141,5 +141,5 @@
141
141
  "status": "alpha",
142
142
  "year": 2021
143
143
  },
144
- "gitHead": "4635a24acc2623894887ca31189fdffda87ff9d3\n"
144
+ "gitHead": "b076434a497b291ad33e81b1a15f6a71e2c82cc2\n"
145
145
  }
package/tf-idf.d.ts CHANGED
@@ -1,11 +1,56 @@
1
1
  import type { Fn2 } from "@thi.ng/api";
2
2
  import type { Vocab } from "./api.js";
3
+ /**
4
+ * TF weighting function for {@link defTFIDF}. Computes {@link frequencies} for
5
+ * given words/tokens (only includes those defined in `vocab`).
6
+ */
3
7
  export declare const tfCount: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
8
+ /**
9
+ * TF weighting function for {@link defTFIDF}. Computes {@link normFrequencies}
10
+ * for given words/tokens (only includes those defined in `vocab`).
11
+ */
4
12
  export declare const tfNormalized: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
13
+ /**
14
+ * TF weighting function for {@link defTFIDF}. First computes
15
+ * {@link frequencies} for given words/tokens (only includes those defined in
16
+ * `vocab`), then transforms each value via `log10(1 + count)`.
17
+ */
5
18
  export declare const tfLog: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
6
- export declare const defIDF: (fnIDF: (count: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
19
+ /**
20
+ * Higher order Inverse Document Frequency, using provided weighting strategy
21
+ * function.
22
+ *
23
+ * @remarks
24
+ * Also see {@link defTFIDF} for full tf-idf implementation.
25
+ *
26
+ * References:
27
+ *
28
+ * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
29
+ * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency
30
+ *
31
+ * Provided IDF impls for use with this function:
32
+ *
33
+ * - {@link idfClassic}
34
+ * - {@link idfSmooth}
35
+ * - {@link idfProbabilistic}
36
+ *
37
+ * @param fnIDF
38
+ */
39
+ export declare const defIDF: (fnIDF: (docsWithTerm: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
40
+ /**
41
+ * IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
42
+ * `log10(numDocs / docsWithTerm)`
43
+ */
7
44
  export declare const idfClassic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
45
+ /**
46
+ * IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
47
+ * `1 + log10(numDocs / (1 + docsWithTerm))`
48
+ */
8
49
  export declare const idfSmooth: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
50
+ /**
51
+ * IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
52
+ * `log10((numDocs - docsWithTerm) / docsWithTerm)`
53
+ */
9
54
  export declare const idfProbabilistic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
10
55
  /**
11
56
  * Higher-order customizable tf-idf implementation, using provided fns for term
@@ -43,7 +88,7 @@ export declare const defTFIDF: (fnTF: Fn2<Vocab, string[], Map<string, number>>,
43
88
  *
44
89
  * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
45
90
  *
46
- * Also see {@link defTFIDF}
91
+ * Also see {@link defTFIDF}, {@link defIDF}.
47
92
  *
48
93
  * @param vocab
49
94
  * @param tokenizedDocs
package/tf-idf.js CHANGED
@@ -13,22 +13,22 @@ const tfLog = (vocab, docTokens) => {
13
13
  const defIDF = (fnIDF) => (vocab, tokenizedDocs) => {
14
14
  const acc = /* @__PURE__ */ new Map();
15
15
  for (const word of vocab.keys()) {
16
- let n = 0;
16
+ let count = 0;
17
17
  for (const doc of tokenizedDocs) {
18
- if (doc.includes(word)) n++;
18
+ if (doc.includes(word)) count++;
19
19
  }
20
- acc.set(word, fnIDF(n, tokenizedDocs.length));
20
+ acc.set(word, fnIDF(count, tokenizedDocs.length));
21
21
  }
22
22
  return acc;
23
23
  };
24
24
  const idfClassic = defIDF(
25
- (count, numDocs) => Math.log10(numDocs / count)
25
+ (docsWithTerm, numDocs) => log10(numDocs / docsWithTerm)
26
26
  );
27
27
  const idfSmooth = defIDF(
28
- (count, numDocs) => 1 + log10(numDocs / (1 + count))
28
+ (docsWithTerm, numDocs) => 1 + log10(numDocs / (1 + docsWithTerm))
29
29
  );
30
30
  const idfProbabilistic = defIDF(
31
- (count, numDocs) => log10((numDocs - count) / count)
31
+ (docsWithTerm, numDocs) => log10((numDocs - docsWithTerm) / docsWithTerm)
32
32
  );
33
33
  const defTFIDF = (fnTF, fnIDF) => (vocab, tokenizedDocs) => {
34
34
  const idf = fnIDF(vocab, tokenizedDocs);