@thi.ng/text-analysis 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Change Log
2
2
 
3
- - **Last updated**: 2025-06-14T20:56:27Z
3
+ - **Last updated**: 2025-06-15T18:41:17Z
4
4
  - **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
5
5
 
6
6
  All notable changes to this project will be documented in this file.
@@ -11,6 +11,19 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
11
11
  **Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
12
12
  and/or version bumps of transitive dependencies.
13
13
 
14
+ ### [0.3.1](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.1) (2025-06-15)
15
+
16
+ #### 🩹 Bug fixes
17
+
18
+ - update pkg exports ([ea72b9f](https://github.com/thi-ng/umbrella/commit/ea72b9f))
19
+
20
+ ## [0.3.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.0) (2025-06-15)
21
+
22
+ #### 🚀 Features
23
+
24
+ - update kmeansDense ([d35b6bd](https://github.com/thi-ng/umbrella/commit/d35b6bd))
25
+ - update results to include original `docs` for each cluster
26
+
14
27
  ## [0.2.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.2.0) (2025-06-14)
15
28
 
16
29
  #### 🚀 Features
package/README.md CHANGED
@@ -19,6 +19,7 @@
19
19
  - [Installation](#installation)
20
20
  - [Dependencies](#dependencies)
21
21
  - [API](#api)
22
+ - [Code example](#code-example)
22
23
  - [Authors](#authors)
23
24
  - [License](#license)
24
25
 
@@ -58,7 +59,7 @@ For Node.js REPL:
58
59
  const ta = await import("@thi.ng/text-analysis");
59
60
  ```
60
61
 
61
- Package sizes (brotli'd, pre-treeshake): ESM: 3.30 KB
62
+ Package sizes (brotli'd, pre-treeshake): ESM: 3.33 KB
62
63
 
63
64
  ## Dependencies
64
65
 
@@ -78,7 +79,89 @@ Note: @thi.ng/api is in _most_ cases a type-only import (not used at runtime)
78
79
 
79
80
  [Generated API docs](https://docs.thi.ng/umbrella/text-analysis/)
80
81
 
81
- TODO
82
+ ### Code example
83
+
84
+ ```ts tangle:export/readme-1.ts
85
+ import { files, readJSON } from "@thi.ng/file-io";
86
+ import {
87
+ centralTerms,
88
+ encodeAllDense,
89
+ filterDocsIDF,
90
+ JACCARD_DIST_DENSE,
91
+ kmeansDense,
92
+ sortedFrequencies,
93
+ } from "@thi.ng/text-analysis";
94
+
95
+ // read package files of all ~210 umbrella libraries
96
+ const packages = [...files("packages", "package.json")].map((file) => {
97
+ const { name, keywords = [] } = readJSON(file);
98
+ return { id: name, tags: keywords };
99
+ });
100
+
101
+ // remove tags from each package which are too common and don't contribute
102
+ // meaningful information (using inverse document frequency)
103
+ const filteredTags = filterDocsIDF(
104
+ packages.map((x) => x.tags),
105
+ // filter predicate using arbitrary threshold
106
+ (_, idf) => idf > 1
107
+ );
108
+
109
+ // create an index of all remaining unique tags (vocab) and use this index to
110
+ // encode each package's tags as dense multi-hot vectors
111
+ const { vocab: allTags, docs: encodedPkgs } = encodeAllDense(filteredTags);
112
+
113
+ // show index/vocab size. all document vectors have this size/dimensionality
114
+ console.log("unique tags", allTags.size);
115
+ // unique tags 747
116
+
117
+ // show the top 10 tags used across all packages
118
+ console.log("top 10 tags:", centralTerms(allTags, 10, encodedPkgs));
119
+ // top 10 tags: [
120
+ // "iterator", "canvas", "typedarray", "hiccup", "tree",
121
+ // "graph", "parser", "codegen", "random", "vector"
122
+ // ]
123
+
124
+ // alternative approach (using a reducer) to extract top 10 tags with counts
125
+ console.log(
126
+ "sorted freq:",
127
+ sortedFrequencies(filteredTags.flat()).slice(0, 10)
128
+ );
129
+ // sorted freq: [
130
+ // ["iterator", 20], ["canvas", 20], ["typedarray", 19], ["tree", 18], ["hiccup", 18],
131
+ // ["graph", 17], ["parser", 16], ["codegen", 16], ["vector", 15], ["random", 15]
132
+ // ]
133
+
134
+ // cluster packages using k-means with Jaccard distance metric
135
+ const clusters = kmeansDense(20, encodedPkgs, { dist: JACCARD_DIST_DENSE });
136
+
137
+ // display cluster info
138
+ for (let { id, docs, items } of clusters) {
139
+ console.log(`cluster #${id} size: ${docs.length}`);
140
+ console.log(`top 5 tags:`, centralTerms(allTags, 5, docs));
141
+ console.log(`pkgs:`, items.map((i) => packages[i].id).join(", "));
142
+ }
143
+
144
+ // cluster #0 size: 10
145
+ // top 5 tags: [ "color", "image", "rgb", "palette", "css" ]
146
+ // pkgs: @thi.ng/blurhash, @thi.ng/color, @thi.ng/color-palettes, @thi.ng/hdiff, @thi.ng/imago,
147
+ // @thi.ng/meta-css, @thi.ng/pixel, @thi.ng/pixel-analysis, @thi.ng/pixel-dominant-colors,
148
+ // @thi.ng/porter-duff
149
+ //
150
+ // cluster #1 size: 10
151
+ // top 5 tags: [ "vector", "simulation", "time", "physics", "interpolation" ]
152
+ // pkgs: @thi.ng/boids, @thi.ng/cellular, @thi.ng/dlogic, @thi.ng/dual-algebra,
153
+ // @thi.ng/pixel-flow, @thi.ng/text-analysis, @thi.ng/timestep, @thi.ng/vclock,
154
+ // @thi.ng/vectors, @thi.ng/wasm-api-schedule
155
+ //
156
+ // cluster #2 size: 19
157
+ // top 5 tags: [ "canvas", "shader", "webgl", "shader-ast", "codegen" ]
158
+ // pkgs: @thi.ng/canvas, @thi.ng/dl-asset, @thi.ng/hdom-canvas, @thi.ng/hiccup-css,
159
+ // @thi.ng/hiccup-html-parse, @thi.ng/imgui, @thi.ng/layout, @thi.ng/mime,
160
+ // @thi.ng/rdom-canvas, @thi.ng/scenegraph, @thi.ng/shader-ast, @thi.ng/shader-ast-glsl,
161
+ // @thi.ng/shader-ast-js, @thi.ng/shader-ast-optimize, @thi.ng/wasm-api-canvas,
162
+ // @thi.ng/wasm-api-webgl, @thi.ng/webgl, @thi.ng/webgl-msdf, @thi.ng/webgl-shadertoy
163
+ // ...
164
+ ```
82
165
 
83
166
  ## Authors
84
167
 
package/cluster.d.ts CHANGED
@@ -18,7 +18,12 @@ export declare const JACCARD_DIST_DENSE: Untransformed<ReadonlyVec>;
18
18
  * @param docs
19
19
  * @param opts
20
20
  */
21
- export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partial<KMeansOpts>) => import("@thi.ng/k-means").Cluster[];
21
+ export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partial<KMeansOpts>) => {
22
+ docs: ReadonlyVec[];
23
+ id: number;
24
+ centroid: ReadonlyVec;
25
+ items: number[];
26
+ }[];
22
27
  /**
23
28
  * k-means clustering for sparse multi-hot vectors. First converts vectors into
24
29
  * dense versions (using {@link toDense}), then calls {@link kmeansDense} to
@@ -34,7 +39,12 @@ export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partia
34
39
  */
35
40
  export declare const kmeansSparse: (k: number, docs: ReadonlyVec[], opts: Partial<KMeansOpts> & {
36
41
  dim: number;
37
- }) => import("@thi.ng/k-means").Cluster[];
42
+ }) => {
43
+ docs: ReadonlyVec[];
44
+ id: number;
45
+ centroid: ReadonlyVec;
46
+ items: number[];
47
+ }[];
38
48
  export declare function clusterBounds(docs: ReadonlyVec[]): {
39
49
  centroid: ReadonlyVec;
40
50
  radius: number;
package/cluster.js CHANGED
@@ -11,7 +11,10 @@ import { distSq } from "@thi.ng/vectors/distsq";
11
11
  import { mean } from "@thi.ng/vectors/mean";
12
12
  import { toDense } from "./vec.js";
13
13
  const JACCARD_DIST_DENSE = new Untransformed(distJaccard);
14
- const kmeansDense = (k, docs, opts) => kmeans(k, docs, { maxIter: 100, ...opts });
14
+ const kmeansDense = (k, docs, opts) => kmeans(k, docs, { maxIter: 100, ...opts }).map((cluster) => ({
15
+ ...cluster,
16
+ docs: lookup(docs, cluster.items)
17
+ }));
15
18
  const kmeansSparse = (k, docs, opts) => kmeansDense(
16
19
  k,
17
20
  docs.map((x) => toDense(opts.dim, x)),
package/index.d.ts CHANGED
@@ -1,3 +1,4 @@
1
+ export * from "./api.js";
1
2
  export * from "./cluster.js";
2
3
  export * from "./frequencies.js";
3
4
  export * from "./ngrams.js";
package/index.js CHANGED
@@ -1,3 +1,4 @@
1
+ export * from "./api.js";
1
2
  export * from "./cluster.js";
2
3
  export * from "./frequencies.js";
3
4
  export * from "./ngrams.js";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@thi.ng/text-analysis",
3
- "version": "0.2.0",
3
+ "version": "0.3.1",
4
4
  "description": "Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities",
5
5
  "type": "module",
6
6
  "module": "./index.js",
@@ -141,5 +141,5 @@
141
141
  "status": "alpha",
142
142
  "year": 2021
143
143
  },
144
- "gitHead": "14e994e531d32053e948768998324d443436a542\n"
144
+ "gitHead": "2e3adc4a5b737e21da697d6e935150c0855844dc\n"
145
145
  }
package/tf-idf.d.ts CHANGED
@@ -1,11 +1,56 @@
1
1
  import type { Fn2 } from "@thi.ng/api";
2
2
  import type { Vocab } from "./api.js";
3
+ /**
4
+ * TF weighting function for {@link defTFIDF}. Computes {@link frequencies} for
5
+ * given words/tokens (only includes those defined in `vocab`).
6
+ */
3
7
  export declare const tfCount: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
8
+ /**
9
+ * TF weighting function for {@link defTFIDF}. Computes {@link normFrequencies}
10
+ * for given words/tokens (only includes those defined in `vocab`).
11
+ */
4
12
  export declare const tfNormalized: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
13
+ /**
14
+ * TF weighting function for {@link defTFIDF}. First computes
15
+ * {@link frequencies} for given words/tokens (only includes those defined in
16
+ * `vocab`), then transforms each value via `log10(1 + count)`.
17
+ */
5
18
  export declare const tfLog: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
6
- export declare const defIDF: (fnIDF: (count: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
19
+ /**
20
+ * Higher order Inverse Document Frequency, using provided weighting strategy
21
+ * function.
22
+ *
23
+ * @remarks
24
+ * Also see {@link defTFIDF} for full tf-idf implementation.
25
+ *
26
+ * References:
27
+ *
28
+ * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
29
+ * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency
30
+ *
31
+ * Provided IDF impls created via this function:
32
+ *
33
+ * - {@link idfClassic}
34
+ * - {@link idfSmooth}
35
+ * - {@link idfProbabilistic}
36
+ *
37
+ * @param fnIDF
38
+ */
39
+ export declare const defIDF: (fnIDF: (docsWithTerm: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
40
+ /**
41
+ * IDF function for {@link defTFIDF}, created via {@link defIDF}. Computes:
42
+ * `log10(numDocs / docsWithTerm)`
43
+ */
7
44
  export declare const idfClassic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
45
+ /**
46
+ * IDF function for {@link defTFIDF}, created via {@link defIDF}. Computes:
47
+ * `1 + log10(numDocs / (1 + docsWithTerm))`
48
+ */
8
49
  export declare const idfSmooth: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
50
+ /**
51
+ * IDF function for {@link defTFIDF}, created via {@link defIDF}. Computes:
52
+ * `log10((numDocs - docsWithTerm) / docsWithTerm)`
53
+ */
9
54
  export declare const idfProbabilistic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
10
55
  /**
11
56
  * Higher-order customizable tf-idf implementation, using provided fns for term
@@ -43,7 +88,7 @@ export declare const defTFIDF: (fnTF: Fn2<Vocab, string[], Map<string, number>>,
43
88
  *
44
89
  * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
45
90
  *
46
- * Also see {@link defTFIDF}
91
+ * Also see {@link defTFIDF}, {@link defIDF}.
47
92
  *
48
93
  * @param vocab
49
94
  * @param tokenizedDocs
package/tf-idf.js CHANGED
@@ -13,22 +13,22 @@ const tfLog = (vocab, docTokens) => {
13
13
  const defIDF = (fnIDF) => (vocab, tokenizedDocs) => {
14
14
  const acc = /* @__PURE__ */ new Map();
15
15
  for (const word of vocab.keys()) {
16
- let n = 0;
16
+ let count = 0;
17
17
  for (const doc of tokenizedDocs) {
18
- if (doc.includes(word)) n++;
18
+ if (doc.includes(word)) count++;
19
19
  }
20
- acc.set(word, fnIDF(n, tokenizedDocs.length));
20
+ acc.set(word, fnIDF(count, tokenizedDocs.length));
21
21
  }
22
22
  return acc;
23
23
  };
24
24
  const idfClassic = defIDF(
25
- (count, numDocs) => Math.log10(numDocs / count)
25
+ (docsWithTerm, numDocs) => log10(numDocs / docsWithTerm)
26
26
  );
27
27
  const idfSmooth = defIDF(
28
- (count, numDocs) => 1 + log10(numDocs / (1 + count))
28
+ (docsWithTerm, numDocs) => 1 + log10(numDocs / (1 + docsWithTerm))
29
29
  );
30
30
  const idfProbabilistic = defIDF(
31
- (count, numDocs) => log10((numDocs - count) / count)
31
+ (docsWithTerm, numDocs) => log10((numDocs - docsWithTerm) / docsWithTerm)
32
32
  );
33
33
  const defTFIDF = (fnTF, fnIDF) => (vocab, tokenizedDocs) => {
34
34
  const idf = fnIDF(vocab, tokenizedDocs);