@thi.ng/text-analysis 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Change Log
2
2
 
3
- - **Last updated**: 2025-06-15T18:41:17Z
3
+ - **Last updated**: 2025-06-18T12:01:21Z
4
4
  - **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
5
5
 
6
6
  All notable changes to this project will be documented in this file.
@@ -11,6 +11,17 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
11
11
  **Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
12
12
  and/or version bumps of transitive dependencies.
13
13
 
14
+ ## [0.4.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.4.0) (2025-06-18)
15
+
16
+ #### 🚀 Features
17
+
18
+ - add `filterDocsFrequency()` ([6ac1f90](https://github.com/thi-ng/umbrella/commit/6ac1f90))
19
+
20
+ #### ⏱ Performance improvements
21
+
22
+ - minor update kmeansDense() ([ebd5618](https://github.com/thi-ng/umbrella/commit/ebd5618))
23
+ - internal use `lookupUnsafe()`
24
+
14
25
  ### [0.3.1](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.1) (2025-06-15)
15
26
 
16
27
  #### 🩹 Bug fixes
package/README.md CHANGED
@@ -59,7 +59,7 @@ For Node.js REPL:
59
59
  const ta = await import("@thi.ng/text-analysis");
60
60
  ```
61
61
 
62
- Package sizes (brotli'd, pre-treeshake): ESM: 3.33 KB
62
+ Package sizes (brotli'd, pre-treeshake): ESM: 3.37 KB
63
63
 
64
64
  ## Dependencies
65
65
 
package/frequencies.d.ts CHANGED
@@ -1,3 +1,4 @@
1
+ import type { Fn, Fn2 } from "@thi.ng/api";
1
2
  import { frequencies as $freq } from "@thi.ng/transducers/frequencies";
2
3
  import { normFrequenciesAuto as $norm } from "@thi.ng/transducers/norm-frequencies-auto";
3
4
  import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequencies";
@@ -59,4 +60,43 @@ export declare const normFrequencies: typeof $norm;
59
60
  * ```
60
61
  */
61
62
  export declare const sortedFrequencies: typeof $sorted;
63
+ /**
64
+ * Takes an array of tokenized documents, a histogram function (`frequencies`)
65
+ * and a predicate function (`pred`). First computes the combined histogram of
66
+ * terms/words in all given docs using `frequencies`, then filters each document
67
+ * using supplied predicate, which is called with a single word/token and its
68
+ * computed frequency. Only words are kept for which the predicate succeeds.
69
+ *
70
+ * @remarks
71
+ * See {@link frequencies} and {@link normFrequencies} for histogram creation.
72
+ *
73
+ * @example
74
+ * ```ts tangle:../export/filter-docs-frequency.ts
75
+ * import { filterDocsFrequency, frequencies } from "@thi.ng/text-analysis";
76
+ *
77
+ * const docs = [
78
+ * ["a", "b", "c"],
79
+ * ["a", "b", "d", "e"],
80
+ * ["b", "f", "g"],
81
+ * ["a", "b", "c", "f"],
82
+ * ["a", "g", "h"]
83
+ * ];
84
+ *
85
+ * // only keep words which occur more than once
86
+ * const filtered = filterDocsFrequency(docs, frequencies, (_, x) => x > 1);
87
+ *
88
+ * // show before & after
89
+ * for(let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]);
90
+ * // [ "a", "b", "c" ] => [ "a", "b", "c" ]
91
+ * // [ "a", "b", "d", "e" ] => [ "a", "b" ]
92
+ * // [ "b", "f", "g" ] => [ "b", "f", "g" ]
93
+ * // [ "a", "b", "c", "f" ] => [ "a", "b", "c", "f" ]
94
+ * // [ "a", "g", "h" ] => [ "a", "g" ]
95
+ * ```
96
+ *
97
+ * @param docs
98
+ * @param frequencies
99
+ * @param pred
100
+ */
101
+ export declare const filterDocsFrequency: (docs: string[][], frequencies: Fn<Iterable<string>, Map<string, number>>, pred: Fn2<string, number, boolean>) => string[][];
62
102
  //# sourceMappingURL=frequencies.d.ts.map
package/frequencies.js CHANGED
@@ -4,7 +4,17 @@ import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequen
4
4
  const frequencies = $freq;
5
5
  const normFrequencies = $norm;
6
6
  const sortedFrequencies = $sorted;
7
+ const filterDocsFrequency = (docs, frequencies2, pred) => {
8
+ const histogram = frequencies2(docs.flat());
9
+ return docs.map(
10
+ (doc) => doc.filter((word) => {
11
+ const freq = histogram.get(word);
12
+ return freq !== void 0 && pred(word, freq);
13
+ })
14
+ );
15
+ };
7
16
  export {
17
+ filterDocsFrequency,
8
18
  frequencies,
9
19
  normFrequencies,
10
20
  sortedFrequencies
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@thi.ng/text-analysis",
3
- "version": "0.3.1",
3
+ "version": "0.4.0",
4
4
  "description": "Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities",
5
5
  "type": "module",
6
6
  "module": "./index.js",
@@ -40,14 +40,14 @@
40
40
  },
41
41
  "dependencies": {
42
42
  "@thi.ng/api": "^8.11.29",
43
- "@thi.ng/arrays": "^2.12.0",
43
+ "@thi.ng/arrays": "^2.13.0",
44
44
  "@thi.ng/bidir-index": "^1.3.0",
45
45
  "@thi.ng/checks": "^3.7.9",
46
- "@thi.ng/distance": "^3.0.0",
47
- "@thi.ng/k-means": "^1.1.0",
48
- "@thi.ng/strings": "^3.9.14",
49
- "@thi.ng/transducers": "^9.4.1",
50
- "@thi.ng/vectors": "^8.3.0"
46
+ "@thi.ng/distance": "^3.0.1",
47
+ "@thi.ng/k-means": "^1.1.1",
48
+ "@thi.ng/strings": "^3.9.15",
49
+ "@thi.ng/transducers": "^9.4.2",
50
+ "@thi.ng/vectors": "^8.3.1"
51
51
  },
52
52
  "devDependencies": {
53
53
  "esbuild": "^0.25.5",
@@ -141,5 +141,5 @@
141
141
  "status": "alpha",
142
142
  "year": 2021
143
143
  },
144
- "gitHead": "2e3adc4a5b737e21da697d6e935150c0855844dc\n"
144
+ "gitHead": "b076434a497b291ad33e81b1a15f6a71e2c82cc2\n"
145
145
  }