@kt3k/tku 1.0.9 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.js +54 -26
- package/dist/tokenize-worker.js +30 -0
- package/package.json +3 -3
package/dist/main.js
CHANGED
|
@@ -75,35 +75,61 @@ async function listTextFiles(repoPath, options = {}) {
|
|
|
75
75
|
}
|
|
76
76
|
|
|
77
77
|
// src/tokenize.ts
|
|
78
|
-
import { readFile } from "node:fs/promises";
|
|
79
78
|
import { resolve as resolve2 } from "node:path";
|
|
79
|
+
import { availableParallelism } from "node:os";
|
|
80
|
+
import { Worker } from "node:worker_threads";
|
|
80
81
|
import { get_encoding } from "tiktoken";
|
|
82
|
+
var workerExt = import.meta.url.endsWith(".ts") ? ".ts" : ".js";
|
|
83
|
+
var workerUrl = new URL(`./tokenize-worker${workerExt}`, import.meta.url);
|
|
81
84
|
async function tokenizeFiles(repoPath, files, encoding, options = {}) {
|
|
82
|
-
const enc = get_encoding(encoding);
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
const fullPath = resolve2(repoPath, file);
|
|
90
|
-
const content = await readFile(fullPath, "utf-8");
|
|
91
|
-
const tokens = enc.encode_ordinary(content).length;
|
|
92
|
-
results.push({
|
|
93
|
-
path: file,
|
|
94
|
-
tokens
|
|
95
|
-
});
|
|
96
|
-
totalTokens += tokens;
|
|
97
|
-
}
|
|
98
|
-
return {
|
|
99
|
-
encoding,
|
|
100
|
-
files: results,
|
|
101
|
-
totalTokens,
|
|
102
|
-
totalFiles: results.length
|
|
103
|
-
};
|
|
104
|
-
} finally {
|
|
105
|
-
enc.free();
|
|
85
|
+
const resolvedRepoPath = resolve2(repoPath);
|
|
86
|
+
const numWorkers = Math.min(availableParallelism(), files.length || 1);
|
|
87
|
+
const chunks = Array.from({
|
|
88
|
+
length: numWorkers
|
|
89
|
+
}, () => []);
|
|
90
|
+
for (let i = 0; i < files.length; i++) {
|
|
91
|
+
chunks[i % numWorkers].push(files[i]);
|
|
106
92
|
}
|
|
93
|
+
let progressCount = 0;
|
|
94
|
+
const total = files.length;
|
|
95
|
+
const workerPromises = chunks.map((chunk) => {
|
|
96
|
+
if (chunk.length === 0) return Promise.resolve([]);
|
|
97
|
+
return new Promise((resolvePromise, reject) => {
|
|
98
|
+
const worker = new Worker(workerUrl);
|
|
99
|
+
worker.on("message", (msg) => {
|
|
100
|
+
if (msg.type === "progress") {
|
|
101
|
+
progressCount++;
|
|
102
|
+
options.onProgress?.(msg.file, progressCount, total);
|
|
103
|
+
} else if (msg.type === "done") {
|
|
104
|
+
resolvePromise(msg.results);
|
|
105
|
+
worker.terminate();
|
|
106
|
+
}
|
|
107
|
+
});
|
|
108
|
+
worker.on("error", (err) => {
|
|
109
|
+
reject(err);
|
|
110
|
+
worker.terminate();
|
|
111
|
+
});
|
|
112
|
+
worker.postMessage({
|
|
113
|
+
repoPath: resolvedRepoPath,
|
|
114
|
+
files: chunk,
|
|
115
|
+
encoding
|
|
116
|
+
});
|
|
117
|
+
});
|
|
118
|
+
});
|
|
119
|
+
const chunkResults = await Promise.all(workerPromises);
|
|
120
|
+
const results = chunkResults.flat();
|
|
121
|
+
const orderMap = new Map(files.map((f, i) => [
|
|
122
|
+
f,
|
|
123
|
+
i
|
|
124
|
+
]));
|
|
125
|
+
results.sort((a, b) => orderMap.get(a.path) - orderMap.get(b.path));
|
|
126
|
+
const totalTokens = results.reduce((sum, f) => sum + f.tokens, 0);
|
|
127
|
+
return {
|
|
128
|
+
encoding,
|
|
129
|
+
files: results,
|
|
130
|
+
totalTokens,
|
|
131
|
+
totalFiles: results.length
|
|
132
|
+
};
|
|
107
133
|
}
|
|
108
134
|
|
|
109
135
|
// src/format.ts
|
|
@@ -214,7 +240,9 @@ async function main() {
|
|
|
214
240
|
const isTTY = process.stderr.isTTY;
|
|
215
241
|
function status(msg) {
|
|
216
242
|
if (isTTY) {
|
|
217
|
-
process.stderr.write(`\r\x1B[K${msg}`);
|
|
243
|
+
const cols = process.stderr.columns || 80;
|
|
244
|
+
const truncated = msg.length > cols ? msg.slice(0, cols) : msg;
|
|
245
|
+
process.stderr.write(`\r\x1B[K${truncated}`);
|
|
218
246
|
}
|
|
219
247
|
}
|
|
220
248
|
function clearStatus() {
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
// src/tokenize-worker.ts
|
|
2
|
+
import { parentPort } from "node:worker_threads";
|
|
3
|
+
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { resolve } from "node:path";
|
|
5
|
+
import { get_encoding } from "tiktoken";
|
|
6
|
+
parentPort.on("message", async (msg) => {
|
|
7
|
+
const enc = get_encoding(msg.encoding);
|
|
8
|
+
try {
|
|
9
|
+
const results = [];
|
|
10
|
+
for (const file of msg.files) {
|
|
11
|
+
const fullPath = resolve(msg.repoPath, file);
|
|
12
|
+
const content = await readFile(fullPath, "utf-8");
|
|
13
|
+
const tokens = enc.encode_ordinary(content).length;
|
|
14
|
+
results.push({
|
|
15
|
+
path: file,
|
|
16
|
+
tokens
|
|
17
|
+
});
|
|
18
|
+
parentPort.postMessage({
|
|
19
|
+
type: "progress",
|
|
20
|
+
file
|
|
21
|
+
});
|
|
22
|
+
}
|
|
23
|
+
parentPort.postMessage({
|
|
24
|
+
type: "done",
|
|
25
|
+
results
|
|
26
|
+
});
|
|
27
|
+
} finally {
|
|
28
|
+
enc.free();
|
|
29
|
+
}
|
|
30
|
+
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kt3k/tku",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.11",
|
|
4
4
|
"description": "",
|
|
5
5
|
"main": "dist/main.js",
|
|
6
6
|
"bin": {
|
|
@@ -10,8 +10,8 @@
|
|
|
10
10
|
"dist"
|
|
11
11
|
],
|
|
12
12
|
"scripts": {
|
|
13
|
-
"build": "deno bundle --external tiktoken --external picomatch -o dist/main.js src/main.ts && chmod +x dist/main.js",
|
|
14
|
-
"
|
|
13
|
+
"build": "deno bundle --external tiktoken --external picomatch -o dist/main.js src/main.ts && deno bundle --external tiktoken -o dist/tokenize-worker.js src/tokenize-worker.ts && chmod +x dist/main.js",
|
|
14
|
+
"prepublishOnly": "npm run build",
|
|
15
15
|
"test": "vitest run"
|
|
16
16
|
},
|
|
17
17
|
"keywords": [],
|