tokenfill 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin.d.ts +2 -0
- package/dist/bin.js +6 -0
- package/dist/cli.d.ts +9 -0
- package/dist/cli.js +65 -0
- package/dist/corpus/001-archaeoastronomy.md +479 -0
- package/dist/corpus/002-magnetohydrodynamics.md +475 -0
- package/dist/corpus/003-biosemiotics.md +483 -0
- package/dist/corpus/004-cryopedology.md +483 -0
- package/dist/corpus/005-geomicrobiology.md +479 -0
- package/dist/corpus/006-aeronomy.md +487 -0
- package/dist/corpus/007-paleoclimatology.md +479 -0
- package/dist/corpus/008-hydrogeophysics.md +479 -0
- package/dist/corpus/009-magnetostratigraphy.md +475 -0
- package/dist/corpus/010-isotope-hydrology.md +481 -0
- package/dist/corpus/011-speleothem-geochemistry.md +474 -0
- package/dist/corpus/012-astrobiogeochemistry.md +475 -0
- package/dist/corpus/013-neuroethology.md +483 -0
- package/dist/corpus/014-chronophysiology.md +483 -0
- package/dist/corpus/015-limnogeochemistry.md +475 -0
- package/dist/corpus/016-palynology.md +483 -0
- package/dist/corpus/017-volcanotectonics.md +473 -0
- package/dist/corpus/018-seismotectonics.md +473 -0
- package/dist/corpus/019-biogeomorphology.md +475 -0
- package/dist/corpus/020-geobiophysics.md +479 -0
- package/dist/corpus/021-phytolith-analysis.md +481 -0
- package/dist/corpus/022-archaeometallurgy.md +479 -0
- package/dist/corpus/023-paleomagnetism.md +479 -0
- package/dist/corpus/024-biocalorimetry.md +475 -0
- package/dist/corpus/025-atmospheric-chemiluminescence.md +473 -0
- package/dist/corpus/026-cryoseismology.md +479 -0
- package/dist/corpus/027-extremophile-radiobiology.md +475 -0
- package/dist/corpus/028-heliophysics.md +479 -0
- package/dist/corpus/029-astroparticle-geophysics.md +474 -0
- package/dist/corpus/030-glaciohydrology.md +479 -0
- package/dist/corpus/031-permafrost-microbiology.md +477 -0
- package/dist/corpus/032-ecoacoustics.md +479 -0
- package/dist/corpus/033-dendroclimatology.md +473 -0
- package/dist/corpus/034-ionospheric-tomography.md +477 -0
- package/dist/corpus/035-marine-geodesy.md +481 -0
- package/dist/corpus/036-sedimentary-ancient-dna.md +481 -0
- package/dist/corpus/037-myrmecochory-dynamics.md +474 -0
- package/dist/corpus/038-chemosensory-ecology.md +477 -0
- package/dist/corpus/039-spintronics-materials.md +479 -0
- package/dist/corpus/040-nanotoxicology.md +483 -0
- package/dist/corpus/041-cosmochemistry.md +483 -0
- package/dist/corpus/042-quaternary-geochronology.md +471 -0
- package/dist/corpus/043-biophotonics.md +479 -0
- package/dist/corpus/044-evolutionary-morphometrics.md +481 -0
- package/dist/corpus/045-cryovolcanology.md +475 -0
- package/dist/corpus/046-exoplanet-atmospheric-dynamics.md +479 -0
- package/dist/corpus/047-microbial-electrosynthesis.md +477 -0
- package/dist/corpus/048-paleoseismology.md +479 -0
- package/dist/corpus/049-actinide-geochemistry.md +477 -0
- package/dist/corpus/050-quantum-biology.md +489 -0
- package/dist/corpus.d.ts +2 -0
- package/dist/corpus.js +19 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +2 -0
- package/dist/tokenfill.d.ts +9 -0
- package/dist/tokenfill.js +34 -0
- package/dist/tokenizer.d.ts +14 -0
- package/dist/tokenizer.js +31 -0
- package/package.json +27 -0
package/dist/corpus.d.ts
ADDED
package/dist/corpus.js
ADDED
@@ -0,0 +1,19 @@
+import { readdirSync, readFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+export const CORPUS_ARTICLE_SEPARATOR = "\n\n";
+const corpusDirectoryPath = join(dirname(fileURLToPath(import.meta.url)), "corpus");
+function getCorpusFileNames() {
+    return readdirSync(corpusDirectoryPath, { withFileTypes: true })
+        .filter(entry => entry.isFile() && entry.name.endsWith(".md"))
+        .map(entry => entry.name)
+        .sort((left, right) => left.localeCompare(right));
+}
+function loadBuiltInCorpusArticles() {
+    const corpusFileNames = getCorpusFileNames();
+    if (corpusFileNames.length === 0) {
+        throw new Error(`No built-in corpus markdown files found in ${corpusDirectoryPath}`);
+    }
+    return corpusFileNames.map(fileName => readFileSync(join(corpusDirectoryPath, fileName), "utf8").trim());
+}
+export const BUILT_IN_CORPUS_ARTICLES = loadBuiltInCorpusArticles();
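Worth noting: this module reads all 50 bundled markdown files synchronously the moment it is imported, and the sorted file names make the article order deterministic. A small sketch for observing that import-time cost; the path below is hypothetical, and loading by file URL sidesteps the package `exports` map (shown later), which does not expose `dist/corpus.js`:

```ts
import { pathToFileURL } from "node:url";

// Hypothetical direct load of dist/corpus.js; a file URL bypasses the
// "exports" map, which only defines the root "." entry.
const corpusUrl = pathToFileURL("node_modules/tokenfill/dist/corpus.js").href;
const start = performance.now();
const { BUILT_IN_CORPUS_ARTICLES } = await import(corpusUrl);
console.log(`${BUILT_IN_CORPUS_ARTICLES.length} articles loaded in ${(performance.now() - start).toFixed(1)} ms`);
```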
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
package/dist/tokenfill.d.ts
ADDED
@@ -0,0 +1,9 @@
+import type { TiktokenEncoding } from "tiktoken";
+export interface TokenfillOptions {
+    encoding?: TiktokenEncoding;
+}
+export interface TokenfillResult {
+    text: string;
+    actualTokens: number;
+}
+export declare function tokenfill(tokenCount: number, options?: TokenfillOptions): TokenfillResult;
package/dist/tokenfill.js
ADDED
@@ -0,0 +1,34 @@
+import { BUILT_IN_CORPUS_ARTICLES, CORPUS_ARTICLE_SEPARATOR } from "./corpus.js";
+import { createTokenizer } from "./tokenizer.js";
+const builtInCorpusText = BUILT_IN_CORPUS_ARTICLES.join(CORPUS_ARTICLE_SEPARATOR);
+const corpusTokensByEncoding = new Map();
+function getCorpusTokens(encoding, encode) {
+    const cachedTokens = corpusTokensByEncoding.get(encoding);
+    if (cachedTokens) {
+        return cachedTokens;
+    }
+    const encodedCorpus = encode(builtInCorpusText);
+    corpusTokensByEncoding.set(encoding, encodedCorpus);
+    return encodedCorpus;
+}
+export function tokenfill(tokenCount, options = {}) {
+    if (!Number.isInteger(tokenCount) || tokenCount < 0) {
+        throw new TypeError(`tokenCount must be a non-negative integer, received ${tokenCount}`);
+    }
+    const tokenizer = createTokenizer({ encoding: options.encoding });
+    try {
+        const corpusTokens = getCorpusTokens(tokenizer.encoding, tokenizer.encode);
+        const maxCorpusTokens = corpusTokens.length;
+        if (tokenCount > maxCorpusTokens) {
+            throw new Error(`Requested token count ${tokenCount} exceeds built-in corpus size ${maxCorpusTokens} for encoding ${tokenizer.encoding}`);
+        }
+        const text = tokenizer.decode(corpusTokens.slice(0, tokenCount));
+        return {
+            text,
+            actualTokens: tokenCount
+        };
+    }
+    finally {
+        tokenizer.free();
+    }
+}
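For reference, a usage sketch of `tokenfill()` as imported from the package root; this assumes `dist/index.js` (listed above but not expanded in this diff) re-exports the function:

```ts
import { tokenfill } from "tokenfill";

// Request exactly 1,000 tokens of filler text in the default cl100k_base encoding.
const { text, actualTokens } = tokenfill(1000);
console.log(actualTokens); // 1000

// Requests larger than the bundled corpus throw instead of repeating text:
// tokenfill(10_000_000); // Error: Requested token count ... exceeds built-in corpus size ...
```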
package/dist/tokenizer.d.ts
ADDED
@@ -0,0 +1,14 @@
+import { type TiktokenEncoding } from "tiktoken";
+export declare const DEFAULT_ENCODING: TiktokenEncoding;
+export interface TokenizerOptions {
+    encoding?: TiktokenEncoding;
+}
+export interface Tokenizer {
+    readonly encoding: TiktokenEncoding;
+    encode(text: string): Uint32Array;
+    decode(tokens: Uint32Array | number[]): string;
+    count(text: string): number;
+    truncate(text: string, tokenCount: number): string;
+    free(): void;
+}
+export declare function createTokenizer(options?: TokenizerOptions): Tokenizer;
package/dist/tokenizer.js
ADDED
@@ -0,0 +1,31 @@
+import { get_encoding } from "tiktoken";
+export const DEFAULT_ENCODING = "cl100k_base";
+export function createTokenizer(options = {}) {
+    const encoding = options.encoding ?? DEFAULT_ENCODING;
+    const tokenizer = get_encoding(encoding);
+    const utf8Decoder = new TextDecoder();
+    const encode = (text) => tokenizer.encode(text);
+    const decode = (tokens) => {
+        const tokenArray = tokens instanceof Uint32Array ? tokens : Uint32Array.from(tokens);
+        return utf8Decoder.decode(tokenizer.decode(tokenArray));
+    };
+    const count = (text) => encode(text).length;
+    const truncate = (text, tokenCount) => {
+        if (tokenCount <= 0) {
+            return "";
+        }
+        const tokens = encode(text);
+        if (tokens.length <= tokenCount) {
+            return text;
+        }
+        return decode(tokens.slice(0, tokenCount));
+    };
+    return {
+        encoding,
+        encode,
+        decode,
+        count,
+        truncate,
+        free: () => tokenizer.free()
+    };
+}
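The `truncate()` helper above is the encode-slice-decode round trip at the core of the package. A minimal standalone sketch of that same round trip written directly against `tiktoken` (the sample string is arbitrary):

```ts
import { get_encoding } from "tiktoken";

const enc = get_encoding("cl100k_base");
const tokens = enc.encode("The quick brown fox jumps over the lazy dog.");
// Keep the first 5 tokens; decode() yields UTF-8 bytes, hence the TextDecoder.
const head = new TextDecoder().decode(enc.decode(tokens.slice(0, 5)));
console.log(tokens.length, JSON.stringify(head));
enc.free(); // the encoder wraps WASM memory and must be freed explicitly
```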
package/package.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "name": "tokenfill",
+  "version": "0.0.1",
+  "private": false,
+  "type": "module",
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "bin": {
+    "tokenfill": "dist/bin.js"
+  },
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "scripts": {
+    "build": "tsc && node ./scripts/copy-corpus.mjs"
+  },
+  "files": [
+    "dist"
+  ],
+  "dependencies": {
+    "commander": "^14.0.3",
+    "tiktoken": "^1.0.22"
+  }
+}
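One consequence of this manifest: the `exports` map defines only the root `"."` specifier, so under Node's ESM resolution the internal modules shown above cannot be imported by package subpath:

```ts
import { tokenfill } from "tokenfill"; // resolves via the "." export

// Deep imports are blocked by the exports map:
// import "tokenfill/dist/tokenizer.js"; // ERR_PACKAGE_PATH_NOT_EXPORTED
```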