count-tokens 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: count-tokens
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: count number of tokens in the text file using tiktoken tokenizer from OpenAI
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Krystian Safjan
|
|
7
|
+
Author-email: ksafjan@gmail.com
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Requires-Dist: tiktoken (>=0.4.0,<0.5.0)
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# Count tokens
|
|
18
|
+
|
|
19
|
+
A simple tool with one purpose: counting tokens in a text file.
|
|
20
|
+
|
|
21
|
+
## Requirements
|
|
22
|
+
|
|
23
|
+
This package uses the [tiktoken](https://github.com/openai/tiktoken) library for tokenization.
|
|
24
|
+
|
|
25
|
+
It is installed automatically as a dependency of this package.
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
For usage from the command line, install the package in an isolated environment with pipx:
|
|
29
|
+
|
|
30
|
+
```sh
|
|
31
|
+
$ pipx install count_tokens
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or install it in your current environment with pip (`pip install count-tokens`).
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
Open terminal and run:
|
|
39
|
+
|
|
40
|
+
```shell
|
|
41
|
+
$ count-tokens document.txt
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
You should see something like this:
|
|
45
|
+
|
|
46
|
+
```shell
|
|
47
|
+
File: document.txt
|
|
48
|
+
Encoding: cl100k_base
|
|
49
|
+
Number of tokens: 67
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
If you want to see just the token count, run:
|
|
53
|
+
|
|
54
|
+
```shell
|
|
55
|
+
$ count-tokens document.txt --quiet
|
|
56
|
+
```
|
|
57
|
+
and the output will be:
|
|
58
|
+
|
|
59
|
+
```shell
|
|
60
|
+
67
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Related Projects
|
|
64
|
+
- [tiktoken](https://github.com/openai/tiktoken) - tokenization library used by this package
|
|
65
|
+
|
|
66
|
+
## Credits
|
|
67
|
+
|
|
68
|
+
Thanks to the authors of the tiktoken library for open sourcing their work.
|
|
69
|
+
|
|
70
|
+
## License
|
|
71
|
+
|
|
72
|
+
[MIT](https://izikeros.mit-license.org/) © [Krystian Safjan](https://safjan.com).
|
|
73
|
+
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Count tokens
|
|
2
|
+
|
|
3
|
+
A simple tool with one purpose: counting tokens in a text file.
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
This package uses the [tiktoken](https://github.com/openai/tiktoken) library for tokenization.
|
|
8
|
+
|
|
9
|
+
It is installed automatically as a dependency of this package.
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
For usage from the command line, install the package in an isolated environment with pipx:
|
|
13
|
+
|
|
14
|
+
```sh
|
|
15
|
+
$ pipx install count_tokens
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Or install it in your current environment with pip (`pip install count-tokens`).
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
Open terminal and run:
|
|
23
|
+
|
|
24
|
+
```shell
|
|
25
|
+
$ count-tokens document.txt
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
You should see something like this:
|
|
29
|
+
|
|
30
|
+
```shell
|
|
31
|
+
File: document.txt
|
|
32
|
+
Encoding: cl100k_base
|
|
33
|
+
Number of tokens: 67
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
If you want to see just the token count, run:
|
|
37
|
+
|
|
38
|
+
```shell
|
|
39
|
+
$ count-tokens document.txt --quiet
|
|
40
|
+
```
|
|
41
|
+
and the output will be:
|
|
42
|
+
|
|
43
|
+
```shell
|
|
44
|
+
67
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Related Projects
|
|
48
|
+
- [tiktoken](https://github.com/openai/tiktoken) - tokenization library used by this package
|
|
49
|
+
|
|
50
|
+
## Credits
|
|
51
|
+
|
|
52
|
+
Thanks to the authors of the tiktoken library for open sourcing their work.
|
|
53
|
+
|
|
54
|
+
## License
|
|
55
|
+
|
|
56
|
+
[MIT](https://izikeros.mit-license.org/) © [Krystian Safjan](https://safjan.com).
|
|
File without changes
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
|
|
4
|
+
import tiktoken
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in a text string.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        The length of the token sequence the encoding produces for ``string``.

    Any exception raised by ``tiktoken.get_encoding`` (for example for an
    unrecognized ``encoding_name``) propagates to the caller.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # By default ``encode`` raises ValueError when the text contains
    # special-token markers such as "<|endoftext|>". Arbitrary documents may
    # legitimately mention them, so they are encoded as ordinary text.
    return len(encoding.encode(string, disallowed_special=()))
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def count_tokens(file_path, encoding_name):
    """Count the tokens in a text file.

    Args:
        file_path: Path of the text file to read.
        encoding_name: Name of the tiktoken encoding, passed through to
            ``num_tokens_from_string``.

    Returns:
        The number of tokens in the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the count does not depend on the
    # platform's locale encoding.
    with open(file_path, encoding="utf-8") as file:
        text = file.read()
    return num_tokens_from_string(text, encoding_name)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def main(argv=None):
    """Command-line entry point: count the tokens in a text file.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default, used by the console script) argparse reads
            ``sys.argv[1:]``.

    Prints the file name, encoding and token count, or only the count when
    ``-q``/``--quiet`` is given. Read errors and invalid input are reported
    as a usage error (exit status 2) instead of a traceback.
    """
    parser = argparse.ArgumentParser(
        description="Count the number of tokens in a text file."
    )
    parser.add_argument("file", help="Path to the input text file")
    # -q suppresses everything except the bare token count
    parser.add_argument(
        "-q", "--quiet", action="store_true", help="Print only the number of tokens"
    )
    # -e selects the encoding used for tokenization
    parser.add_argument(
        "-e",
        "--encoding",
        default="cl100k_base",
        help="Encoding to use (default: cl100k_base)",
    )

    args = parser.parse_args(argv)
    file_path = args.file
    encoding_name = args.encoding

    try:
        num_tokens = count_tokens(file_path, encoding_name)
    except (OSError, ValueError) as exc:
        # OSError: missing or unreadable file. ValueError: undecodable file
        # contents (UnicodeDecodeError) or a value rejected by the tokenizer.
        # parser.error prints the message to stderr and exits with status 2.
        parser.error(f"cannot count tokens in {file_path!r}: {exc}")

    if args.quiet:
        print(num_tokens)
    else:
        print(f"File: {file_path}")
        print(f"Encoding: {encoding_name}")
        print(f"Number of tokens: {num_tokens}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "count-tokens"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "count number of tokens in the text file using tiktoken tokenizer from OpenAI"
|
|
5
|
+
authors = ["Krystian Safjan <ksafjan@gmail.com>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
packages = [{include = "count_tokens"}]
|
|
9
|
+
|
|
10
|
+
[tool.poetry.dependencies]
|
|
11
|
+
python = "^3.9"
|
|
12
|
+
tiktoken = "^0.4.0"
|
|
13
|
+
|
|
14
|
+
[tool.poetry.scripts]
|
|
15
|
+
count-tokens = 'count_tokens.count:main'
|
|
16
|
+
|
|
17
|
+
[tool.poetry.group.dev.dependencies]
|
|
18
|
+
tox = "^4.6.3"
|
|
19
|
+
pre-commit = "^3.3.3"
|
|
20
|
+
pytest = "^7.4.0"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["poetry-core"]
|
|
24
|
+
build-backend = "poetry.core.masonry.api"
|