count-tokens 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.1
2
+ Name: count-tokens
3
+ Version: 0.1.0
4
+ Summary: count number of tokens in the text file using tiktoken tokenizer from OpenAI
5
+ License: MIT
6
+ Author: Krystian Safjan
7
+ Author-email: ksafjan@gmail.com
8
+ Requires-Python: >=3.9,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Requires-Dist: tiktoken (>=0.4.0,<0.5.0)
15
+ Description-Content-Type: text/markdown
16
+
17
+ # Count tokens
18
+
19
+ Simple tools that have one purpose - count tokens in a text file.
20
+
21
+ ## Requirements
22
+
23
+ This package uses the [tiktoken](https://github.com/openai/tiktoken) library for tokenization.
24
+
25
+
26
+
27
+ ## Installation
28
+ For usage from the command line, install the package in an isolated environment with pipx:
29
+
30
+ ```sh
31
+ $ pipx install count_tokens
32
+ ```
33
+
34
+ or install it in your current environment with pip.
35
+
36
+
37
+ ## Usage
38
+ Open terminal and run:
39
+
40
+ ```shell
41
+ $ count-tokens document.txt
42
+ ```
43
+
44
+ You should see something like this:
45
+
46
+ ```shell
47
+ File: document.txt
48
+ Encoding: cl100k_base
49
+ Number of tokens: 67
50
+ ```
51
+
52
+ If you want to see just the token count, run:
53
+
54
+ ```shell
55
+ $ count-tokens document.txt --quiet
56
+ ```
57
+ and the output will be:
58
+
59
+ ```shell
60
+ 67
61
+ ```
62
+
63
+ ## Related Projects
64
+ - [tiktoken](https://github.com/openai/tiktoken) - tokenization library used by this package
65
+
66
+ ## Credits
67
+
68
+ Thanks to the authors of the tiktoken library for open sourcing their work.
69
+
70
+ ## License
71
+
72
+ [MIT](https://izikeros.mit-license.org/) © [Krystian Safjan](https://safjan.com).
73
+
@@ -0,0 +1,56 @@
1
+ # Count tokens
2
+
3
+ Simple tools that have one purpose - count tokens in a text file.
4
+
5
+ ## Requirements
6
+
7
+ This package uses the [tiktoken](https://github.com/openai/tiktoken) library for tokenization.
8
+
9
+
10
+
11
+ ## Installation
12
+ For usage from the command line, install the package in an isolated environment with pipx:
13
+
14
+ ```sh
15
+ $ pipx install count_tokens
16
+ ```
17
+
18
+ or install it in your current environment with pip.
19
+
20
+
21
+ ## Usage
22
+ Open terminal and run:
23
+
24
+ ```shell
25
+ $ count-tokens document.txt
26
+ ```
27
+
28
+ You should see something like this:
29
+
30
+ ```shell
31
+ File: document.txt
32
+ Encoding: cl100k_base
33
+ Number of tokens: 67
34
+ ```
35
+
36
+ If you want to see just the token count, run:
37
+
38
+ ```shell
39
+ $ count-tokens document.txt --quiet
40
+ ```
41
+ and the output will be:
42
+
43
+ ```shell
44
+ 67
45
+ ```
46
+
47
+ ## Related Projects
48
+ - [tiktoken](https://github.com/openai/tiktoken) - tokenization library used by this package
49
+
50
+ ## Credits
51
+
52
+ Thanks to the authors of the tiktoken library for open sourcing their work.
53
+
54
+ ## License
55
+
56
+ [MIT](https://izikeros.mit-license.org/) © [Krystian Safjan](https://safjan.com).
File without changes
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+
4
+ import tiktoken
5
+
6
+
7
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in a text string.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding, looked up with
            ``tiktoken.get_encoding`` (default: ``"cl100k_base"``).

    Returns:
        Length of the token sequence produced by encoding ``string``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))
17
+
18
+
19
def count_tokens(file_path, encoding_name):
    """Count the tokens in the text file at ``file_path``.

    Args:
        file_path: Path of the text file to read.
        encoding_name: tiktoken encoding name forwarded to
            ``num_tokens_from_string``.

    Returns:
        Number of tokens in the file's contents.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the same file yields the same count on every machine.
    with open(file_path, encoding="utf-8") as file:
        text = file.read()
    return num_tokens_from_string(text, encoding_name)
23
+
24
+
25
def main():
    """Command-line entry point: parse arguments and print the token count."""
    cli = argparse.ArgumentParser(
        description="Count the number of tokens in a text file."
    )
    cli.add_argument("file", help="Path to the input text file")
    # -q/--quiet: print only the bare number
    cli.add_argument(
        "-q", "--quiet", action="store_true", help="Print only the number of tokens"
    )
    # -e/--encoding: choose the tokenizer encoding
    cli.add_argument(
        "-e",
        "--encoding",
        default="cl100k_base",
        help="Encoding to use (default: cl100k_base)",
    )

    options = cli.parse_args()
    total = count_tokens(options.file, options.encoding)

    # Quiet mode prints just the count and stops.
    if options.quiet:
        print(total)
        return

    print(f"File: {options.file}")
    print(f"Encoding: {options.encoding}")
    print(f"Number of tokens: {total}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,24 @@
1
+ [tool.poetry]
2
+ name = "count-tokens"
3
+ version = "0.1.0"
4
+ description = "count number of tokens in the text file using tiktoken tokenizer from OpenAI"
5
+ authors = ["Krystian Safjan <ksafjan@gmail.com>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+ packages = [{include = "count_tokens"}]
9
+
10
+ [tool.poetry.dependencies]
11
+ python = "^3.9"
12
+ tiktoken = "^0.4.0"
13
+
14
+ [tool.poetry.scripts]
15
+ count-tokens = 'count_tokens.count:main'
16
+
17
+ [tool.poetry.group.dev.dependencies]
18
+ tox = "^4.6.3"
19
+ pre-commit = "^3.3.3"
20
+ pytest = "^7.4.0"
21
+
22
+ [build-system]
23
+ requires = ["poetry-core"]
24
+ build-backend = "poetry.core.masonry.api"