fibpetokenizer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fibpetokenizer-0.1.0/.gitignore +1 -0
- fibpetokenizer-0.1.0/CHANGELOG.md +32 -0
- fibpetokenizer-0.1.0/CONTRIBUTING.md +147 -0
- fibpetokenizer-0.1.0/Cargo.lock +441 -0
- fibpetokenizer-0.1.0/Cargo.toml +38 -0
- fibpetokenizer-0.1.0/LICENSE-APACHE +175 -0
- fibpetokenizer-0.1.0/LICENSE-MIT +21 -0
- fibpetokenizer-0.1.0/PKG-INFO +238 -0
- fibpetokenizer-0.1.0/README.md +206 -0
- fibpetokenizer-0.1.0/example_python.py +72 -0
- fibpetokenizer-0.1.0/examples/basic_usage.rs +75 -0
- fibpetokenizer-0.1.0/pyproject.toml +42 -0
- fibpetokenizer-0.1.0/src/bpe_tokenizer.rs +1015 -0
- fibpetokenizer-0.1.0/src/lib.rs +49 -0
- fibpetokenizer-0.1.0/src/main.rs +74 -0
- fibpetokenizer-0.1.0/src/pretokenizer.rs +49 -0
- fibpetokenizer-0.1.0/src/python_wrapper.rs +204 -0
- fibpetokenizer-0.1.0/src/tokenizer_tables.rs +51 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/target
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-03-03
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- Initial release of FIBpeTokenizer
|
|
14
|
+
- Core BPE tokenization algorithm with parallel processing
|
|
15
|
+
- Pre-tokenization strategies (Whitespace, Punctuation)
|
|
16
|
+
- Special token handling with two removal methods (Simple, AhoCorasick)
|
|
17
|
+
- Model save/load functionality
|
|
18
|
+
- Encoding and decoding capabilities
|
|
19
|
+
- Token type tracking (WORD, SUBWORD, SPECIALTOKEN)
|
|
20
|
+
- Python bindings via PyO3
|
|
21
|
+
- Comprehensive documentation and examples
|
|
22
|
+
- Dual licensing (MIT OR Apache-2.0)
|
|
23
|
+
|
|
24
|
+
### Features
|
|
25
|
+
- Fast training with Rayon parallelization
|
|
26
|
+
- Efficient token table with Arc-based string sharing
|
|
27
|
+
- JSON serialization for trained models
|
|
28
|
+
- Configurable vocabulary size
|
|
29
|
+
- Support for special tokens such as `<pad>` and `<mask>`
|
|
30
|
+
|
|
31
|
+
[Unreleased]: https://github.com/faisalishfaq2005/FIBpeTokenizer/compare/v0.1.0...HEAD
|
|
32
|
+
[0.1.0]: https://github.com/faisalishfaq2005/FIBpeTokenizer/releases/tag/v0.1.0
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Contributing to FIBpeTokenizer
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to FIBpeTokenizer! This document provides guidelines for contributing to the project.
|
|
4
|
+
|
|
5
|
+
## Code of Conduct
|
|
6
|
+
|
|
7
|
+
Be respectful, inclusive, and constructive in all interactions.
|
|
8
|
+
|
|
9
|
+
## How to Contribute
|
|
10
|
+
|
|
11
|
+
### Reporting Bugs
|
|
12
|
+
|
|
13
|
+
If you find a bug, please open an issue on GitHub with:
|
|
14
|
+
- A clear, descriptive title
|
|
15
|
+
- Steps to reproduce the issue
|
|
16
|
+
- Expected behavior vs actual behavior
|
|
17
|
+
- Your environment (OS, Rust version, Python version if applicable)
|
|
18
|
+
- Any relevant code samples or error messages
|
|
19
|
+
|
|
20
|
+
### Suggesting Features
|
|
21
|
+
|
|
22
|
+
Feature requests are welcome! Please open an issue describing:
|
|
23
|
+
- The problem you're trying to solve
|
|
24
|
+
- Your proposed solution
|
|
25
|
+
- Why this feature would be useful to others
|
|
26
|
+
|
|
27
|
+
### Pull Requests
|
|
28
|
+
|
|
29
|
+
1. **Fork the repository** and create a branch from `main`
|
|
30
|
+
2. **Make your changes** following the coding standards below
|
|
31
|
+
3. **Add tests** for any new functionality
|
|
32
|
+
4. **Update documentation** including doc comments and README if needed
|
|
33
|
+
5. **Run tests and checks**:
|
|
34
|
+
```bash
|
|
35
|
+
cargo test
|
|
36
|
+
cargo clippy
|
|
37
|
+
cargo fmt
|
|
38
|
+
```
|
|
39
|
+
6. **Commit your changes** with clear, descriptive commit messages
|
|
40
|
+
7. **Push to your fork** and submit a pull request
|
|
41
|
+
|
|
42
|
+
## Development Setup
|
|
43
|
+
|
|
44
|
+
1. Clone the repository:
|
|
45
|
+
```bash
|
|
46
|
+
git clone https://github.com/faisalishfaq2005/FIBpeTokenizer.git fibpetokenizer
|
|
47
|
+
cd fibpetokenizer
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
2. Build the project:
|
|
51
|
+
```bash
|
|
52
|
+
cargo build
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
3. Run tests:
|
|
56
|
+
```bash
|
|
57
|
+
cargo test
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
4. For Python development:
|
|
61
|
+
```bash
|
|
62
|
+
pip install maturin
|
|
63
|
+
maturin develop --features python
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Coding Standards
|
|
67
|
+
|
|
68
|
+
### Rust Code
|
|
69
|
+
|
|
70
|
+
- Follow the [Rust API Guidelines](https://rust-lang.github.io/api-guidelines/)
|
|
71
|
+
- Use `cargo fmt` to format code
|
|
72
|
+
- Use `cargo clippy` to catch common mistakes
|
|
73
|
+
- Add documentation comments (`///`) for all public items
|
|
74
|
+
- Write descriptive variable names
|
|
75
|
+
- Keep functions focused and reasonably sized
|
|
76
|
+
|
|
77
|
+
### Python Bindings
|
|
78
|
+
|
|
79
|
+
- Follow [PEP 8](https://peps.python.org/pep-0008/) style guide
|
|
80
|
+
- Provide type hints where applicable
|
|
81
|
+
- Document all public API functions
|
|
82
|
+
|
|
83
|
+
### Documentation
|
|
84
|
+
|
|
85
|
+
- Use clear, concise language
|
|
86
|
+
- Include examples in doc comments
|
|
87
|
+
- Update README.md for user-facing changes
|
|
88
|
+
- Add inline comments for complex logic
|
|
89
|
+
|
|
90
|
+
### Testing
|
|
91
|
+
|
|
92
|
+
- Write unit tests for new functions
|
|
93
|
+
- Add integration tests for new features
|
|
94
|
+
- Ensure all tests pass before submitting PR
|
|
95
|
+
- Aim for good test coverage
|
|
96
|
+
|
|
97
|
+
## Project Structure
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
fibpetokenizer/
|
|
101
|
+
├── src/
|
|
102
|
+
│ ├── lib.rs # Library entry point
|
|
103
|
+
│ ├── bpe_tokenizer.rs # Main tokenizer implementation
|
|
104
|
+
│ ├── pretokenizer.rs # Pre-tokenization strategies
|
|
105
|
+
│ ├── tokenizer_tables.rs # Token table data structure
|
|
106
|
+
│ ├── python_wrapper.rs # PyO3 Python bindings
|
|
107
|
+
│ └── main.rs # Binary entry point (examples)
|
|
108
|
+
├── examples/ # Usage examples
|
|
109
|
+
├── Cargo.toml # Rust dependencies and metadata
|
|
110
|
+
├── pyproject.toml # Python package configuration
|
|
111
|
+
└── README.md # User documentation
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Areas for Contribution
|
|
115
|
+
|
|
116
|
+
Here are some areas where contributions are especially welcome:
|
|
117
|
+
|
|
118
|
+
### High Priority
|
|
119
|
+
- Additional pre-tokenization strategies
|
|
120
|
+
- Performance optimizations
|
|
121
|
+
- More comprehensive tests
|
|
122
|
+
- Better error handling
|
|
123
|
+
|
|
124
|
+
### Medium Priority
|
|
125
|
+
- Additional examples
|
|
126
|
+
- Serialization format improvements
|
|
127
|
+
- CLI tool for training tokenizers
|
|
128
|
+
- Benchmarking suite
|
|
129
|
+
|
|
130
|
+
### Documentation
|
|
131
|
+
- Tutorial documentation
|
|
132
|
+
- API reference improvements
|
|
133
|
+
- Translation of documentation
|
|
134
|
+
|
|
135
|
+
## Questions?
|
|
136
|
+
|
|
137
|
+
If you have questions about contributing, feel free to:
|
|
138
|
+
- Open an issue for discussion
|
|
139
|
+
- Reach out to maintainers
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
By contributing to FIBpeTokenizer, you agree that your contributions will be licensed under the same license as the project (MIT OR Apache-2.0).
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
Thank you for contributing to FIBpeTokenizer! 🎉
|
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
# This file is automatically @generated by Cargo.
|
|
2
|
+
# It is not intended for manual editing.
|
|
3
|
+
version = 4
|
|
4
|
+
|
|
5
|
+
[[package]]
|
|
6
|
+
name = "aho-corasick"
|
|
7
|
+
version = "1.1.3"
|
|
8
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
9
|
+
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"memchr",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[[package]]
|
|
15
|
+
name = "autocfg"
|
|
16
|
+
version = "1.5.0"
|
|
17
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
18
|
+
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
|
19
|
+
|
|
20
|
+
[[package]]
|
|
21
|
+
name = "bitflags"
|
|
22
|
+
version = "2.9.4"
|
|
23
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
24
|
+
checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
|
|
25
|
+
|
|
26
|
+
[[package]]
|
|
27
|
+
name = "cfg-if"
|
|
28
|
+
version = "1.0.3"
|
|
29
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
30
|
+
checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
|
|
31
|
+
|
|
32
|
+
[[package]]
|
|
33
|
+
name = "crossbeam-deque"
|
|
34
|
+
version = "0.8.6"
|
|
35
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
36
|
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
|
37
|
+
dependencies = [
|
|
38
|
+
"crossbeam-epoch",
|
|
39
|
+
"crossbeam-utils",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[[package]]
|
|
43
|
+
name = "crossbeam-epoch"
|
|
44
|
+
version = "0.9.18"
|
|
45
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
46
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
|
47
|
+
dependencies = [
|
|
48
|
+
"crossbeam-utils",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[[package]]
|
|
52
|
+
name = "crossbeam-utils"
|
|
53
|
+
version = "0.8.21"
|
|
54
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
55
|
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
|
56
|
+
|
|
57
|
+
[[package]]
|
|
58
|
+
name = "either"
|
|
59
|
+
version = "1.15.0"
|
|
60
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
61
|
+
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
|
62
|
+
|
|
63
|
+
[[package]]
|
|
64
|
+
name = "fibpetokenizer"
|
|
65
|
+
version = "0.1.0"
|
|
66
|
+
dependencies = [
|
|
67
|
+
"aho-corasick",
|
|
68
|
+
"pyo3",
|
|
69
|
+
"rayon",
|
|
70
|
+
"serde",
|
|
71
|
+
"serde_json",
|
|
72
|
+
"thiserror",
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
[[package]]
|
|
76
|
+
name = "heck"
|
|
77
|
+
version = "0.4.1"
|
|
78
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
79
|
+
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
|
80
|
+
|
|
81
|
+
[[package]]
|
|
82
|
+
name = "indoc"
|
|
83
|
+
version = "2.0.6"
|
|
84
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
85
|
+
checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
|
|
86
|
+
|
|
87
|
+
[[package]]
|
|
88
|
+
name = "itoa"
|
|
89
|
+
version = "1.0.15"
|
|
90
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
91
|
+
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
|
92
|
+
|
|
93
|
+
[[package]]
|
|
94
|
+
name = "libc"
|
|
95
|
+
version = "0.2.176"
|
|
96
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
97
|
+
checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174"
|
|
98
|
+
|
|
99
|
+
[[package]]
|
|
100
|
+
name = "lock_api"
|
|
101
|
+
version = "0.4.13"
|
|
102
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
103
|
+
checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
|
|
104
|
+
dependencies = [
|
|
105
|
+
"autocfg",
|
|
106
|
+
"scopeguard",
|
|
107
|
+
]
|
|
108
|
+
|
|
109
|
+
[[package]]
|
|
110
|
+
name = "memchr"
|
|
111
|
+
version = "2.7.5"
|
|
112
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
113
|
+
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
|
|
114
|
+
|
|
115
|
+
[[package]]
|
|
116
|
+
name = "memoffset"
|
|
117
|
+
version = "0.9.1"
|
|
118
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
119
|
+
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
|
|
120
|
+
dependencies = [
|
|
121
|
+
"autocfg",
|
|
122
|
+
]
|
|
123
|
+
|
|
124
|
+
[[package]]
|
|
125
|
+
name = "once_cell"
|
|
126
|
+
version = "1.21.3"
|
|
127
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
128
|
+
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
|
129
|
+
|
|
130
|
+
[[package]]
|
|
131
|
+
name = "parking_lot"
|
|
132
|
+
version = "0.12.4"
|
|
133
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
134
|
+
checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13"
|
|
135
|
+
dependencies = [
|
|
136
|
+
"lock_api",
|
|
137
|
+
"parking_lot_core",
|
|
138
|
+
]
|
|
139
|
+
|
|
140
|
+
[[package]]
|
|
141
|
+
name = "parking_lot_core"
|
|
142
|
+
version = "0.9.11"
|
|
143
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
144
|
+
checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5"
|
|
145
|
+
dependencies = [
|
|
146
|
+
"cfg-if",
|
|
147
|
+
"libc",
|
|
148
|
+
"redox_syscall",
|
|
149
|
+
"smallvec",
|
|
150
|
+
"windows-targets",
|
|
151
|
+
]
|
|
152
|
+
|
|
153
|
+
[[package]]
|
|
154
|
+
name = "portable-atomic"
|
|
155
|
+
version = "1.11.1"
|
|
156
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
157
|
+
checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483"
|
|
158
|
+
|
|
159
|
+
[[package]]
|
|
160
|
+
name = "proc-macro2"
|
|
161
|
+
version = "1.0.101"
|
|
162
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
163
|
+
checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
|
|
164
|
+
dependencies = [
|
|
165
|
+
"unicode-ident",
|
|
166
|
+
]
|
|
167
|
+
|
|
168
|
+
[[package]]
|
|
169
|
+
name = "pyo3"
|
|
170
|
+
version = "0.20.3"
|
|
171
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
172
|
+
checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233"
|
|
173
|
+
dependencies = [
|
|
174
|
+
"cfg-if",
|
|
175
|
+
"indoc",
|
|
176
|
+
"libc",
|
|
177
|
+
"memoffset",
|
|
178
|
+
"parking_lot",
|
|
179
|
+
"portable-atomic",
|
|
180
|
+
"pyo3-build-config",
|
|
181
|
+
"pyo3-ffi",
|
|
182
|
+
"pyo3-macros",
|
|
183
|
+
"unindent",
|
|
184
|
+
]
|
|
185
|
+
|
|
186
|
+
[[package]]
|
|
187
|
+
name = "pyo3-build-config"
|
|
188
|
+
version = "0.20.3"
|
|
189
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
190
|
+
checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7"
|
|
191
|
+
dependencies = [
|
|
192
|
+
"once_cell",
|
|
193
|
+
"target-lexicon",
|
|
194
|
+
]
|
|
195
|
+
|
|
196
|
+
[[package]]
|
|
197
|
+
name = "pyo3-ffi"
|
|
198
|
+
version = "0.20.3"
|
|
199
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
200
|
+
checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa"
|
|
201
|
+
dependencies = [
|
|
202
|
+
"libc",
|
|
203
|
+
"pyo3-build-config",
|
|
204
|
+
]
|
|
205
|
+
|
|
206
|
+
[[package]]
|
|
207
|
+
name = "pyo3-macros"
|
|
208
|
+
version = "0.20.3"
|
|
209
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
210
|
+
checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158"
|
|
211
|
+
dependencies = [
|
|
212
|
+
"proc-macro2",
|
|
213
|
+
"pyo3-macros-backend",
|
|
214
|
+
"quote",
|
|
215
|
+
"syn",
|
|
216
|
+
]
|
|
217
|
+
|
|
218
|
+
[[package]]
|
|
219
|
+
name = "pyo3-macros-backend"
|
|
220
|
+
version = "0.20.3"
|
|
221
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
222
|
+
checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185"
|
|
223
|
+
dependencies = [
|
|
224
|
+
"heck",
|
|
225
|
+
"proc-macro2",
|
|
226
|
+
"pyo3-build-config",
|
|
227
|
+
"quote",
|
|
228
|
+
"syn",
|
|
229
|
+
]
|
|
230
|
+
|
|
231
|
+
[[package]]
|
|
232
|
+
name = "quote"
|
|
233
|
+
version = "1.0.40"
|
|
234
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
235
|
+
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
|
|
236
|
+
dependencies = [
|
|
237
|
+
"proc-macro2",
|
|
238
|
+
]
|
|
239
|
+
|
|
240
|
+
[[package]]
|
|
241
|
+
name = "rayon"
|
|
242
|
+
version = "1.10.0"
|
|
243
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
244
|
+
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
|
245
|
+
dependencies = [
|
|
246
|
+
"either",
|
|
247
|
+
"rayon-core",
|
|
248
|
+
]
|
|
249
|
+
|
|
250
|
+
[[package]]
|
|
251
|
+
name = "rayon-core"
|
|
252
|
+
version = "1.12.1"
|
|
253
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
254
|
+
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
|
255
|
+
dependencies = [
|
|
256
|
+
"crossbeam-deque",
|
|
257
|
+
"crossbeam-utils",
|
|
258
|
+
]
|
|
259
|
+
|
|
260
|
+
[[package]]
|
|
261
|
+
name = "redox_syscall"
|
|
262
|
+
version = "0.5.17"
|
|
263
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
264
|
+
checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77"
|
|
265
|
+
dependencies = [
|
|
266
|
+
"bitflags",
|
|
267
|
+
]
|
|
268
|
+
|
|
269
|
+
[[package]]
|
|
270
|
+
name = "ryu"
|
|
271
|
+
version = "1.0.20"
|
|
272
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
273
|
+
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
|
|
274
|
+
|
|
275
|
+
[[package]]
|
|
276
|
+
name = "scopeguard"
|
|
277
|
+
version = "1.2.0"
|
|
278
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
279
|
+
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
|
280
|
+
|
|
281
|
+
[[package]]
|
|
282
|
+
name = "serde"
|
|
283
|
+
version = "1.0.226"
|
|
284
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
285
|
+
checksum = "0dca6411025b24b60bfa7ec1fe1f8e710ac09782dca409ee8237ba74b51295fd"
|
|
286
|
+
dependencies = [
|
|
287
|
+
"serde_core",
|
|
288
|
+
"serde_derive",
|
|
289
|
+
]
|
|
290
|
+
|
|
291
|
+
[[package]]
|
|
292
|
+
name = "serde_core"
|
|
293
|
+
version = "1.0.226"
|
|
294
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
295
|
+
checksum = "ba2ba63999edb9dac981fb34b3e5c0d111a69b0924e253ed29d83f7c99e966a4"
|
|
296
|
+
dependencies = [
|
|
297
|
+
"serde_derive",
|
|
298
|
+
]
|
|
299
|
+
|
|
300
|
+
[[package]]
|
|
301
|
+
name = "serde_derive"
|
|
302
|
+
version = "1.0.226"
|
|
303
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
304
|
+
checksum = "8db53ae22f34573731bafa1db20f04027b2d25e02d8205921b569171699cdb33"
|
|
305
|
+
dependencies = [
|
|
306
|
+
"proc-macro2",
|
|
307
|
+
"quote",
|
|
308
|
+
"syn",
|
|
309
|
+
]
|
|
310
|
+
|
|
311
|
+
[[package]]
|
|
312
|
+
name = "serde_json"
|
|
313
|
+
version = "1.0.145"
|
|
314
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
315
|
+
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
|
|
316
|
+
dependencies = [
|
|
317
|
+
"itoa",
|
|
318
|
+
"memchr",
|
|
319
|
+
"ryu",
|
|
320
|
+
"serde",
|
|
321
|
+
"serde_core",
|
|
322
|
+
]
|
|
323
|
+
|
|
324
|
+
[[package]]
|
|
325
|
+
name = "smallvec"
|
|
326
|
+
version = "1.15.1"
|
|
327
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
328
|
+
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
|
329
|
+
|
|
330
|
+
[[package]]
|
|
331
|
+
name = "syn"
|
|
332
|
+
version = "2.0.106"
|
|
333
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
334
|
+
checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
|
|
335
|
+
dependencies = [
|
|
336
|
+
"proc-macro2",
|
|
337
|
+
"quote",
|
|
338
|
+
"unicode-ident",
|
|
339
|
+
]
|
|
340
|
+
|
|
341
|
+
[[package]]
|
|
342
|
+
name = "target-lexicon"
|
|
343
|
+
version = "0.12.16"
|
|
344
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
345
|
+
checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
|
|
346
|
+
|
|
347
|
+
[[package]]
|
|
348
|
+
name = "thiserror"
|
|
349
|
+
version = "1.0.69"
|
|
350
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
351
|
+
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
|
352
|
+
dependencies = [
|
|
353
|
+
"thiserror-impl",
|
|
354
|
+
]
|
|
355
|
+
|
|
356
|
+
[[package]]
|
|
357
|
+
name = "thiserror-impl"
|
|
358
|
+
version = "1.0.69"
|
|
359
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
360
|
+
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
|
|
361
|
+
dependencies = [
|
|
362
|
+
"proc-macro2",
|
|
363
|
+
"quote",
|
|
364
|
+
"syn",
|
|
365
|
+
]
|
|
366
|
+
|
|
367
|
+
[[package]]
|
|
368
|
+
name = "unicode-ident"
|
|
369
|
+
version = "1.0.19"
|
|
370
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
371
|
+
checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
|
|
372
|
+
|
|
373
|
+
[[package]]
|
|
374
|
+
name = "unindent"
|
|
375
|
+
version = "0.2.4"
|
|
376
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
377
|
+
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
|
|
378
|
+
|
|
379
|
+
[[package]]
|
|
380
|
+
name = "windows-targets"
|
|
381
|
+
version = "0.52.6"
|
|
382
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
383
|
+
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
|
384
|
+
dependencies = [
|
|
385
|
+
"windows_aarch64_gnullvm",
|
|
386
|
+
"windows_aarch64_msvc",
|
|
387
|
+
"windows_i686_gnu",
|
|
388
|
+
"windows_i686_gnullvm",
|
|
389
|
+
"windows_i686_msvc",
|
|
390
|
+
"windows_x86_64_gnu",
|
|
391
|
+
"windows_x86_64_gnullvm",
|
|
392
|
+
"windows_x86_64_msvc",
|
|
393
|
+
]
|
|
394
|
+
|
|
395
|
+
[[package]]
|
|
396
|
+
name = "windows_aarch64_gnullvm"
|
|
397
|
+
version = "0.52.6"
|
|
398
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
399
|
+
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
|
400
|
+
|
|
401
|
+
[[package]]
|
|
402
|
+
name = "windows_aarch64_msvc"
|
|
403
|
+
version = "0.52.6"
|
|
404
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
405
|
+
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
|
406
|
+
|
|
407
|
+
[[package]]
|
|
408
|
+
name = "windows_i686_gnu"
|
|
409
|
+
version = "0.52.6"
|
|
410
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
411
|
+
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
|
412
|
+
|
|
413
|
+
[[package]]
|
|
414
|
+
name = "windows_i686_gnullvm"
|
|
415
|
+
version = "0.52.6"
|
|
416
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
417
|
+
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
|
418
|
+
|
|
419
|
+
[[package]]
|
|
420
|
+
name = "windows_i686_msvc"
|
|
421
|
+
version = "0.52.6"
|
|
422
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
423
|
+
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
|
424
|
+
|
|
425
|
+
[[package]]
|
|
426
|
+
name = "windows_x86_64_gnu"
|
|
427
|
+
version = "0.52.6"
|
|
428
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
429
|
+
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
|
430
|
+
|
|
431
|
+
[[package]]
|
|
432
|
+
name = "windows_x86_64_gnullvm"
|
|
433
|
+
version = "0.52.6"
|
|
434
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
435
|
+
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
|
436
|
+
|
|
437
|
+
[[package]]
|
|
438
|
+
name = "windows_x86_64_msvc"
|
|
439
|
+
version = "0.52.6"
|
|
440
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
441
|
+
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "fibpetokenizer"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
authors = ["Muhammad Faisal Ishfaq <faisalishfaq59@gmail.com>"]
|
|
6
|
+
description = "A blazing fast Byte Pair Encoding (BPE) tokenizer library with Python bindings"
|
|
7
|
+
license = "MIT OR Apache-2.0"
|
|
8
|
+
repository = "https://github.com/faisalishfaq2005/FIBpeTokenizer"
|
|
9
|
+
homepage = "https://github.com/faisalishfaq2005/FIBpeTokenizer"
|
|
10
|
+
documentation = "https://docs.rs/fibpetokenizer"
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
keywords = ["tokenizer", "bpe", "nlp", "machine-learning", "text-processing"]
|
|
13
|
+
categories = ["text-processing", "algorithms", "data-structures"]
|
|
14
|
+
|
|
15
|
+
[lib]
|
|
16
|
+
name = "fibpetokenizer"
|
|
17
|
+
path = "src/lib.rs"
|
|
18
|
+
crate-type = ["cdylib", "rlib"]
|
|
19
|
+
|
|
20
|
+
[[bin]]
|
|
21
|
+
name = "fibpetokenizer"
|
|
22
|
+
path = "src/main.rs"
|
|
23
|
+
|
|
24
|
+
[dependencies]
|
|
25
|
+
rayon = "1.8"
|
|
26
|
+
aho-corasick = "1.1"
|
|
27
|
+
thiserror = "1.0"
|
|
28
|
+
serde = { version = "1.0", features = ["derive", "rc"] }
|
|
29
|
+
serde_json = "1.0"
|
|
30
|
+
pyo3 = { version = "0.20", features = ["extension-module"], optional = true }
|
|
31
|
+
|
|
32
|
+
[features]
|
|
33
|
+
default = []
|
|
34
|
+
python = ["pyo3"]
|
|
35
|
+
|
|
36
|
+
[profile.release]
|
|
37
|
+
opt-level = 3
|
|
38
|
+
lto = true
|