gotoken 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gotoken-0.1.0/Cargo.lock +341 -0
- gotoken-0.1.0/Cargo.toml +31 -0
- gotoken-0.1.0/PKG-INFO +9 -0
- gotoken-0.1.0/patch_lexar.py +199 -0
- gotoken-0.1.0/patch_lib_python.py +44 -0
- gotoken-0.1.0/pyproject.toml +23 -0
- gotoken-0.1.0/src/encoder.rs +572 -0
- gotoken-0.1.0/src/lexer.rs +671 -0
- gotoken-0.1.0/src/lib.rs +413 -0
- gotoken-0.1.0/src/python.rs +128 -0
- gotoken-0.1.0/src/vocab.rs +654 -0
- gotoken-0.1.0/test_gotoken.py +221 -0
gotoken-0.1.0/Cargo.lock
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
# This file is automatically @generated by Cargo.
|
|
2
|
+
# It is not intended for manual editing.
|
|
3
|
+
version = 4
|
|
4
|
+
|
|
5
|
+
[[package]]
|
|
6
|
+
name = "aho-corasick"
|
|
7
|
+
version = "1.1.4"
|
|
8
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
9
|
+
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"memchr",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[[package]]
|
|
15
|
+
name = "autocfg"
|
|
16
|
+
version = "1.5.1"
|
|
17
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
18
|
+
checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53"
|
|
19
|
+
|
|
20
|
+
[[package]]
|
|
21
|
+
name = "cfg-if"
|
|
22
|
+
version = "1.0.4"
|
|
23
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
24
|
+
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
|
25
|
+
|
|
26
|
+
[[package]]
|
|
27
|
+
name = "crossbeam-deque"
|
|
28
|
+
version = "0.8.6"
|
|
29
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
30
|
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
|
31
|
+
dependencies = [
|
|
32
|
+
"crossbeam-epoch",
|
|
33
|
+
"crossbeam-utils",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[[package]]
|
|
37
|
+
name = "crossbeam-epoch"
|
|
38
|
+
version = "0.9.18"
|
|
39
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
40
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
|
41
|
+
dependencies = [
|
|
42
|
+
"crossbeam-utils",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[[package]]
|
|
46
|
+
name = "crossbeam-utils"
|
|
47
|
+
version = "0.8.21"
|
|
48
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
49
|
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
|
50
|
+
|
|
51
|
+
[[package]]
|
|
52
|
+
name = "either"
|
|
53
|
+
version = "1.16.0"
|
|
54
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
55
|
+
checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
|
|
56
|
+
|
|
57
|
+
[[package]]
|
|
58
|
+
name = "gotoken"
|
|
59
|
+
version = "0.1.0"
|
|
60
|
+
dependencies = [
|
|
61
|
+
"phf",
|
|
62
|
+
"pyo3",
|
|
63
|
+
"rayon",
|
|
64
|
+
"regex",
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
[[package]]
|
|
68
|
+
name = "heck"
|
|
69
|
+
version = "0.5.0"
|
|
70
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
71
|
+
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
|
72
|
+
|
|
73
|
+
[[package]]
|
|
74
|
+
name = "indoc"
|
|
75
|
+
version = "2.0.7"
|
|
76
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
77
|
+
checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
|
|
78
|
+
dependencies = [
|
|
79
|
+
"rustversion",
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
[[package]]
|
|
83
|
+
name = "libc"
|
|
84
|
+
version = "0.2.186"
|
|
85
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
86
|
+
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
|
87
|
+
|
|
88
|
+
[[package]]
|
|
89
|
+
name = "memchr"
|
|
90
|
+
version = "2.8.2"
|
|
91
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
92
|
+
checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4"
|
|
93
|
+
|
|
94
|
+
[[package]]
|
|
95
|
+
name = "memoffset"
|
|
96
|
+
version = "0.9.1"
|
|
97
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
98
|
+
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
|
|
99
|
+
dependencies = [
|
|
100
|
+
"autocfg",
|
|
101
|
+
]
|
|
102
|
+
|
|
103
|
+
[[package]]
|
|
104
|
+
name = "once_cell"
|
|
105
|
+
version = "1.21.4"
|
|
106
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
107
|
+
checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
|
|
108
|
+
|
|
109
|
+
[[package]]
|
|
110
|
+
name = "phf"
|
|
111
|
+
version = "0.11.3"
|
|
112
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
113
|
+
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
|
|
114
|
+
dependencies = [
|
|
115
|
+
"phf_macros",
|
|
116
|
+
"phf_shared",
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
[[package]]
|
|
120
|
+
name = "phf_generator"
|
|
121
|
+
version = "0.11.3"
|
|
122
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
123
|
+
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
|
124
|
+
dependencies = [
|
|
125
|
+
"phf_shared",
|
|
126
|
+
"rand",
|
|
127
|
+
]
|
|
128
|
+
|
|
129
|
+
[[package]]
|
|
130
|
+
name = "phf_macros"
|
|
131
|
+
version = "0.11.3"
|
|
132
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
133
|
+
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
|
|
134
|
+
dependencies = [
|
|
135
|
+
"phf_generator",
|
|
136
|
+
"phf_shared",
|
|
137
|
+
"proc-macro2",
|
|
138
|
+
"quote",
|
|
139
|
+
"syn",
|
|
140
|
+
]
|
|
141
|
+
|
|
142
|
+
[[package]]
|
|
143
|
+
name = "phf_shared"
|
|
144
|
+
version = "0.11.3"
|
|
145
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
146
|
+
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
|
|
147
|
+
dependencies = [
|
|
148
|
+
"siphasher",
|
|
149
|
+
]
|
|
150
|
+
|
|
151
|
+
[[package]]
|
|
152
|
+
name = "portable-atomic"
|
|
153
|
+
version = "1.13.1"
|
|
154
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
155
|
+
checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
|
|
156
|
+
|
|
157
|
+
[[package]]
|
|
158
|
+
name = "proc-macro2"
|
|
159
|
+
version = "1.0.106"
|
|
160
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
161
|
+
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
|
|
162
|
+
dependencies = [
|
|
163
|
+
"unicode-ident",
|
|
164
|
+
]
|
|
165
|
+
|
|
166
|
+
[[package]]
|
|
167
|
+
name = "pyo3"
|
|
168
|
+
version = "0.22.6"
|
|
169
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
170
|
+
checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
|
|
171
|
+
dependencies = [
|
|
172
|
+
"cfg-if",
|
|
173
|
+
"indoc",
|
|
174
|
+
"libc",
|
|
175
|
+
"memoffset",
|
|
176
|
+
"once_cell",
|
|
177
|
+
"portable-atomic",
|
|
178
|
+
"pyo3-build-config",
|
|
179
|
+
"pyo3-ffi",
|
|
180
|
+
"pyo3-macros",
|
|
181
|
+
"unindent",
|
|
182
|
+
]
|
|
183
|
+
|
|
184
|
+
[[package]]
|
|
185
|
+
name = "pyo3-build-config"
|
|
186
|
+
version = "0.22.6"
|
|
187
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
188
|
+
checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
|
|
189
|
+
dependencies = [
|
|
190
|
+
"once_cell",
|
|
191
|
+
"target-lexicon",
|
|
192
|
+
]
|
|
193
|
+
|
|
194
|
+
[[package]]
|
|
195
|
+
name = "pyo3-ffi"
|
|
196
|
+
version = "0.22.6"
|
|
197
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
198
|
+
checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
|
|
199
|
+
dependencies = [
|
|
200
|
+
"libc",
|
|
201
|
+
"pyo3-build-config",
|
|
202
|
+
]
|
|
203
|
+
|
|
204
|
+
[[package]]
|
|
205
|
+
name = "pyo3-macros"
|
|
206
|
+
version = "0.22.6"
|
|
207
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
208
|
+
checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
|
|
209
|
+
dependencies = [
|
|
210
|
+
"proc-macro2",
|
|
211
|
+
"pyo3-macros-backend",
|
|
212
|
+
"quote",
|
|
213
|
+
"syn",
|
|
214
|
+
]
|
|
215
|
+
|
|
216
|
+
[[package]]
|
|
217
|
+
name = "pyo3-macros-backend"
|
|
218
|
+
version = "0.22.6"
|
|
219
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
220
|
+
checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
|
|
221
|
+
dependencies = [
|
|
222
|
+
"heck",
|
|
223
|
+
"proc-macro2",
|
|
224
|
+
"pyo3-build-config",
|
|
225
|
+
"quote",
|
|
226
|
+
"syn",
|
|
227
|
+
]
|
|
228
|
+
|
|
229
|
+
[[package]]
|
|
230
|
+
name = "quote"
|
|
231
|
+
version = "1.0.45"
|
|
232
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
233
|
+
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
|
|
234
|
+
dependencies = [
|
|
235
|
+
"proc-macro2",
|
|
236
|
+
]
|
|
237
|
+
|
|
238
|
+
[[package]]
|
|
239
|
+
name = "rand"
|
|
240
|
+
version = "0.8.6"
|
|
241
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
242
|
+
checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
|
|
243
|
+
dependencies = [
|
|
244
|
+
"rand_core",
|
|
245
|
+
]
|
|
246
|
+
|
|
247
|
+
[[package]]
|
|
248
|
+
name = "rand_core"
|
|
249
|
+
version = "0.6.4"
|
|
250
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
251
|
+
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
|
252
|
+
|
|
253
|
+
[[package]]
|
|
254
|
+
name = "rayon"
|
|
255
|
+
version = "1.12.0"
|
|
256
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
257
|
+
checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
|
|
258
|
+
dependencies = [
|
|
259
|
+
"either",
|
|
260
|
+
"rayon-core",
|
|
261
|
+
]
|
|
262
|
+
|
|
263
|
+
[[package]]
|
|
264
|
+
name = "rayon-core"
|
|
265
|
+
version = "1.13.0"
|
|
266
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
267
|
+
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
|
|
268
|
+
dependencies = [
|
|
269
|
+
"crossbeam-deque",
|
|
270
|
+
"crossbeam-utils",
|
|
271
|
+
]
|
|
272
|
+
|
|
273
|
+
[[package]]
|
|
274
|
+
name = "regex"
|
|
275
|
+
version = "1.12.4"
|
|
276
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
277
|
+
checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba"
|
|
278
|
+
dependencies = [
|
|
279
|
+
"aho-corasick",
|
|
280
|
+
"memchr",
|
|
281
|
+
"regex-automata",
|
|
282
|
+
"regex-syntax",
|
|
283
|
+
]
|
|
284
|
+
|
|
285
|
+
[[package]]
|
|
286
|
+
name = "regex-automata"
|
|
287
|
+
version = "0.4.14"
|
|
288
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
289
|
+
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
|
|
290
|
+
dependencies = [
|
|
291
|
+
"aho-corasick",
|
|
292
|
+
"memchr",
|
|
293
|
+
"regex-syntax",
|
|
294
|
+
]
|
|
295
|
+
|
|
296
|
+
[[package]]
|
|
297
|
+
name = "regex-syntax"
|
|
298
|
+
version = "0.8.11"
|
|
299
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
300
|
+
checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
|
|
301
|
+
|
|
302
|
+
[[package]]
|
|
303
|
+
name = "rustversion"
|
|
304
|
+
version = "1.0.22"
|
|
305
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
306
|
+
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
|
307
|
+
|
|
308
|
+
[[package]]
|
|
309
|
+
name = "siphasher"
|
|
310
|
+
version = "1.0.3"
|
|
311
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
312
|
+
checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649"
|
|
313
|
+
|
|
314
|
+
[[package]]
|
|
315
|
+
name = "syn"
|
|
316
|
+
version = "2.0.117"
|
|
317
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
318
|
+
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
|
|
319
|
+
dependencies = [
|
|
320
|
+
"proc-macro2",
|
|
321
|
+
"quote",
|
|
322
|
+
"unicode-ident",
|
|
323
|
+
]
|
|
324
|
+
|
|
325
|
+
[[package]]
|
|
326
|
+
name = "target-lexicon"
|
|
327
|
+
version = "0.12.16"
|
|
328
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
329
|
+
checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
|
|
330
|
+
|
|
331
|
+
[[package]]
|
|
332
|
+
name = "unicode-ident"
|
|
333
|
+
version = "1.0.24"
|
|
334
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
335
|
+
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
|
336
|
+
|
|
337
|
+
[[package]]
|
|
338
|
+
name = "unindent"
|
|
339
|
+
version = "0.2.4"
|
|
340
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
341
|
+
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
|
gotoken-0.1.0/Cargo.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "gotoken"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
description = "Syntax-aware tokenizer for Bash and formal languages"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
|
|
8
|
+
[lib]
|
|
9
|
+
name = "gotoken"
|
|
10
|
+
crate-types = ["rlib", "cdylib"]
|
|
11
|
+
|
|
12
|
+
[features]
|
|
13
|
+
default = ["rayon"]
|
|
14
|
+
rayon = ["dep:rayon"]
|
|
15
|
+
python = ["dep:pyo3"]
|
|
16
|
+
|
|
17
|
+
[dependencies]
|
|
18
|
+
phf = { version = "0.11", features = ["macros"] }
|
|
19
|
+
regex = "1.10"
|
|
20
|
+
rayon = { version = "1.10", optional = true }
|
|
21
|
+
# 0.22.x supporta Python 3.13
|
|
22
|
+
pyo3 = { version = "0.22", features = ["extension-module"], optional = true }
|
|
23
|
+
|
|
24
|
+
[dev-dependencies]
|
|
25
|
+
|
|
26
|
+
[profile.release]
|
|
27
|
+
opt-level = 3
|
|
28
|
+
lto = "fat"
|
|
29
|
+
codegen-units = 1
|
|
30
|
+
panic = "abort"
|
|
31
|
+
strip = "symbols"
|
gotoken-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gotoken
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
7
|
+
Summary: Syntax-aware Bash tokenizer — Rust core, Python bindings
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Python: >=3.9
|
gotoken-0.1.0/patch_lexar.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Fix 2 bug in src/lexer.rs:
|
|
4
|
+
1. try_dquote ora gestisce anche UnclosedQuote (stessa firma di try_squote)
|
|
5
|
+
2. Pattern operator: || e |& devono precedere | nell'alternazione
|
|
6
|
+
3. Il loop principale smista " a try_dquote e ' a try_squote
|
|
7
|
+
|
|
8
|
+
Esegui da ~/gotoken/: python3 patch_lexar.py
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
path = "src/lexer.rs"
|
|
12
|
+
with open(path, "r") as f:
|
|
13
|
+
src = f.read()
|
|
14
|
+
|
|
15
|
+
# ── Fix 1: regex operator — || e |& prima di | ─────────────────────────────
|
|
16
|
+
OLD_REGEX = (
|
|
17
|
+
r' operator: Regex::new(\n'
|
|
18
|
+
r' r"^(?:2>&1|&>>|&>|>>=|<<=|<<<|<<-|<<|>>|>\||\|&|&&|\|\|;;|;;|;|2>>|2>|<>|>>|>|<|&|\|)"\n'
|
|
19
|
+
r' )?,'
|
|
20
|
+
)
|
|
21
|
+
NEW_REGEX = (
|
|
22
|
+
r' operator: Regex::new(\n'
|
|
23
|
+
r' r"^(?:2>&1|&>>|&>|>>=|<<=|<<<|<<-|<<|>>|>\||&&|\|\||;;|\|&|;|2>>|2>|<>|>|<|&|\|)"\n'
|
|
24
|
+
r' )?,'
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Usiamo replace su stringa letterale
|
|
28
|
+
OLD_REGEX_LIT = ''' operator: Regex::new(
|
|
29
|
+
r"^(?:2>&1|&>>|&>|>>=|<<=|<<<|<<-|<<|>>|>\\|\\|&|&&|\\|\\|;;|;;|;|2>>|2>|<>|>>|>|<|&|\\|)"
|
|
30
|
+
)?,'''
|
|
31
|
+
|
|
32
|
+
# Leggiamo la riga esatta dal file
|
|
33
|
+
import re
|
|
34
|
+
op_match = re.search(r'operator: Regex::new\(\s*r"([^"]+)"', src)
|
|
35
|
+
if op_match:
|
|
36
|
+
old_pattern = op_match.group(1)
|
|
37
|
+
print(f"Pattern operatore trovato: {old_pattern}")
|
|
38
|
+
# Nuova alternazione: || e |& prima del singolo |
|
|
39
|
+
new_pattern = r"^(?:2>&1|&>>|&>|>>=|<<=|<<<|<<-|<<|>>|>\||&&|\|\||;;|\|&|;|2>>|2>|<>|>|<|&|\|)"
|
|
40
|
+
src = src.replace(
|
|
41
|
+
f'r"{old_pattern}"',
|
|
42
|
+
f'r"{new_pattern}"',
|
|
43
|
+
1
|
|
44
|
+
)
|
|
45
|
+
print("✓ Fix 1 applicato: priorità || e |& corretta nel pattern operator")
|
|
46
|
+
else:
|
|
47
|
+
print("WARN: pattern operator non trovato, Fix 1 saltato")
|
|
48
|
+
|
|
49
|
+
# ── Fix 2: try_dquote ora ritorna Option<Result<Token>> ────────────────────
|
|
50
|
+
|
|
51
|
+
OLD_DQUOTE = ''' /// Double-quoted string. Returns `None` if no opening `"` is present.
|
|
52
|
+
/// If `"` is present but unclosed the regex simply won\'t match, so we
|
|
53
|
+
/// detect it here and return an error via the `ZeroLengthFallback`
|
|
54
|
+
/// guard path — actually we surface it as `UnclosedQuote`.
|
|
55
|
+
fn try_dquote(&self, tail: &str, offset: usize) -> Option<Token> {
|
|
56
|
+
if !tail.starts_with(\'"\') {
|
|
57
|
+
return None;
|
|
58
|
+
}
|
|
59
|
+
// If the regex matches we have a closed string.
|
|
60
|
+
if let Some(m) = self.patterns.dquote.find(tail) {
|
|
61
|
+
let text = &tail[m.start()..m.end()];
|
|
62
|
+
return Some(Token::new(text, TokenKind::StringLiteral, offset));
|
|
63
|
+
}
|
|
64
|
+
// `"` found but no closing quote — we return a sentinel token whose
|
|
65
|
+
// `kind` is set to a special value; the caller checks this and
|
|
66
|
+
// converts it to an error via `transpose()`.
|
|
67
|
+
// We encode the error as a `ByteFallback` with len 0 as a signal —
|
|
68
|
+
// the caller sees `ZeroLengthFallback` ... actually cleaner to use
|
|
69
|
+
// a dedicated approach: return the error inline.
|
|
70
|
+
// We exploit the Option<Result<Token>> path: NOT this function.
|
|
71
|
+
// This function stays Option<Token>; unclosed " is caught below.
|
|
72
|
+
None
|
|
73
|
+
}'''
|
|
74
|
+
|
|
75
|
+
NEW_DQUOTE = ''' /// Double-quoted string. Returns `Option<Result<Token>>` so that
|
|
76
|
+
/// `UnclosedQuote` can bubble up when the closing `"` is missing.
|
|
77
|
+
fn try_dquote(
|
|
78
|
+
&self,
|
|
79
|
+
tail: &str,
|
|
80
|
+
offset: usize,
|
|
81
|
+
) -> Option<std::result::Result<Token, TokenizerError>> {
|
|
82
|
+
if !tail.starts_with(\'"\') {
|
|
83
|
+
return None;
|
|
84
|
+
}
|
|
85
|
+
if let Some(m) = self.patterns.dquote.find(tail) {
|
|
86
|
+
let text = &tail[m.start()..m.end()];
|
|
87
|
+
return Some(Ok(Token::new(text, TokenKind::StringLiteral, offset)));
|
|
88
|
+
}
|
|
89
|
+
// Opening `"` found but no closing quote.
|
|
90
|
+
Some(Err(TokenizerError::UnclosedQuote {
|
|
91
|
+
quote_char: \'"\',
|
|
92
|
+
opened_at: offset,
|
|
93
|
+
}))
|
|
94
|
+
}'''
|
|
95
|
+
|
|
96
|
+
if OLD_DQUOTE in src:
|
|
97
|
+
src = src.replace(OLD_DQUOTE, NEW_DQUOTE, 1)
|
|
98
|
+
print("✓ Fix 2 applicato: try_dquote ora ritorna Option<Result<Token>>")
|
|
99
|
+
else:
|
|
100
|
+
print("WARN: corpo try_dquote non trovato esattamente — applico fix alternativo")
|
|
101
|
+
# Approccio alternativo: cerca e rimpiazza con regex
|
|
102
|
+
src = re.sub(
|
|
103
|
+
r'fn try_dquote\(&self, tail: &str, offset: usize\) -> Option<Token> \{[^}]+\}',
|
|
104
|
+
'''fn try_dquote(
|
|
105
|
+
&self,
|
|
106
|
+
tail: &str,
|
|
107
|
+
offset: usize,
|
|
108
|
+
) -> Option<std::result::Result<Token, TokenizerError>> {
|
|
109
|
+
if !tail.starts_with(\'"\') {
|
|
110
|
+
return None;
|
|
111
|
+
}
|
|
112
|
+
if let Some(m) = self.patterns.dquote.find(tail) {
|
|
113
|
+
let text = &tail[m.start()..m.end()];
|
|
114
|
+
return Some(Ok(Token::new(text, TokenKind::StringLiteral, offset)));
|
|
115
|
+
}
|
|
116
|
+
Some(Err(TokenizerError::UnclosedQuote {
|
|
117
|
+
quote_char: \'"\',
|
|
118
|
+
opened_at: offset,
|
|
119
|
+
}))
|
|
120
|
+
}''',
|
|
121
|
+
src,
|
|
122
|
+
count=1,
|
|
123
|
+
flags=re.DOTALL
|
|
124
|
+
)
|
|
125
|
+
print("✓ Fix 2 applicato via regex fallback")
|
|
126
|
+
|
|
127
|
+
# ── Fix 3: loop principale — smista " a try_dquote, ' a try_squote ─────────
|
|
128
|
+
|
|
129
|
+
OLD_LOOP = ''' } else if tail.starts_with(\'"\') || tail.starts_with(\'\\'\') {
|
|
130
|
+
match self.try_squote(tail, cursor, input) {
|
|
131
|
+
Some(Ok(t)) => t,
|
|
132
|
+
Some(Err(e)) => return Err(e),
|
|
133
|
+
None => self.byte_fallback(tail, cursor),
|
|
134
|
+
}'''
|
|
135
|
+
|
|
136
|
+
NEW_LOOP = ''' } else if tail.starts_with(\'"\') {
|
|
137
|
+
match self.try_dquote(tail, cursor) {
|
|
138
|
+
Some(Ok(t)) => t,
|
|
139
|
+
Some(Err(e)) => return Err(e),
|
|
140
|
+
None => self.byte_fallback(tail, cursor),
|
|
141
|
+
}
|
|
142
|
+
} else if tail.starts_with(\'\\'\') {
|
|
143
|
+
match self.try_squote(tail, cursor, input) {
|
|
144
|
+
Some(Ok(t)) => t,
|
|
145
|
+
Some(Err(e)) => return Err(e),
|
|
146
|
+
None => self.byte_fallback(tail, cursor),
|
|
147
|
+
}'''
|
|
148
|
+
|
|
149
|
+
if OLD_LOOP in src:
|
|
150
|
+
src = src.replace(OLD_LOOP, NEW_LOOP, 1)
|
|
151
|
+
print("✓ Fix 3 applicato: loop smista \" a try_dquote e \\' a try_squote")
|
|
152
|
+
else:
|
|
153
|
+
print("WARN: pattern loop non trovato esattamente")
|
|
154
|
+
# Regex fallback
|
|
155
|
+
src = re.sub(
|
|
156
|
+
r"} else if tail\.starts_with\('\"'\) \|\| tail\.starts_with\('\\\\'\) \{.*?match self\.try_squote\(tail, cursor, input\) \{.*?None\s*=> self\.byte_fallback\(tail, cursor\),\s*\}",
|
|
157
|
+
"""} else if tail.starts_with('"') {
|
|
158
|
+
match self.try_dquote(tail, cursor) {
|
|
159
|
+
Some(Ok(t)) => t,
|
|
160
|
+
Some(Err(e)) => return Err(e),
|
|
161
|
+
None => self.byte_fallback(tail, cursor),
|
|
162
|
+
}
|
|
163
|
+
} else if tail.starts_with('\\'') {
|
|
164
|
+
match self.try_squote(tail, cursor, input) {
|
|
165
|
+
Some(Ok(t)) => t,
|
|
166
|
+
Some(Err(e)) => return Err(e),
|
|
167
|
+
None => self.byte_fallback(tail, cursor),
|
|
168
|
+
}""",
|
|
169
|
+
src,
|
|
170
|
+
count=1,
|
|
171
|
+
flags=re.DOTALL
|
|
172
|
+
)
|
|
173
|
+
print("✓ Fix 3 applicato via regex fallback")
|
|
174
|
+
|
|
175
|
+
# ── Fix 4: rimuovi il check " da try_squote (ora lo gestisce try_dquote) ───
|
|
176
|
+
|
|
177
|
+
OLD_SQUOTE_DQUOTE_CHECK = ''' if tail.starts_with(\'"\') {
|
|
178
|
+
// Double-quote unclosed detection (try_dquote returned None above)
|
|
179
|
+
if self.patterns.dquote.find(tail).is_none() {
|
|
180
|
+
return Some(Err(TokenizerError::UnclosedQuote {
|
|
181
|
+
quote_char: \'"\',
|
|
182
|
+
opened_at: offset,
|
|
183
|
+
}));
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
if !tail.starts_with(\'\\'\') {'''
|
|
187
|
+
|
|
188
|
+
NEW_SQUOTE_START = ''' if !tail.starts_with(\'\\'\') {'''
|
|
189
|
+
|
|
190
|
+
if OLD_SQUOTE_DQUOTE_CHECK in src:
|
|
191
|
+
src = src.replace(OLD_SQUOTE_DQUOTE_CHECK, NEW_SQUOTE_START, 1)
|
|
192
|
+
print("✓ Fix 4 applicato: rimosso check \" duplicato da try_squote")
|
|
193
|
+
else:
|
|
194
|
+
print("WARN: check \" in try_squote non trovato — potrebbe essere già rimosso")
|
|
195
|
+
|
|
196
|
+
with open(path, "w") as f:
|
|
197
|
+
f.write(src)
|
|
198
|
+
|
|
199
|
+
print("\nTutti i fix applicati. Esegui: cargo test 2>&1")
|
gotoken-0.1.0/patch_lib_python.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Aggiunge i binding Python a src/lib.rs.
|
|
4
|
+
Esegui da ~/gotoken/: python3 patch_lib_python.py
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
path = "src/lib.rs"
|
|
8
|
+
with open(path, "r") as f:
|
|
9
|
+
src = f.read()
|
|
10
|
+
|
|
11
|
+
BLOCK = '''
|
|
12
|
+
// ── Python bindings (compilati solo con --features python) ───────────────────
|
|
13
|
+
#[cfg(feature = "python")]
|
|
14
|
+
pub mod python;
|
|
15
|
+
|
|
16
|
+
#[cfg(feature = "python")]
|
|
17
|
+
use pyo3::prelude::*;
|
|
18
|
+
|
|
19
|
+
/// Entry-point del modulo Python `gotoken`.
|
|
20
|
+
/// Chiamato automaticamente da maturin quando Python importa la libreria.
|
|
21
|
+
#[cfg(feature = "python")]
|
|
22
|
+
#[pymodule]
|
|
23
|
+
fn gotoken(m: &Bound<\'_, PyModule>) -> PyResult<()> {
|
|
24
|
+
python::gotoken(m)
|
|
25
|
+
}
|
|
26
|
+
'''
|
|
27
|
+
|
|
28
|
+
MARKER = "pub mod vocab;"
|
|
29
|
+
|
|
30
|
+
if "pub mod python;" in src:
|
|
31
|
+
print("Binding Python già presenti in lib.rs — nessuna modifica necessaria.")
|
|
32
|
+
else:
|
|
33
|
+
if MARKER not in src:
|
|
34
|
+
print(f"ERRORE: marker '{MARKER}' non trovato in lib.rs")
|
|
35
|
+
raise SystemExit(1)
|
|
36
|
+
src = src.replace(MARKER, MARKER + BLOCK, 1)
|
|
37
|
+
with open(path, "w") as f:
|
|
38
|
+
f.write(src)
|
|
39
|
+
print("✓ Binding Python aggiunti a lib.rs")
|
|
40
|
+
|
|
41
|
+
print("Ora esegui:")
|
|
42
|
+
print(" pip install maturin")
|
|
43
|
+
print(" maturin develop --features python,rayon")
|
|
44
|
+
print(" python3 test_gotoken.py")
|
gotoken-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["maturin>=1.5,<2.0"]
|
|
3
|
+
build-backend = "maturin"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gotoken"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Syntax-aware Bash tokenizer — Rust core, Python bindings"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Programming Language :: Rust",
|
|
13
|
+
"Programming Language :: Python :: Implementation :: CPython",
|
|
14
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[tool.maturin]
|
|
18
|
+
# Attiva la feature "python" di Cargo che include pyo3
|
|
19
|
+
features = ["python", "rayon"]
|
|
20
|
+
# Funzione di entry-point del modulo PyO3 (deve matchare #[pymodule] fn gotoken)
|
|
21
|
+
module-name = "gotoken"
|
|
22
|
+
# Cartella sorgente Rust
|
|
23
|
+
manifest-path = "Cargo.toml"
|