@lokascript/semantic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435)
  1. package/LICENSE +21 -0
  2. package/README.md +686 -0
  3. package/dist/browser-ar.ar.global.js +2 -0
  4. package/dist/browser-core.core.global.js +2 -0
  5. package/dist/browser-de.de.global.js +2 -0
  6. package/dist/browser-east-asian.east-asian.global.js +2 -0
  7. package/dist/browser-en-tr.en-tr.global.js +2 -0
  8. package/dist/browser-en.en.global.js +2 -0
  9. package/dist/browser-es-en.es-en.global.js +2 -0
  10. package/dist/browser-es.es.global.js +2 -0
  11. package/dist/browser-fr.fr.global.js +2 -0
  12. package/dist/browser-id.id.global.js +2 -0
  13. package/dist/browser-ja.ja.global.js +2 -0
  14. package/dist/browser-ko.ko.global.js +2 -0
  15. package/dist/browser-lazy.lazy.global.js +2 -0
  16. package/dist/browser-priority.priority.global.js +2 -0
  17. package/dist/browser-pt.pt.global.js +2 -0
  18. package/dist/browser-qu.qu.global.js +2 -0
  19. package/dist/browser-sw.sw.global.js +2 -0
  20. package/dist/browser-tr.tr.global.js +2 -0
  21. package/dist/browser-western.western.global.js +2 -0
  22. package/dist/browser-zh.zh.global.js +2 -0
  23. package/dist/browser.global.js +3 -0
  24. package/dist/browser.global.js.map +1 -0
  25. package/dist/index.cjs +35051 -0
  26. package/dist/index.cjs.map +1 -0
  27. package/dist/index.d.cts +3426 -0
  28. package/dist/index.d.ts +3426 -0
  29. package/dist/index.js +34890 -0
  30. package/dist/index.js.map +1 -0
  31. package/dist/languages/ar.d.ts +78 -0
  32. package/dist/languages/ar.js +1622 -0
  33. package/dist/languages/ar.js.map +1 -0
  34. package/dist/languages/de.d.ts +38 -0
  35. package/dist/languages/de.js +1168 -0
  36. package/dist/languages/de.js.map +1 -0
  37. package/dist/languages/en.d.ts +44 -0
  38. package/dist/languages/en.js +3491 -0
  39. package/dist/languages/en.js.map +1 -0
  40. package/dist/languages/es.d.ts +52 -0
  41. package/dist/languages/es.js +1493 -0
  42. package/dist/languages/es.js.map +1 -0
  43. package/dist/languages/fr.d.ts +37 -0
  44. package/dist/languages/fr.js +1159 -0
  45. package/dist/languages/fr.js.map +1 -0
  46. package/dist/languages/id.d.ts +35 -0
  47. package/dist/languages/id.js +1152 -0
  48. package/dist/languages/id.js.map +1 -0
  49. package/dist/languages/ja.d.ts +53 -0
  50. package/dist/languages/ja.js +1430 -0
  51. package/dist/languages/ja.js.map +1 -0
  52. package/dist/languages/ko.d.ts +51 -0
  53. package/dist/languages/ko.js +1729 -0
  54. package/dist/languages/ko.js.map +1 -0
  55. package/dist/languages/pt.d.ts +37 -0
  56. package/dist/languages/pt.js +1127 -0
  57. package/dist/languages/pt.js.map +1 -0
  58. package/dist/languages/qu.d.ts +36 -0
  59. package/dist/languages/qu.js +1143 -0
  60. package/dist/languages/qu.js.map +1 -0
  61. package/dist/languages/sw.d.ts +35 -0
  62. package/dist/languages/sw.js +1147 -0
  63. package/dist/languages/sw.js.map +1 -0
  64. package/dist/languages/tr.d.ts +45 -0
  65. package/dist/languages/tr.js +1529 -0
  66. package/dist/languages/tr.js.map +1 -0
  67. package/dist/languages/zh.d.ts +58 -0
  68. package/dist/languages/zh.js +1257 -0
  69. package/dist/languages/zh.js.map +1 -0
  70. package/dist/types-C4dcj53L.d.ts +600 -0
  71. package/package.json +202 -0
  72. package/src/__test-utils__/index.ts +7 -0
  73. package/src/__test-utils__/test-helpers.ts +8 -0
  74. package/src/__types__/test-helpers.ts +122 -0
  75. package/src/analysis/index.ts +479 -0
  76. package/src/ast-builder/command-mappers.ts +1133 -0
  77. package/src/ast-builder/expression-parser/index.ts +41 -0
  78. package/src/ast-builder/expression-parser/parser.ts +563 -0
  79. package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
  80. package/src/ast-builder/expression-parser/types.ts +208 -0
  81. package/src/ast-builder/index.ts +536 -0
  82. package/src/ast-builder/value-converters.ts +172 -0
  83. package/src/bridge.ts +275 -0
  84. package/src/browser-ar.ts +162 -0
  85. package/src/browser-core.ts +231 -0
  86. package/src/browser-de.ts +162 -0
  87. package/src/browser-east-asian.ts +173 -0
  88. package/src/browser-en-tr.ts +165 -0
  89. package/src/browser-en.ts +157 -0
  90. package/src/browser-es-en.ts +200 -0
  91. package/src/browser-es.ts +170 -0
  92. package/src/browser-fr.ts +162 -0
  93. package/src/browser-id.ts +162 -0
  94. package/src/browser-ja.ts +162 -0
  95. package/src/browser-ko.ts +162 -0
  96. package/src/browser-lazy.ts +189 -0
  97. package/src/browser-priority.ts +214 -0
  98. package/src/browser-pt.ts +162 -0
  99. package/src/browser-qu.ts +162 -0
  100. package/src/browser-sw.ts +162 -0
  101. package/src/browser-tr.ts +162 -0
  102. package/src/browser-western.ts +181 -0
  103. package/src/browser-zh.ts +162 -0
  104. package/src/browser.ts +268 -0
  105. package/src/cache/index.ts +14 -0
  106. package/src/cache/semantic-cache.ts +344 -0
  107. package/src/core-bridge.ts +372 -0
  108. package/src/explicit/converter.ts +258 -0
  109. package/src/explicit/index.ts +18 -0
  110. package/src/explicit/parser.ts +236 -0
  111. package/src/explicit/renderer.ts +424 -0
  112. package/src/generators/command-schemas.ts +1636 -0
  113. package/src/generators/event-handler-generator.ts +109 -0
  114. package/src/generators/index.ts +117 -0
  115. package/src/generators/language-profiles.ts +139 -0
  116. package/src/generators/pattern-generator.ts +537 -0
  117. package/src/generators/profiles/arabic.ts +131 -0
  118. package/src/generators/profiles/bengali.ts +132 -0
  119. package/src/generators/profiles/chinese.ts +124 -0
  120. package/src/generators/profiles/english.ts +113 -0
  121. package/src/generators/profiles/french.ts +125 -0
  122. package/src/generators/profiles/german.ts +126 -0
  123. package/src/generators/profiles/hindi.ts +146 -0
  124. package/src/generators/profiles/index.ts +46 -0
  125. package/src/generators/profiles/indonesian.ts +125 -0
  126. package/src/generators/profiles/italian.ts +139 -0
  127. package/src/generators/profiles/japanese.ts +149 -0
  128. package/src/generators/profiles/korean.ts +127 -0
  129. package/src/generators/profiles/marker-templates.ts +288 -0
  130. package/src/generators/profiles/ms.ts +130 -0
  131. package/src/generators/profiles/polish.ts +249 -0
  132. package/src/generators/profiles/portuguese.ts +115 -0
  133. package/src/generators/profiles/quechua.ts +113 -0
  134. package/src/generators/profiles/russian.ts +260 -0
  135. package/src/generators/profiles/spanish.ts +130 -0
  136. package/src/generators/profiles/swahili.ts +129 -0
  137. package/src/generators/profiles/thai.ts +132 -0
  138. package/src/generators/profiles/tl.ts +128 -0
  139. package/src/generators/profiles/turkish.ts +124 -0
  140. package/src/generators/profiles/types.ts +165 -0
  141. package/src/generators/profiles/ukrainian.ts +270 -0
  142. package/src/generators/profiles/vietnamese.ts +133 -0
  143. package/src/generators/schema-error-codes.ts +160 -0
  144. package/src/generators/schema-validator.ts +391 -0
  145. package/src/index.ts +429 -0
  146. package/src/language-building-schema.ts +3170 -0
  147. package/src/language-loader.ts +394 -0
  148. package/src/languages/_all.ts +65 -0
  149. package/src/languages/ar.ts +15 -0
  150. package/src/languages/bn.ts +16 -0
  151. package/src/languages/de.ts +15 -0
  152. package/src/languages/en.ts +29 -0
  153. package/src/languages/es.ts +15 -0
  154. package/src/languages/fr.ts +15 -0
  155. package/src/languages/hi.ts +26 -0
  156. package/src/languages/id.ts +15 -0
  157. package/src/languages/index.ts +18 -0
  158. package/src/languages/it.ts +15 -0
  159. package/src/languages/ja.ts +15 -0
  160. package/src/languages/ko.ts +15 -0
  161. package/src/languages/ms.ts +16 -0
  162. package/src/languages/pl.ts +18 -0
  163. package/src/languages/pt.ts +15 -0
  164. package/src/languages/qu.ts +15 -0
  165. package/src/languages/ru.ts +26 -0
  166. package/src/languages/sw.ts +15 -0
  167. package/src/languages/th.ts +16 -0
  168. package/src/languages/tl.ts +16 -0
  169. package/src/languages/tr.ts +15 -0
  170. package/src/languages/uk.ts +26 -0
  171. package/src/languages/vi.ts +16 -0
  172. package/src/languages/zh.ts +15 -0
  173. package/src/parser/index.ts +15 -0
  174. package/src/parser/pattern-matcher.ts +1181 -0
  175. package/src/parser/semantic-parser.ts +573 -0
  176. package/src/parser/utils/index.ts +35 -0
  177. package/src/parser/utils/marker-resolution.ts +111 -0
  178. package/src/parser/utils/possessive-keywords.ts +43 -0
  179. package/src/parser/utils/role-positioning.ts +70 -0
  180. package/src/parser/utils/type-validation.ts +134 -0
  181. package/src/patterns/add/ar.ts +71 -0
  182. package/src/patterns/add/bn.ts +70 -0
  183. package/src/patterns/add/hi.ts +69 -0
  184. package/src/patterns/add/index.ts +87 -0
  185. package/src/patterns/add/it.ts +61 -0
  186. package/src/patterns/add/ja.ts +93 -0
  187. package/src/patterns/add/ko.ts +74 -0
  188. package/src/patterns/add/ms.ts +30 -0
  189. package/src/patterns/add/pl.ts +62 -0
  190. package/src/patterns/add/ru.ts +62 -0
  191. package/src/patterns/add/th.ts +49 -0
  192. package/src/patterns/add/tl.ts +30 -0
  193. package/src/patterns/add/tr.ts +71 -0
  194. package/src/patterns/add/uk.ts +62 -0
  195. package/src/patterns/add/vi.ts +61 -0
  196. package/src/patterns/add/zh.ts +71 -0
  197. package/src/patterns/builders.ts +207 -0
  198. package/src/patterns/decrement/bn.ts +70 -0
  199. package/src/patterns/decrement/de.ts +42 -0
  200. package/src/patterns/decrement/hi.ts +68 -0
  201. package/src/patterns/decrement/index.ts +79 -0
  202. package/src/patterns/decrement/it.ts +69 -0
  203. package/src/patterns/decrement/ms.ts +30 -0
  204. package/src/patterns/decrement/pl.ts +58 -0
  205. package/src/patterns/decrement/ru.ts +58 -0
  206. package/src/patterns/decrement/th.ts +49 -0
  207. package/src/patterns/decrement/tl.ts +30 -0
  208. package/src/patterns/decrement/tr.ts +48 -0
  209. package/src/patterns/decrement/uk.ts +58 -0
  210. package/src/patterns/decrement/vi.ts +61 -0
  211. package/src/patterns/decrement/zh.ts +32 -0
  212. package/src/patterns/en.ts +302 -0
  213. package/src/patterns/event-handler/ar.ts +151 -0
  214. package/src/patterns/event-handler/bn.ts +72 -0
  215. package/src/patterns/event-handler/de.ts +117 -0
  216. package/src/patterns/event-handler/en.ts +117 -0
  217. package/src/patterns/event-handler/es.ts +136 -0
  218. package/src/patterns/event-handler/fr.ts +117 -0
  219. package/src/patterns/event-handler/hi.ts +64 -0
  220. package/src/patterns/event-handler/id.ts +117 -0
  221. package/src/patterns/event-handler/index.ts +119 -0
  222. package/src/patterns/event-handler/it.ts +54 -0
  223. package/src/patterns/event-handler/ja.ts +118 -0
  224. package/src/patterns/event-handler/ko.ts +133 -0
  225. package/src/patterns/event-handler/ms.ts +30 -0
  226. package/src/patterns/event-handler/pl.ts +62 -0
  227. package/src/patterns/event-handler/pt.ts +117 -0
  228. package/src/patterns/event-handler/qu.ts +66 -0
  229. package/src/patterns/event-handler/ru.ts +62 -0
  230. package/src/patterns/event-handler/shared.ts +270 -0
  231. package/src/patterns/event-handler/sw.ts +117 -0
  232. package/src/patterns/event-handler/th.ts +53 -0
  233. package/src/patterns/event-handler/tl.ts +30 -0
  234. package/src/patterns/event-handler/tr.ts +170 -0
  235. package/src/patterns/event-handler/uk.ts +62 -0
  236. package/src/patterns/event-handler/vi.ts +61 -0
  237. package/src/patterns/event-handler/zh.ts +150 -0
  238. package/src/patterns/get/ar.ts +49 -0
  239. package/src/patterns/get/bn.ts +47 -0
  240. package/src/patterns/get/de.ts +32 -0
  241. package/src/patterns/get/hi.ts +52 -0
  242. package/src/patterns/get/index.ts +83 -0
  243. package/src/patterns/get/it.ts +56 -0
  244. package/src/patterns/get/ja.ts +53 -0
  245. package/src/patterns/get/ko.ts +53 -0
  246. package/src/patterns/get/ms.ts +30 -0
  247. package/src/patterns/get/pl.ts +57 -0
  248. package/src/patterns/get/ru.ts +57 -0
  249. package/src/patterns/get/th.ts +29 -0
  250. package/src/patterns/get/tl.ts +30 -0
  251. package/src/patterns/get/uk.ts +57 -0
  252. package/src/patterns/get/vi.ts +48 -0
  253. package/src/patterns/grammar-transformed/index.ts +39 -0
  254. package/src/patterns/grammar-transformed/ja.ts +1713 -0
  255. package/src/patterns/grammar-transformed/ko.ts +1311 -0
  256. package/src/patterns/grammar-transformed/tr.ts +1067 -0
  257. package/src/patterns/hide/ar.ts +67 -0
  258. package/src/patterns/hide/bn.ts +47 -0
  259. package/src/patterns/hide/de.ts +36 -0
  260. package/src/patterns/hide/hi.ts +61 -0
  261. package/src/patterns/hide/index.ts +91 -0
  262. package/src/patterns/hide/it.ts +56 -0
  263. package/src/patterns/hide/ja.ts +69 -0
  264. package/src/patterns/hide/ko.ts +69 -0
  265. package/src/patterns/hide/ms.ts +30 -0
  266. package/src/patterns/hide/pl.ts +57 -0
  267. package/src/patterns/hide/ru.ts +57 -0
  268. package/src/patterns/hide/th.ts +29 -0
  269. package/src/patterns/hide/tl.ts +30 -0
  270. package/src/patterns/hide/tr.ts +65 -0
  271. package/src/patterns/hide/uk.ts +57 -0
  272. package/src/patterns/hide/vi.ts +56 -0
  273. package/src/patterns/hide/zh.ts +68 -0
  274. package/src/patterns/increment/bn.ts +70 -0
  275. package/src/patterns/increment/de.ts +36 -0
  276. package/src/patterns/increment/hi.ts +68 -0
  277. package/src/patterns/increment/index.ts +79 -0
  278. package/src/patterns/increment/it.ts +69 -0
  279. package/src/patterns/increment/ms.ts +30 -0
  280. package/src/patterns/increment/pl.ts +58 -0
  281. package/src/patterns/increment/ru.ts +58 -0
  282. package/src/patterns/increment/th.ts +49 -0
  283. package/src/patterns/increment/tl.ts +30 -0
  284. package/src/patterns/increment/tr.ts +52 -0
  285. package/src/patterns/increment/uk.ts +58 -0
  286. package/src/patterns/increment/vi.ts +61 -0
  287. package/src/patterns/increment/zh.ts +32 -0
  288. package/src/patterns/index.ts +84 -0
  289. package/src/patterns/languages/en/control-flow.ts +93 -0
  290. package/src/patterns/languages/en/fetch.ts +62 -0
  291. package/src/patterns/languages/en/index.ts +42 -0
  292. package/src/patterns/languages/en/repeat.ts +67 -0
  293. package/src/patterns/languages/en/set.ts +48 -0
  294. package/src/patterns/languages/en/swap.ts +38 -0
  295. package/src/patterns/languages/en/temporal.ts +57 -0
  296. package/src/patterns/put/ar.ts +74 -0
  297. package/src/patterns/put/bn.ts +53 -0
  298. package/src/patterns/put/en.ts +74 -0
  299. package/src/patterns/put/es.ts +74 -0
  300. package/src/patterns/put/hi.ts +69 -0
  301. package/src/patterns/put/id.ts +96 -0
  302. package/src/patterns/put/index.ts +99 -0
  303. package/src/patterns/put/it.ts +56 -0
  304. package/src/patterns/put/ja.ts +75 -0
  305. package/src/patterns/put/ko.ts +67 -0
  306. package/src/patterns/put/ms.ts +30 -0
  307. package/src/patterns/put/pl.ts +81 -0
  308. package/src/patterns/put/ru.ts +85 -0
  309. package/src/patterns/put/th.ts +32 -0
  310. package/src/patterns/put/tl.ts +30 -0
  311. package/src/patterns/put/tr.ts +67 -0
  312. package/src/patterns/put/uk.ts +85 -0
  313. package/src/patterns/put/vi.ts +72 -0
  314. package/src/patterns/put/zh.ts +62 -0
  315. package/src/patterns/registry.ts +163 -0
  316. package/src/patterns/remove/ar.ts +71 -0
  317. package/src/patterns/remove/bn.ts +68 -0
  318. package/src/patterns/remove/hi.ts +69 -0
  319. package/src/patterns/remove/index.ts +87 -0
  320. package/src/patterns/remove/it.ts +69 -0
  321. package/src/patterns/remove/ja.ts +74 -0
  322. package/src/patterns/remove/ko.ts +78 -0
  323. package/src/patterns/remove/ms.ts +30 -0
  324. package/src/patterns/remove/pl.ts +62 -0
  325. package/src/patterns/remove/ru.ts +62 -0
  326. package/src/patterns/remove/th.ts +49 -0
  327. package/src/patterns/remove/tl.ts +30 -0
  328. package/src/patterns/remove/tr.ts +78 -0
  329. package/src/patterns/remove/uk.ts +62 -0
  330. package/src/patterns/remove/vi.ts +61 -0
  331. package/src/patterns/remove/zh.ts +72 -0
  332. package/src/patterns/set/ar.ts +84 -0
  333. package/src/patterns/set/bn.ts +53 -0
  334. package/src/patterns/set/de.ts +84 -0
  335. package/src/patterns/set/es.ts +92 -0
  336. package/src/patterns/set/fr.ts +88 -0
  337. package/src/patterns/set/hi.ts +56 -0
  338. package/src/patterns/set/id.ts +84 -0
  339. package/src/patterns/set/index.ts +107 -0
  340. package/src/patterns/set/it.ts +56 -0
  341. package/src/patterns/set/ja.ts +86 -0
  342. package/src/patterns/set/ko.ts +85 -0
  343. package/src/patterns/set/ms.ts +30 -0
  344. package/src/patterns/set/pl.ts +57 -0
  345. package/src/patterns/set/pt.ts +84 -0
  346. package/src/patterns/set/ru.ts +57 -0
  347. package/src/patterns/set/th.ts +31 -0
  348. package/src/patterns/set/tl.ts +30 -0
  349. package/src/patterns/set/tr.ts +107 -0
  350. package/src/patterns/set/uk.ts +57 -0
  351. package/src/patterns/set/vi.ts +53 -0
  352. package/src/patterns/set/zh.ts +84 -0
  353. package/src/patterns/show/ar.ts +67 -0
  354. package/src/patterns/show/bn.ts +47 -0
  355. package/src/patterns/show/de.ts +32 -0
  356. package/src/patterns/show/fr.ts +32 -0
  357. package/src/patterns/show/hi.ts +61 -0
  358. package/src/patterns/show/index.ts +95 -0
  359. package/src/patterns/show/it.ts +56 -0
  360. package/src/patterns/show/ja.ts +69 -0
  361. package/src/patterns/show/ko.ts +73 -0
  362. package/src/patterns/show/ms.ts +30 -0
  363. package/src/patterns/show/pl.ts +57 -0
  364. package/src/patterns/show/ru.ts +57 -0
  365. package/src/patterns/show/th.ts +29 -0
  366. package/src/patterns/show/tl.ts +30 -0
  367. package/src/patterns/show/tr.ts +65 -0
  368. package/src/patterns/show/uk.ts +57 -0
  369. package/src/patterns/show/vi.ts +56 -0
  370. package/src/patterns/show/zh.ts +68 -0
  371. package/src/patterns/take/ar.ts +51 -0
  372. package/src/patterns/take/index.ts +31 -0
  373. package/src/patterns/toggle/ar.ts +61 -0
  374. package/src/patterns/toggle/bn.ts +70 -0
  375. package/src/patterns/toggle/en.ts +61 -0
  376. package/src/patterns/toggle/es.ts +61 -0
  377. package/src/patterns/toggle/hi.ts +80 -0
  378. package/src/patterns/toggle/index.ts +95 -0
  379. package/src/patterns/toggle/it.ts +69 -0
  380. package/src/patterns/toggle/ja.ts +156 -0
  381. package/src/patterns/toggle/ko.ts +113 -0
  382. package/src/patterns/toggle/ms.ts +30 -0
  383. package/src/patterns/toggle/pl.ts +62 -0
  384. package/src/patterns/toggle/ru.ts +62 -0
  385. package/src/patterns/toggle/th.ts +50 -0
  386. package/src/patterns/toggle/tl.ts +30 -0
  387. package/src/patterns/toggle/tr.ts +88 -0
  388. package/src/patterns/toggle/uk.ts +62 -0
  389. package/src/patterns/toggle/vi.ts +61 -0
  390. package/src/patterns/toggle/zh.ts +99 -0
  391. package/src/public-api.ts +286 -0
  392. package/src/registry.ts +441 -0
  393. package/src/tokenizers/arabic.ts +723 -0
  394. package/src/tokenizers/base.ts +1300 -0
  395. package/src/tokenizers/bengali.ts +289 -0
  396. package/src/tokenizers/chinese.ts +481 -0
  397. package/src/tokenizers/english.ts +416 -0
  398. package/src/tokenizers/french.ts +326 -0
  399. package/src/tokenizers/german.ts +324 -0
  400. package/src/tokenizers/hindi.ts +319 -0
  401. package/src/tokenizers/index.ts +127 -0
  402. package/src/tokenizers/indonesian.ts +306 -0
  403. package/src/tokenizers/italian.ts +458 -0
  404. package/src/tokenizers/japanese.ts +447 -0
  405. package/src/tokenizers/korean.ts +642 -0
  406. package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
  407. package/src/tokenizers/morphology/french-normalizer.ts +268 -0
  408. package/src/tokenizers/morphology/german-normalizer.ts +256 -0
  409. package/src/tokenizers/morphology/index.ts +46 -0
  410. package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
  411. package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
  412. package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
  413. package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
  414. package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
  415. package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
  416. package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
  417. package/src/tokenizers/morphology/types.ts +211 -0
  418. package/src/tokenizers/ms.ts +198 -0
  419. package/src/tokenizers/polish.ts +354 -0
  420. package/src/tokenizers/portuguese.ts +304 -0
  421. package/src/tokenizers/quechua.ts +339 -0
  422. package/src/tokenizers/russian.ts +375 -0
  423. package/src/tokenizers/spanish.ts +403 -0
  424. package/src/tokenizers/swahili.ts +303 -0
  425. package/src/tokenizers/thai.ts +236 -0
  426. package/src/tokenizers/tl.ts +198 -0
  427. package/src/tokenizers/turkish.ts +411 -0
  428. package/src/tokenizers/ukrainian.ts +369 -0
  429. package/src/tokenizers/vietnamese.ts +410 -0
  430. package/src/types/grammar-types.ts +617 -0
  431. package/src/types/unified-profile.ts +267 -0
  432. package/src/types.ts +709 -0
  433. package/src/utils/confidence-calculator.ts +147 -0
  434. package/src/validators/command-validator.ts +380 -0
  435. package/src/validators/index.ts +15 -0
@@ -0,0 +1,1300 @@
1
+ /**
2
+ * Base Tokenizer
3
+ *
4
+ * Provides the TokenStream implementation and shared tokenization utilities.
5
+ * Language-specific tokenizers extend these base utilities.
6
+ */
7
+
8
+ import type {
9
+ LanguageToken,
10
+ TokenKind,
11
+ TokenStream,
12
+ StreamMark,
13
+ SourcePosition,
14
+ LanguageTokenizer,
15
+ } from '../types';
16
+ import type { MorphologicalNormalizer, NormalizationResult } from './morphology/types';
17
+
18
+ // =============================================================================
19
+ // Time Unit Configuration
20
+ // =============================================================================
21
+
22
/**
 * Describes one native-language time unit and the standard suffix it maps to.
 * Per the original note, tryNumberWithTimeUnits() uses these entries to match
 * language-specific time units.
 */
export interface TimeUnitMapping {
  /** Text to match, e.g. 'segundos' or 'ミリ秒'. */
  readonly pattern: string;
  /** Standard suffix to use for this unit: ms, s, m, or h. */
  readonly suffix: string;
  /** Length of the pattern, kept alongside it as an optimization. */
  readonly length: number;
  /** When set, a word boundary is checked after the pattern. */
  readonly checkBoundary?: boolean;
  /** Character that must not follow the pattern (e.g. 's' after 'm', to avoid reading 'ms' as 'm'). */
  readonly notFollowedBy?: string;
  /** When set, matching is case-insensitive. */
  readonly caseInsensitive?: boolean;
}
40
+
41
+ // =============================================================================
42
+ // Token Stream Implementation
43
+ // =============================================================================
44
+
45
/**
 * Array-backed TokenStream with a movable cursor.
 *
 * The cursor starts at index 0. mark() snapshots it and reset() restores it.
 */
export class TokenStreamImpl implements TokenStream {
  readonly tokens: readonly LanguageToken[];
  readonly language: string;
  private pos = 0;

  constructor(tokens: LanguageToken[], language: string) {
    this.tokens = tokens;
    this.language = language;
  }

  /**
   * Look at the token `offset` places from the cursor without consuming it.
   * Returns null when that index lies outside the token array.
   */
  peek(offset: number = 0): LanguageToken | null {
    const target = this.pos + offset;
    const outOfRange = target < 0 || target >= this.tokens.length;
    return outOfRange ? null : this.tokens[target];
  }

  /**
   * Return the token under the cursor and move the cursor one step forward.
   * Throws when the stream is already at its end.
   */
  advance(): LanguageToken {
    if (this.isAtEnd()) {
      throw new Error('Unexpected end of token stream');
    }
    const current = this.tokens[this.pos];
    this.pos += 1;
    return current;
  }

  /** True once the cursor has reached or passed the last token. */
  isAtEnd(): boolean {
    const total = this.tokens.length;
    return this.pos >= total;
  }

  /** Snapshot the cursor so it can be restored later with reset(). */
  mark(): StreamMark {
    const position = this.pos;
    return { position };
  }

  /** Move the cursor back (or forward) to a previously taken mark. */
  reset(mark: StreamMark): void {
    const { position } = mark;
    this.pos = position;
  }

  /** Current cursor index. */
  position(): number {
    return this.pos;
  }

  /**
   * Copy of every token from the cursor to the end of the stream.
   */
  remaining(): LanguageToken[] {
    const from = this.pos;
    return this.tokens.slice(from);
  }

  /**
   * Consume and collect tokens for as long as the predicate accepts them.
   */
  takeWhile(predicate: (token: LanguageToken) => boolean): LanguageToken[] {
    const taken: LanguageToken[] = [];
    for (;;) {
      if (this.isAtEnd()) {
        break;
      }
      const candidate = this.peek() as LanguageToken;
      if (!predicate(candidate)) {
        break;
      }
      taken.push(this.advance());
    }
    return taken;
  }

  /**
   * Consume and discard tokens for as long as the predicate accepts them.
   */
  skipWhile(predicate: (token: LanguageToken) => boolean): void {
    for (;;) {
      if (this.isAtEnd()) {
        return;
      }
      const candidate = this.peek() as LanguageToken;
      if (!predicate(candidate)) {
        return;
      }
      this.advance();
    }
  }
}
116
+
117
+ // =============================================================================
118
+ // Shared Tokenization Utilities
119
+ // =============================================================================
120
+
121
/**
 * Build a SourcePosition from a start offset and an end offset.
 */
export function createPosition(start: number, end: number): SourcePosition {
  const span: SourcePosition = { start, end };
  return span;
}
127
+
128
/**
 * Options for creating a token with optional morphological data.
 */
export interface CreateTokenOptions {
  /** Explicitly normalized form from keyword map */
  normalized?: string;
  /** Morphologically normalized stem */
  stem?: string;
  /** Confidence in the stem (0.0-1.0) */
  stemConfidence?: number;
}

/**
 * Create a language token.
 *
 * @param value - Token text.
 * @param kind - Token kind.
 * @param position - Source span of the token.
 * @param normalizedOrOptions - Either the normalized form as a plain string
 *   (legacy call style, kept for backward compatibility) or an options object.
 * @returns A token carrying `value`, `kind`, `position`, plus only those
 *   optional fields that were actually supplied. `stemConfidence` is copied
 *   only when `stem` is also defined.
 */
export function createToken(
  value: string,
  kind: TokenKind,
  position: SourcePosition,
  normalizedOrOptions?: string | CreateTokenOptions
): LanguageToken {
  // Legacy call style: fourth argument is the normalized form itself.
  if (typeof normalizedOrOptions === 'string') {
    return { value, kind, position, normalized: normalizedOrOptions };
  }

  // No extra data supplied.
  if (!normalizedOrOptions) {
    return { value, kind, position };
  }

  const { normalized, stem, stemConfidence } = normalizedOrOptions;

  // Conditional spreads leave undefined fields off the token entirely
  // (absent key rather than an `undefined` value), without `as any` casts.
  return {
    value,
    kind,
    position,
    ...(normalized !== undefined ? { normalized } : {}),
    ...(stem !== undefined ? { stem } : {}),
    // A confidence value is meaningless without the stem it refers to.
    ...(stem !== undefined && stemConfidence !== undefined ? { stemConfidence } : {}),
  };
}
175
+
176
/**
 * Report whether the given character is whitespace (matches `\s`).
 */
export function isWhitespace(char: string): boolean {
  const found = /\s/.exec(char);
  return found !== null;
}
182
+
183
/**
 * Report whether the given character is one that can open a CSS selector:
 * `#`, `.`, `[`, `@`, `*`, or `<`.
 * The `<` case covers JSX-style element selectors: <form />, <div>
 */
export function isSelectorStart(char: string): boolean {
  switch (char) {
    case '#':
    case '.':
    case '[':
    case '@':
    case '*':
    case '<':
      return true;
    default:
      return false;
  }
}
192
+
193
/**
 * Report whether the given character is a quote (string delimiter):
 * double quote, single quote, backtick, or either corner bracket.
 */
export function isQuote(char: string): boolean {
  const delimiters: readonly string[] = ['"', "'", '`', '「', '」'];
  return delimiters.indexOf(char) !== -1;
}
199
+
200
/**
 * Report whether the given character is a digit (matches `\d`).
 */
export function isDigit(char: string): boolean {
  const digitPattern = /\d/;
  return digitPattern.test(char);
}
206
+
207
/**
 * Report whether the given character is an ASCII letter (a-z or A-Z).
 */
export function isAsciiLetter(char: string): boolean {
  const hit = /[a-zA-Z]/.exec(char);
  return hit !== null;
}
213
+
214
/**
 * Report whether the given character can appear in an ASCII identifier:
 * a letter, a digit, an underscore, or a hyphen.
 */
export function isAsciiIdentifierChar(char: string): boolean {
  const identifierPattern = /[a-zA-Z0-9_-]/;
  return identifierPattern.test(char);
}
220
+
221
+ // =============================================================================
222
+ // Unicode Range Classification
223
+ // =============================================================================
224
+
225
/**
 * Unicode range tuple: [start, end] (inclusive).
 */
export type UnicodeRange = readonly [number, number];

/**
 * Create a character classifier for Unicode ranges.
 * Returns a function that checks if a character's code point falls within any of the ranges.
 *
 * The classifier reads the first full code point of its argument, so a
 * supplementary-plane character passed as its complete surrogate pair is
 * compared by its real code point (e.g. U+20000) rather than by its leading
 * UTF-16 code unit. A lone surrogate code unit is still compared by its own
 * value. An empty string never matches.
 *
 * @param ranges - Inclusive [start, end] code point ranges.
 * @returns Predicate that is true when the first code point of `char` lies in any range.
 *
 * @example
 * // Japanese Hiragana
 * const isHiragana = createUnicodeRangeClassifier([[0x3040, 0x309f]]);
 *
 * // Korean (Hangul syllables + Jamo)
 * const isKorean = createUnicodeRangeClassifier([
 *   [0xac00, 0xd7a3], // Hangul syllables
 *   [0x1100, 0x11ff], // Hangul Jamo
 *   [0x3130, 0x318f], // Hangul Compatibility Jamo
 * ]);
 */
export function createUnicodeRangeClassifier(
  ranges: readonly UnicodeRange[]
): (char: string) => boolean {
  return (char: string): boolean => {
    // codePointAt decodes surrogate pairs; charCodeAt would only see the high surrogate.
    const code = char.codePointAt(0);
    if (code === undefined) {
      // Empty input has no code point to classify.
      return false;
    }
    return ranges.some(([start, end]) => code >= start && code <= end);
  };
}
253
+
254
/**
 * Merge several character classifiers into a single one.
 * The merged classifier answers true as soon as any member does, and false
 * when none do (including when no classifiers were supplied).
 *
 * @example
 * const isJapanese = combineClassifiers(isHiragana, isKatakana, isKanji);
 */
export function combineClassifiers(
  ...classifiers: Array<(char: string) => boolean>
): (char: string) => boolean {
  return (char: string): boolean => {
    for (const classify of classifiers) {
      if (classify(char)) {
        return true;
      }
    }
    return false;
  };
}
266
+
267
/**
 * Character classifiers for a Latin-based language.
 */
export interface LatinCharClassifiers {
  /** Check if character is a letter in this language (including accented chars). */
  isLetter: (char: string) => boolean;
  /** Check if character is part of an identifier (letter, digit, underscore, hyphen). */
  isIdentifierChar: (char: string) => boolean;
}

/**
 * Create character classifiers for a Latin-based language.
 * Returns isLetter and isIdentifierChar functions based on the provided regex.
 *
 * The pattern is copied once and the copy's `lastIndex` is reset before every
 * test, so a pattern carrying the `g` or `y` flag gives the same answer on
 * every call and the caller's RegExp object is never touched.
 *
 * @param letterPattern - Regex matching a single letter of the language.
 * @returns `isLetter` (matches `letterPattern`) and `isIdentifierChar`
 *   (a letter, or one of the characters `0-9`, `_`, `-`).
 *
 * @example
 * // Spanish letters
 * const { isLetter, isIdentifierChar } = createLatinCharClassifiers(/[a-zA-ZáéíóúüñÁÉÍÓÚÜÑ]/);
 *
 * // German letters
 * const { isLetter, isIdentifierChar } = createLatinCharClassifiers(/[a-zA-ZäöüÄÖÜß]/);
 */
export function createLatinCharClassifiers(letterPattern: RegExp): LatinCharClassifiers {
  // Private copy: RegExp#test advances lastIndex on global/sticky patterns,
  // which would otherwise make successive calls alternate between true and false.
  const letters = new RegExp(letterPattern.source, letterPattern.flags);
  // Hoisted so the literal is not re-created on every call.
  const nonLetterIdentifierChars = /[0-9_-]/;

  const isLetter = (char: string): boolean => {
    letters.lastIndex = 0;
    return letters.test(char);
  };
  const isIdentifierChar = (char: string): boolean =>
    isLetter(char) || nonLetterIdentifierChars.test(char);

  return { isLetter, isIdentifierChar };
}
293
+
294
+ // =============================================================================
295
+ // CSS Selector Tokenization
296
+ // =============================================================================
297
+
298
/**
 * Return the index just past the run of characters accepted by
 * `isAsciiIdentifierChar` that starts at `from` (returns `from` if the run is empty).
 */
function skipSelectorIdentifier(input: string, from: number): number {
  let pos = from;
  while (pos < input.length && isAsciiIdentifierChar(input[pos])) {
    pos++;
  }
  return pos;
}

/**
 * Scan a bracketed attribute expression whose `[` sits at `openPos`.
 *
 * Brackets inside quotes (", ' or `) are ignored, a backslash escapes the next
 * character, and nested unquoted brackets are balanced.
 *
 * @returns Index just past the matching `]`, or -1 when the bracket is never closed
 */
function skipBracketedAttribute(input: string, openPos: number): number {
  let depth = 1;
  let activeQuote: string | null = null;
  let escaped = false;
  let pos = openPos + 1;

  while (pos < input.length && depth > 0) {
    const c = input[pos++];

    if (escaped) {
      // This character was escaped by the preceding backslash; take it literally.
      escaped = false;
    } else if (c === '\\') {
      escaped = true;
    } else if (activeQuote !== null) {
      if (c === activeQuote) activeQuote = null;
    } else if (c === '"' || c === "'" || c === '`') {
      activeQuote = c;
    } else if (c === '[') {
      depth++;
    } else if (c === ']') {
      depth--;
    }
  }

  return depth === 0 ? pos : -1;
}

/**
 * Extract a CSS selector from the input string starting at `startPos`.
 * CSS selectors are universal across languages.
 *
 * Supported formats:
 * - `#id`, `.class`
 * - `[attribute]`, `[attribute=value]`, `[attribute="value"]`
 * - `@attribute` (attribute shorthand)
 * - `*property` (CSS property shorthand)
 * - `<tag>` HTML literal with any mix of `.class`, `#id` and `[attr]` modifiers,
 *   optionally self-closed: `<div.card/>`, `<div.class#id[attr="value"]/>`
 *
 * For the `#`, `.`, `@` and `*` forms, only the prefix plus the run of characters
 * accepted by `isAsciiIdentifierChar` is returned; nothing after that run is
 * consumed. So for `#dialog.showModal()` the result is `#dialog`, and a trailing
 * segment such as the `.active` in `#box.active` is likewise left for the caller
 * to tokenize.
 *
 * @param input - Full input string
 * @param startPos - Index of the character that should start the selector
 * @returns The selector text exactly as it appears in `input`, or null when no
 *          well-formed selector starts at `startPos`
 */
export function extractCssSelector(input: string, startPos: number): string | null {
  if (startPos >= input.length) return null;

  const lead = input[startPos];
  if (!isSelectorStart(lead)) return null;

  let end: number;

  switch (lead) {
    case '#':
    case '.':
    case '@':
    case '*': {
      // Prefix followed by at least one identifier character.
      const nameStart = startPos + 1;
      end = skipSelectorIdentifier(input, nameStart);
      if (end === nameStart) return null;
      break;
    }

    case '[': {
      end = skipBracketedAttribute(input, startPos);
      if (end < 0) return null; // unclosed bracket
      break;
    }

    case '<': {
      end = startPos + 1;

      // The tag name must begin with an ASCII letter.
      if (end >= input.length || !isAsciiLetter(input[end])) return null;
      end = skipSelectorIdentifier(input, end);

      // Any number of .class / #id / [attr] modifiers, in any order.
      modifiers: while (end < input.length) {
        switch (input[end]) {
          case '.':
          case '#': {
            const nameStart = end + 1;
            const nameEnd = skipSelectorIdentifier(input, nameStart);
            if (nameEnd === nameStart) return null; // a name is required after . or #
            end = nameEnd;
            break;
          }
          case '[': {
            end = skipBracketedAttribute(input, end);
            if (end < 0) return null; // unclosed bracket
            break;
          }
          default:
            break modifiers;
        }
      }

      // Whitespace is allowed (and kept in the result) around the optional "/".
      while (end < input.length && isWhitespace(input[end])) end++;
      if (end < input.length && input[end] === '/') {
        end++;
        while (end < input.length && isWhitespace(input[end])) end++;
      }

      // The literal must be closed by ">".
      if (end >= input.length || input[end] !== '>') return null;
      end++;
      break;
    }

    default:
      // isSelectorStart accepted a character this function has no rule for.
      return null;
  }

  return input.slice(startPos, end);
}
507
+
508
+ // =============================================================================
509
+ // String Literal Tokenization
510
+ // =============================================================================
511
+
512
/**
 * Check whether the single quote at `pos` is a possessive marker ('s) rather
 * than the start of a string literal.
 *
 * A quote is treated as possessive when all of the following hold:
 * - it directly follows an owner expression, i.e. it is not at the start of the
 *   input and the preceding character is not whitespace, an opening bracket
 *   (`(`, `[`, `{`) or a comma;
 * - the next character is `s` or `S`;
 * - what follows the `s` is the end of input, whitespace, `*`, or any character
 *   rejected by `isAsciiIdentifierChar`.
 *
 * The first condition keeps quoted strings that happen to begin with "s"
 * (for example `put 's' into x`) from being mistaken for a possessive.
 *
 * Examples:
 * - `#element's *opacity` → possessive (returns true)
 * - `'hello'`             → string (returns false)
 * - `it's value`          → possessive (returns true)
 * - `'s'` at input start  → string (returns false)
 *
 * NOTE(review): `pos` is assumed to index into the complete input, not into a
 * slice that begins at the quote; confirm against callers.
 *
 * @param input - Full input string
 * @param pos - Index of the candidate quote character
 * @returns true when the quote looks like a possessive marker
 */
export function isPossessiveMarker(input: string, pos: number): boolean {
  if (pos >= input.length || input[pos] !== "'") return false;

  // A possessive needs an owner immediately before the quote.
  if (pos === 0) return false;
  const before = input[pos - 1];
  if (
    isWhitespace(before) ||
    before === '(' ||
    before === '[' ||
    before === '{' ||
    before === ','
  ) {
    return false;
  }

  // The quote must be followed by 's' or 'S'.
  if (pos + 1 >= input.length) return false;
  if (input[pos + 1].toLowerCase() !== 's') return false;

  // After the 's' the word must end: end of input, whitespace, '*', or any
  // other non-identifier character.
  if (pos + 2 >= input.length) return true;
  const afterS = input[pos + 2];
  return isWhitespace(afterS) || afterS === '*' || !isAsciiIdentifierChar(afterS);
}
534
+
535
/**
 * Extract a string literal that begins at `startPos`.
 *
 * The opening character must be accepted by `isQuote` and be one of the
 * delimiters this function knows how to close: `"`, `'`, a backtick, or the
 * corner bracket `「` (closed by `」`). A single quote that
 * `isPossessiveMarker` identifies as a possessive ('s) is not a string start.
 *
 * A backslash escapes the following character, so an escaped delimiter does not
 * end the literal.
 *
 * @param input - Full input string
 * @param startPos - Index of the opening quote
 * @returns The literal including both delimiters; if the closing delimiter is
 *          missing, everything from `startPos` to the end of the input; null
 *          when no string literal starts here
 */
export function extractStringLiteral(input: string, startPos: number): string | null {
  if (startPos >= input.length) return null;

  const opener = input[startPos];
  if (!isQuote(opener)) return null;

  // `'s` after a word is a possessive, not the start of a string.
  if (opener === "'" && isPossessiveMarker(input, startPos)) return null;

  // Determine which character closes this kind of quote.
  let closer: string;
  switch (opener) {
    case '"':
    case "'":
    case '`':
      closer = opener;
      break;
    case '「':
      closer = '」';
      break;
    default:
      return null;
  }

  let escaped = false;
  for (let i = startPos + 1; i < input.length; i++) {
    const ch = input[i];
    if (escaped) {
      escaped = false;
      continue;
    }
    if (ch === '\\') {
      escaped = true;
      continue;
    }
    if (ch === closer) {
      // Closing delimiter found: return the literal with both delimiters.
      return input.slice(startPos, i + 1);
    }
  }

  // Unterminated literal: hand back everything that was scanned.
  return input.slice(startPos);
}
585
+
586
+ // =============================================================================
587
+ // URL Tokenization
588
+ // =============================================================================
589
+
590
/**
 * Decide whether a URL begins at `pos`.
 *
 * Recognised openings:
 * - absolute path: `/` followed by a letter, digit, `.`, `_` or `-`
 * - protocol-relative: `//` followed by a letter
 * - relative path: `./` or `../`
 * - full URL: `http://` or `https://` (case-insensitive)
 *
 * @param input - Full input string
 * @param pos - Index to inspect
 * @returns true when one of the openings above starts at `pos`
 */
export function isUrlStart(input: string, pos: number): boolean {
  if (pos >= input.length) return false;

  const first = input[pos];
  const second = input[pos + 1] || '';
  const third = input[pos + 2] || '';

  if (first === '/') {
    if (second === '/') {
      // Protocol-relative: //domain.com
      if (/[a-zA-Z]/.test(third)) return true;
    } else if (/[a-zA-Z0-9._-]/.test(second)) {
      // Absolute path: /something (a lone "/" does not count)
      return true;
    }
  } else if (first === '.') {
    // Relative path: ./ or ../
    if (second === '/' || (second === '.' && third === '/')) return true;
  }

  // Full URL: 8 characters are enough to hold "https://".
  const scheme = input.slice(pos, pos + 8).toLowerCase();
  return scheme.startsWith('http://') || scheme.startsWith('https://');
}
625
+
626
/**
 * Extract the URL that begins at `startPos`.
 *
 * Scanning continues while characters belong to the URL character set below
 * (letters, digits and `/ : . _ - ? & = % @ + ~ ! $ ' ( ) * , ; [ ]`).
 *
 * `#` is handled separately and always ends the scan:
 * - when the text gathered so far ends in a letter, digit, `/` or `.`, the `#`
 *   and the fragment identifier after it (letters, digits, `_`, `-`) are
 *   included, e.g. `/page#section`;
 * - otherwise the `#` is left out so it can be tokenized on its own
 *   (for example as a CSS id selector).
 *
 * @param input - Full input string
 * @param startPos - Index where the URL should start
 * @returns The URL text, or null when `isUrlStart` rejects the position or
 *          fewer than two characters were gathered
 */
export function extractUrl(input: string, startPos: number): string | null {
  if (!isUrlStart(input, startPos)) return null;

  // Core URL characters (RFC 3986 unreserved + sub-delims + path/query chars)
  const urlChars = /[a-zA-Z0-9/:._\-?&=%@+~!$'()*,;[\]]/;

  let end = startPos;
  while (end < input.length) {
    const ch = input[end];

    if (ch === '#') {
      // Only a fragment if something path-like immediately precedes it.
      if (end > startPos && /[a-zA-Z0-9/.]$/.test(input[end - 1])) {
        end++;
        while (end < input.length && /[a-zA-Z0-9_-]/.test(input[end])) {
          end++;
        }
      }
      // Either the fragment was consumed or the # belongs to the next token.
      break;
    }

    if (!urlChars.test(ch)) break;
    end++;
  }

  // Minimum length validation
  if (end - startPos < 2) return null;

  return input.slice(startPos, end);
}
677
+
678
+ // =============================================================================
679
+ // Number Tokenization
680
+ // =============================================================================
681
+
682
/**
 * Extract a number that begins at `startPos`.
 *
 * Grammar: optional `+`/`-` sign, one or more digits, optional `.` followed by
 * zero or more digits, optional duration suffix.
 *
 * Duration suffixes:
 * - `ms` is taken whenever it follows the number;
 * - `s`, `m` and `h` are taken only when the character after them is not an
 *   ASCII identifier character. This is the same word-boundary rule the
 *   tokenizer's standard time units use, so `5s` yields `5s` while `5min`
 *   yields `5` and leaves `min` for the next token.
 *
 * @param input - Full input string
 * @param startPos - Index where the number should start
 * @returns The number text (including sign and suffix) exactly as it appears in
 *          `input`, or null when no digit follows the optional sign
 */
export function extractNumber(input: string, startPos: number): string | null {
  if (startPos >= input.length) return null;

  let pos = startPos;

  // Optional sign
  if (input[pos] === '-' || input[pos] === '+') {
    pos++;
  }

  // Must have at least one digit
  if (pos >= input.length || !isDigit(input[pos])) {
    return null;
  }

  // Integer part
  while (pos < input.length && isDigit(input[pos])) {
    pos++;
  }

  // Optional decimal part (the digits after the point may be absent)
  if (pos < input.length && input[pos] === '.') {
    pos++;
    while (pos < input.length && isDigit(input[pos])) {
      pos++;
    }
  }

  // Optional duration suffix
  if (input.startsWith('ms', pos)) {
    pos += 2;
  } else if (pos < input.length) {
    const unit = input[pos];
    if (unit === 's' || unit === 'm' || unit === 'h') {
      // A single-letter unit only counts when it is not the start of a longer word.
      const continuesWord = pos + 1 < input.length && isAsciiIdentifierChar(input[pos + 1]);
      if (!continuesWord) {
        pos++;
      }
    }
  }

  return input.slice(startPos, pos);
}
730
+
731
+ // =============================================================================
732
+ // Base Tokenizer Class
733
+ // =============================================================================
734
+
735
/**
 * One tokenizer keyword: a word as written in the native language paired with
 * the normalized English form it maps to.
 */
export interface KeywordEntry {
  /** The word as it appears in native-language input. */
  readonly native: string;
  /** The normalized English form the native word maps to. */
  readonly normalized: string;
}
742
+
743
/**
 * The subset of a language profile that keyword derivation reads.
 * Mirrors the structure of LanguageProfile but only carries the fields needed
 * for tokenization; every field is optional.
 */
export interface TokenizerProfile {
  /**
   * Command translations keyed by the English command name. Each entry has a
   * primary native word, optional alternative spellings, and an optional
   * normalized form that replaces the key when present.
   */
  readonly keywords?: Record<
    string,
    { primary: string; alternatives?: string[]; normalized?: string }
  >;

  /** Reference words (me, it, you, ...) keyed by English name, valued by native word. */
  readonly references?: Record<string, string>;

  /**
   * Role markers (into, from, with, ...) keyed by role name. Each entry has a
   * primary native marker, optional alternatives, and an optional position.
   */
  readonly roleMarkers?: Record<
    string,
    { primary: string; alternatives?: string[]; position?: string }
  >;
}
758
+
759
/**
 * Abstract base class for language-specific tokenizers.
 *
 * Subclasses supply `language`, `direction`, `tokenize` and `classifyToken`.
 * This class contributes the language-independent pieces: profile-derived
 * keyword tables, optional morphological normalization, and `try*` helpers
 * that each attempt to read one kind of token (selector, string, number, URL,
 * variable reference, operator, event modifier, particle) at a position and
 * return null when that kind of token does not start there.
 */
export abstract class BaseTokenizer implements LanguageTokenizer {
  abstract readonly language: string;
  abstract readonly direction: 'ltr' | 'rtl';

  /** Optional morphological normalizer for this language */
  protected normalizer?: MorphologicalNormalizer;

  /** Keywords derived from profile, sorted longest-first for greedy matching */
  protected profileKeywords: KeywordEntry[] = [];

  /** Map for O(1) keyword lookups by lowercase native word */
  protected profileKeywordMap: Map<string, KeywordEntry> = new Map();

  abstract tokenize(input: string): TokenStream;
  abstract classifyToken(token: string): TokenKind;

  /**
   * Initialize keyword mappings from a language profile.
   *
   * Native→English entries are collected, in this order, from:
   * - `profile.keywords` (primary + alternatives; `translation.normalized`
   *   replaces the key when present)
   * - `profile.references` (me, it, you, etc.)
   * - `profile.roleMarkers` (into, from, with, etc.)
   * - `extras`
   *
   * A later entry with the same native word replaces an earlier one, so extras
   * override profile entries. Entries whose native word is empty are skipped:
   * an empty keyword would match at every position and produce zero-length
   * tokens.
   *
   * The result is stored twice: `profileKeywords` (sorted longest-first, for
   * greedy prefix matching in languages written without spaces) and
   * `profileKeywordMap` (keyed by lowercase native word, plus a
   * diacritic-stripped alias when that differs and the key is still free).
   *
   * @param profile - Language profile containing keyword translations
   * @param extras - Additional keyword entries to include (literals, positional, events)
   */
  protected initializeKeywordsFromProfile(
    profile: TokenizerProfile,
    extras: KeywordEntry[] = []
  ): void {
    // Keyed by exact native word so later sources replace earlier ones.
    const collected = new Map<string, KeywordEntry>();
    const register = (native: string, normalized: string): void => {
      if (native) {
        collected.set(native, { native, normalized });
      }
    };

    // Command translations
    if (profile.keywords) {
      for (const [command, translation] of Object.entries(profile.keywords)) {
        const normalized = translation.normalized || command;
        register(translation.primary, normalized);
        if (translation.alternatives) {
          for (const alt of translation.alternatives) {
            register(alt, normalized);
          }
        }
      }
    }

    // References (me, it, you, etc.)
    if (profile.references) {
      for (const [normalized, native] of Object.entries(profile.references)) {
        register(native, normalized);
      }
    }

    // Role markers (into, from, with, etc.)
    if (profile.roleMarkers) {
      for (const [role, marker] of Object.entries(profile.roleMarkers)) {
        register(marker.primary, role);
        if (marker.alternatives) {
          for (const alt of marker.alternatives) {
            register(alt, role);
          }
        }
      }
    }

    // Extras (literals, positional, events) override profile entries. The
    // caller's object is stored as-is.
    for (const extra of extras) {
      if (extra.native) {
        collected.set(extra.native, extra);
      }
    }

    // Longest-first so greedy matching prefers the most specific keyword.
    this.profileKeywords = Array.from(collected.values()).sort(
      (a, b) => b.native.length - a.native.length
    );

    // Lookup table: lowercase native form, plus a diacritic-stripped alias so
    // that e.g. 'بدّل' (with shadda) is also reachable as 'بدل'.
    this.profileKeywordMap = new Map();
    for (const keyword of this.profileKeywords) {
      this.profileKeywordMap.set(keyword.native.toLowerCase(), keyword);

      const stripped = this.removeDiacritics(keyword.native);
      if (stripped !== keyword.native && !this.profileKeywordMap.has(stripped.toLowerCase())) {
        this.profileKeywordMap.set(stripped.toLowerCase(), keyword);
      }
    }
  }

  /**
   * Remove diacritical marks from a word for normalization.
   *
   * This implementation strips only Arabic marks: U+064B–U+0652 (fatha, kasra,
   * damma, sukun, shadda, etc.) and U+0670 (superscript alif). Diacritics of
   * other scripts are left untouched.
   *
   * @param word - Word to normalize
   * @returns Word without the marks listed above
   */
  protected removeDiacritics(word: string): string {
    return word.replace(/[\u064B-\u0652\u0670]/g, '');
  }

  /**
   * Try to match a keyword from profile at the current position.
   * Uses longest-first greedy matching (important for non-space languages).
   * The comparison is an exact, case-sensitive prefix match.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Keyword token carrying the normalized form if matched, null otherwise
   */
  protected tryProfileKeyword(input: string, pos: number): LanguageToken | null {
    for (const entry of this.profileKeywords) {
      // Compare in place; no substring is allocated per keyword.
      if (input.startsWith(entry.native, pos)) {
        return createToken(
          entry.native,
          'keyword',
          createPosition(pos, pos + entry.native.length),
          entry.normalized
        );
      }
    }
    return null;
  }

  /**
   * Check if the input at `pos` starts with any known keyword.
   * Useful for non-space languages to detect word boundaries.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns true if a keyword starts at this position
   */
  protected isKeywordStart(input: string, pos: number): boolean {
    return this.profileKeywords.some(entry => input.startsWith(entry.native, pos));
  }

  /**
   * Look up a keyword by native word (case-insensitive).
   *
   * The lowercase word is tried first. If that misses, the word is retried with
   * diacritics removed, so vocalized input can still reach a keyword that is
   * stored without marks.
   *
   * @param native - Native word to look up
   * @returns KeywordEntry if found, undefined otherwise
   */
  protected lookupKeyword(native: string): KeywordEntry | undefined {
    const key = native.toLowerCase();
    const exact = this.profileKeywordMap.get(key);
    if (exact !== undefined) return exact;

    const stripped = this.removeDiacritics(key);
    return stripped !== key ? this.profileKeywordMap.get(stripped) : undefined;
  }

  /**
   * Check if a word is a known keyword (case-insensitive, with the same
   * diacritic-stripped fallback as `lookupKeyword`).
   *
   * @param native - Native word to check
   * @returns true if the word is a keyword
   */
  protected isKeyword(native: string): boolean {
    const key = native.toLowerCase();
    if (this.profileKeywordMap.has(key)) return true;

    const stripped = this.removeDiacritics(key);
    return stripped !== key && this.profileKeywordMap.has(stripped);
  }

  /**
   * Set the morphological normalizer for this tokenizer.
   */
  setNormalizer(normalizer: MorphologicalNormalizer): void {
    this.normalizer = normalizer;
  }

  /**
   * Try to normalize a word using the morphological normalizer.
   *
   * `normalize()` is called directly, without consulting `isNormalizable()`
   * first, matching how the individual tokenizers have historically used it.
   *
   * @param word - Word to normalize
   * @returns The normalization result when a normalizer is set, the stem
   *          differs from `word`, and confidence is at least 0.7; otherwise null
   */
  protected tryNormalize(word: string): NormalizationResult | null {
    if (!this.normalizer) return null;

    const result = this.normalizer.normalize(word);
    const changed = result.stem !== word;
    return changed && result.confidence >= 0.7 ? result : null;
  }

  /**
   * Try morphological normalization and keyword lookup.
   *
   * This is the common pattern for handling conjugated verbs across languages:
   * 1. Normalize the word (e.g., "toggled" → "toggle")
   * 2. Look up the stem in the keyword map
   * 3. Create a keyword token that keeps the original word as its text and
   *    carries the keyword's normalized form, the stem and the stem confidence
   *
   * @param word - The word to normalize and look up
   * @param startPos - Start position for the token
   * @param endPos - End position for the token
   * @returns Token if stem matches a keyword, null otherwise
   */
  protected tryMorphKeywordMatch(
    word: string,
    startPos: number,
    endPos: number
  ): LanguageToken | null {
    const result = this.tryNormalize(word);
    if (!result) return null;

    const stemEntry = this.lookupKeyword(result.stem);
    if (!stemEntry) return null;

    const tokenOptions: CreateTokenOptions = {
      normalized: stemEntry.normalized,
      stem: result.stem,
      stemConfidence: result.confidence,
    };
    return createToken(word, 'keyword', createPosition(startPos, endPos), tokenOptions);
  }

  /**
   * Try to extract a CSS selector at the current position.
   *
   * @returns A 'selector' token spanning the text returned by
   *          `extractCssSelector`, or null
   */
  protected trySelector(input: string, pos: number): LanguageToken | null {
    const selector = extractCssSelector(input, pos);
    if (!selector) return null;
    return createToken(selector, 'selector', createPosition(pos, pos + selector.length));
  }

  /**
   * Try to extract an event modifier at the current position.
   * Event modifiers are .once, .debounce(N), .throttle(N), .queue(strategy)
   *
   * The modifier must be followed by whitespace, the end of input, or another
   * dot; that trailing character is not part of the token.
   *
   * @returns An 'event-modifier' token with `metadata.modifierName` and
   *          `metadata.value`. The value is the raw text for `queue`, the result
   *          of `parseInt(text, 10)` for the other modifiers, and undefined when
   *          there are no parentheses. Returns null if no modifier starts here.
   */
  protected tryEventModifier(input: string, pos: number): LanguageToken | null {
    // Must start with a dot
    if (input[pos] !== '.') {
      return null;
    }

    // Match pattern: .(once|debounce|throttle|queue) followed by optional (value)
    const match = input
      .slice(pos)
      .match(/^\.(?:once|debounce|throttle|queue)(?:\(([^)]+)\))?(?:\s|$|\.)/);
    if (!match) {
      return null;
    }

    const fullMatch = match[0].replace(/(\s|\.)$/, ''); // Remove trailing space or dot
    const modifierName = fullMatch.slice(1).split('(')[0]; // Extract modifier name
    const value = match[1]; // Extract value from parentheses if present

    const token = createToken(
      fullMatch,
      'event-modifier',
      createPosition(pos, pos + fullMatch.length)
    );

    return {
      ...token,
      metadata: {
        modifierName,
        value: value ? (modifierName === 'queue' ? value : parseInt(value, 10)) : undefined,
      },
    };
  }

  /**
   * Try to extract a string literal at the current position.
   *
   * @returns A 'literal' token spanning the text returned by
   *          `extractStringLiteral`, or null
   */
  protected tryString(input: string, pos: number): LanguageToken | null {
    const literal = extractStringLiteral(input, pos);
    if (!literal) return null;
    return createToken(literal, 'literal', createPosition(pos, pos + literal.length));
  }

  /**
   * Try to extract a number at the current position.
   *
   * @returns A 'literal' token spanning the text returned by `extractNumber`,
   *          or null
   */
  protected tryNumber(input: string, pos: number): LanguageToken | null {
    const number = extractNumber(input, pos);
    if (!number) return null;
    return createToken(number, 'literal', createPosition(pos, pos + number.length));
  }

  /**
   * Standard time-unit suffixes (ms, s, m, h).
   * `ms` is listed first; the single-letter units require a word boundary, and
   * `m` is additionally rejected when followed by `s`.
   */
  protected static readonly STANDARD_TIME_UNITS: readonly TimeUnitMapping[] = [
    { pattern: 'ms', suffix: 'ms', length: 2 },
    { pattern: 's', suffix: 's', length: 1, checkBoundary: true },
    { pattern: 'm', suffix: 'm', length: 1, checkBoundary: true, notFollowedBy: 's' },
    { pattern: 'h', suffix: 'h', length: 1, checkBoundary: true },
  ];

  /**
   * Try to match a time unit from a list of patterns. Units are tried in array
   * order and the first one that passes all of its checks wins.
   *
   * @param input - Input string
   * @param pos - Position after the number
   * @param timeUnits - Array of time unit mappings (native pattern → standard suffix)
   * @param skipWhitespace - Whether to skip whitespace before time unit (default: false)
   * @returns Object with matched suffix and the position just past the unit, or
   *          null if no match
   */
  protected tryMatchTimeUnit(
    input: string,
    pos: number,
    timeUnits: readonly TimeUnitMapping[],
    skipWhitespace = false
  ): { suffix: string; endPos: number } | null {
    let unitPos = pos;

    // Optionally skip whitespace before time unit
    if (skipWhitespace) {
      while (unitPos < input.length && isWhitespace(input[unitPos])) {
        unitPos++;
      }
    }

    const remaining = input.slice(unitPos);

    for (const unit of timeUnits) {
      const candidate = remaining.slice(0, unit.length);
      const matches = unit.caseInsensitive
        ? candidate.toLowerCase() === unit.pattern.toLowerCase()
        : candidate === unit.pattern;
      if (!matches) continue;

      // Character right after the unit ('' at end of input).
      const following = remaining[unit.length] || '';

      // notFollowedBy constraint (e.g., 'm' should not match the start of 'ms')
      if (unit.notFollowedBy && following === unit.notFollowedBy) continue;

      // Word boundary: the unit must not run into a longer identifier
      if (unit.checkBoundary && isAsciiIdentifierChar(following)) continue;

      return { suffix: unit.suffix, endPos: unitPos + unit.length };
    }

    return null;
  }

  /**
   * Parse a base number (sign, integer, decimal) without time units.
   *
   * Grammar: optional `+`/`-` (only when `allowSign`), one or more digits,
   * optional `.` followed by zero or more digits.
   *
   * @param input - Input string
   * @param startPos - Start position
   * @param allowSign - Whether to allow +/- sign (default: true)
   * @returns Object with number string and end position, or null when no digit
   *          is found after the optional sign
   */
  protected parseBaseNumber(
    input: string,
    startPos: number,
    allowSign = true
  ): { number: string; endPos: number } | null {
    let pos = startPos;

    // Optional sign
    if (allowSign && (input[pos] === '-' || input[pos] === '+')) {
      pos++;
    }

    // Must have at least one digit
    if (pos >= input.length || !isDigit(input[pos])) {
      return null;
    }

    // Integer part
    while (pos < input.length && isDigit(input[pos])) {
      pos++;
    }

    // Optional decimal
    if (pos < input.length && input[pos] === '.') {
      pos++;
      while (pos < input.length && isDigit(input[pos])) {
        pos++;
      }
    }

    return { number: input.slice(startPos, pos), endPos: pos };
  }

  /**
   * Try to extract a number with native language time units.
   *
   * Template method for the common pattern:
   * 1. Parse the base number (sign, integer, decimal)
   * 2. Try the native time units, then the standard ones (ms, s, m, h)
   * 3. When a unit matches, append its standard suffix to the token text and
   *    extend the token's end position past the unit
   *
   * @param input - Input string
   * @param pos - Start position
   * @param nativeTimeUnits - Language-specific time unit mappings
   * @param options - `allowSign` (default true) and `skipWhitespace` between
   *                  number and unit (default false)
   * @returns A 'literal' token if a number is found, null otherwise
   */
  protected tryNumberWithTimeUnits(
    input: string,
    pos: number,
    nativeTimeUnits: readonly TimeUnitMapping[],
    options: { allowSign?: boolean; skipWhitespace?: boolean } = {}
  ): LanguageToken | null {
    const { allowSign = true, skipWhitespace = false } = options;

    const baseResult = this.parseBaseNumber(input, pos, allowSign);
    if (!baseResult) return null;

    let { number, endPos } = baseResult;

    // Native units take priority over the standard ones.
    const allUnits = [...nativeTimeUnits, ...BaseTokenizer.STANDARD_TIME_UNITS];
    const timeMatch = this.tryMatchTimeUnit(input, endPos, allUnits, skipWhitespace);

    if (timeMatch) {
      number += timeMatch.suffix;
      endPos = timeMatch.endPos;
    }

    return createToken(number, 'literal', createPosition(pos, endPos));
  }

  /**
   * Try to extract a URL at the current position.
   * Handles /path, ./path, ../path, //domain.com, http://, https://
   *
   * @returns A 'url' token spanning the text returned by `extractUrl`, or null
   */
  protected tryUrl(input: string, pos: number): LanguageToken | null {
    const url = extractUrl(input, pos);
    if (!url) return null;
    return createToken(url, 'url', createPosition(pos, pos + url.length));
  }

  /**
   * Try to extract a variable reference (:varname) at the current position.
   * In hyperscript, :x refers to a local variable named x.
   *
   * @returns An 'identifier' token whose text includes the leading colon, or
   *          null when the colon is not followed by an ASCII identifier character
   */
  protected tryVariableRef(input: string, pos: number): LanguageToken | null {
    if (input[pos] !== ':') return null;
    if (pos + 1 >= input.length) return null;
    if (!isAsciiIdentifierChar(input[pos + 1])) return null;

    let endPos = pos + 1;
    while (endPos < input.length && isAsciiIdentifierChar(input[endPos])) {
      endPos++;
    }

    return createToken(input.slice(pos, endPos), 'identifier', createPosition(pos, endPos));
  }

  /**
   * Try to extract an operator or punctuation token at the current position.
   * Two-character operators (==, !=, <=, >=, &&, ||, ->) are checked before
   * single-character operators, and operators before punctuation.
   *
   * @returns An 'operator' or 'punctuation' token, or null
   */
  protected tryOperator(input: string, pos: number): LanguageToken | null {
    // Two-character operators
    const twoChar = input.slice(pos, pos + 2);
    if (['==', '!=', '<=', '>=', '&&', '||', '->'].includes(twoChar)) {
      return createToken(twoChar, 'operator', createPosition(pos, pos + 2));
    }

    // Single-character operators
    const oneChar = input[pos];
    if (['<', '>', '!', '+', '-', '*', '/', '='].includes(oneChar)) {
      return createToken(oneChar, 'operator', createPosition(pos, pos + 1));
    }

    // Punctuation
    if (['(', ')', '{', '}', ',', ';', ':'].includes(oneChar)) {
      return createToken(oneChar, 'punctuation', createPosition(pos, pos + 1));
    }

    return null;
  }

  /**
   * Try to match a multi-character particle from a list.
   *
   * Used by languages like Japanese, Korean, and Chinese that have
   * multi-character particles (e.g., Japanese から, まで, より).
   * Particles are tried in array order; the first match wins.
   *
   * @param input - Input string
   * @param pos - Current position
   * @param particles - Array of multi-character particles to match
   * @returns A 'particle' token if matched, null otherwise
   */
  protected tryMultiCharParticle(
    input: string,
    pos: number,
    particles: readonly string[]
  ): LanguageToken | null {
    for (const particle of particles) {
      if (input.startsWith(particle, pos)) {
        return createToken(particle, 'particle', createPosition(pos, pos + particle.length));
      }
    }
    return null;
  }
}