@lokascript/semantic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +686 -0
  3. package/dist/browser-ar.ar.global.js +2 -0
  4. package/dist/browser-core.core.global.js +2 -0
  5. package/dist/browser-de.de.global.js +2 -0
  6. package/dist/browser-east-asian.east-asian.global.js +2 -0
  7. package/dist/browser-en-tr.en-tr.global.js +2 -0
  8. package/dist/browser-en.en.global.js +2 -0
  9. package/dist/browser-es-en.es-en.global.js +2 -0
  10. package/dist/browser-es.es.global.js +2 -0
  11. package/dist/browser-fr.fr.global.js +2 -0
  12. package/dist/browser-id.id.global.js +2 -0
  13. package/dist/browser-ja.ja.global.js +2 -0
  14. package/dist/browser-ko.ko.global.js +2 -0
  15. package/dist/browser-lazy.lazy.global.js +2 -0
  16. package/dist/browser-priority.priority.global.js +2 -0
  17. package/dist/browser-pt.pt.global.js +2 -0
  18. package/dist/browser-qu.qu.global.js +2 -0
  19. package/dist/browser-sw.sw.global.js +2 -0
  20. package/dist/browser-tr.tr.global.js +2 -0
  21. package/dist/browser-western.western.global.js +2 -0
  22. package/dist/browser-zh.zh.global.js +2 -0
  23. package/dist/browser.global.js +3 -0
  24. package/dist/browser.global.js.map +1 -0
  25. package/dist/index.cjs +35051 -0
  26. package/dist/index.cjs.map +1 -0
  27. package/dist/index.d.cts +3426 -0
  28. package/dist/index.d.ts +3426 -0
  29. package/dist/index.js +34890 -0
  30. package/dist/index.js.map +1 -0
  31. package/dist/languages/ar.d.ts +78 -0
  32. package/dist/languages/ar.js +1622 -0
  33. package/dist/languages/ar.js.map +1 -0
  34. package/dist/languages/de.d.ts +38 -0
  35. package/dist/languages/de.js +1168 -0
  36. package/dist/languages/de.js.map +1 -0
  37. package/dist/languages/en.d.ts +44 -0
  38. package/dist/languages/en.js +3491 -0
  39. package/dist/languages/en.js.map +1 -0
  40. package/dist/languages/es.d.ts +52 -0
  41. package/dist/languages/es.js +1493 -0
  42. package/dist/languages/es.js.map +1 -0
  43. package/dist/languages/fr.d.ts +37 -0
  44. package/dist/languages/fr.js +1159 -0
  45. package/dist/languages/fr.js.map +1 -0
  46. package/dist/languages/id.d.ts +35 -0
  47. package/dist/languages/id.js +1152 -0
  48. package/dist/languages/id.js.map +1 -0
  49. package/dist/languages/ja.d.ts +53 -0
  50. package/dist/languages/ja.js +1430 -0
  51. package/dist/languages/ja.js.map +1 -0
  52. package/dist/languages/ko.d.ts +51 -0
  53. package/dist/languages/ko.js +1729 -0
  54. package/dist/languages/ko.js.map +1 -0
  55. package/dist/languages/pt.d.ts +37 -0
  56. package/dist/languages/pt.js +1127 -0
  57. package/dist/languages/pt.js.map +1 -0
  58. package/dist/languages/qu.d.ts +36 -0
  59. package/dist/languages/qu.js +1143 -0
  60. package/dist/languages/qu.js.map +1 -0
  61. package/dist/languages/sw.d.ts +35 -0
  62. package/dist/languages/sw.js +1147 -0
  63. package/dist/languages/sw.js.map +1 -0
  64. package/dist/languages/tr.d.ts +45 -0
  65. package/dist/languages/tr.js +1529 -0
  66. package/dist/languages/tr.js.map +1 -0
  67. package/dist/languages/zh.d.ts +58 -0
  68. package/dist/languages/zh.js +1257 -0
  69. package/dist/languages/zh.js.map +1 -0
  70. package/dist/types-C4dcj53L.d.ts +600 -0
  71. package/package.json +202 -0
  72. package/src/__test-utils__/index.ts +7 -0
  73. package/src/__test-utils__/test-helpers.ts +8 -0
  74. package/src/__types__/test-helpers.ts +122 -0
  75. package/src/analysis/index.ts +479 -0
  76. package/src/ast-builder/command-mappers.ts +1133 -0
  77. package/src/ast-builder/expression-parser/index.ts +41 -0
  78. package/src/ast-builder/expression-parser/parser.ts +563 -0
  79. package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
  80. package/src/ast-builder/expression-parser/types.ts +208 -0
  81. package/src/ast-builder/index.ts +536 -0
  82. package/src/ast-builder/value-converters.ts +172 -0
  83. package/src/bridge.ts +275 -0
  84. package/src/browser-ar.ts +162 -0
  85. package/src/browser-core.ts +231 -0
  86. package/src/browser-de.ts +162 -0
  87. package/src/browser-east-asian.ts +173 -0
  88. package/src/browser-en-tr.ts +165 -0
  89. package/src/browser-en.ts +157 -0
  90. package/src/browser-es-en.ts +200 -0
  91. package/src/browser-es.ts +170 -0
  92. package/src/browser-fr.ts +162 -0
  93. package/src/browser-id.ts +162 -0
  94. package/src/browser-ja.ts +162 -0
  95. package/src/browser-ko.ts +162 -0
  96. package/src/browser-lazy.ts +189 -0
  97. package/src/browser-priority.ts +214 -0
  98. package/src/browser-pt.ts +162 -0
  99. package/src/browser-qu.ts +162 -0
  100. package/src/browser-sw.ts +162 -0
  101. package/src/browser-tr.ts +162 -0
  102. package/src/browser-western.ts +181 -0
  103. package/src/browser-zh.ts +162 -0
  104. package/src/browser.ts +268 -0
  105. package/src/cache/index.ts +14 -0
  106. package/src/cache/semantic-cache.ts +344 -0
  107. package/src/core-bridge.ts +372 -0
  108. package/src/explicit/converter.ts +258 -0
  109. package/src/explicit/index.ts +18 -0
  110. package/src/explicit/parser.ts +236 -0
  111. package/src/explicit/renderer.ts +424 -0
  112. package/src/generators/command-schemas.ts +1636 -0
  113. package/src/generators/event-handler-generator.ts +109 -0
  114. package/src/generators/index.ts +117 -0
  115. package/src/generators/language-profiles.ts +139 -0
  116. package/src/generators/pattern-generator.ts +537 -0
  117. package/src/generators/profiles/arabic.ts +131 -0
  118. package/src/generators/profiles/bengali.ts +132 -0
  119. package/src/generators/profiles/chinese.ts +124 -0
  120. package/src/generators/profiles/english.ts +113 -0
  121. package/src/generators/profiles/french.ts +125 -0
  122. package/src/generators/profiles/german.ts +126 -0
  123. package/src/generators/profiles/hindi.ts +146 -0
  124. package/src/generators/profiles/index.ts +46 -0
  125. package/src/generators/profiles/indonesian.ts +125 -0
  126. package/src/generators/profiles/italian.ts +139 -0
  127. package/src/generators/profiles/japanese.ts +149 -0
  128. package/src/generators/profiles/korean.ts +127 -0
  129. package/src/generators/profiles/marker-templates.ts +288 -0
  130. package/src/generators/profiles/ms.ts +130 -0
  131. package/src/generators/profiles/polish.ts +249 -0
  132. package/src/generators/profiles/portuguese.ts +115 -0
  133. package/src/generators/profiles/quechua.ts +113 -0
  134. package/src/generators/profiles/russian.ts +260 -0
  135. package/src/generators/profiles/spanish.ts +130 -0
  136. package/src/generators/profiles/swahili.ts +129 -0
  137. package/src/generators/profiles/thai.ts +132 -0
  138. package/src/generators/profiles/tl.ts +128 -0
  139. package/src/generators/profiles/turkish.ts +124 -0
  140. package/src/generators/profiles/types.ts +165 -0
  141. package/src/generators/profiles/ukrainian.ts +270 -0
  142. package/src/generators/profiles/vietnamese.ts +133 -0
  143. package/src/generators/schema-error-codes.ts +160 -0
  144. package/src/generators/schema-validator.ts +391 -0
  145. package/src/index.ts +429 -0
  146. package/src/language-building-schema.ts +3170 -0
  147. package/src/language-loader.ts +394 -0
  148. package/src/languages/_all.ts +65 -0
  149. package/src/languages/ar.ts +15 -0
  150. package/src/languages/bn.ts +16 -0
  151. package/src/languages/de.ts +15 -0
  152. package/src/languages/en.ts +29 -0
  153. package/src/languages/es.ts +15 -0
  154. package/src/languages/fr.ts +15 -0
  155. package/src/languages/hi.ts +26 -0
  156. package/src/languages/id.ts +15 -0
  157. package/src/languages/index.ts +18 -0
  158. package/src/languages/it.ts +15 -0
  159. package/src/languages/ja.ts +15 -0
  160. package/src/languages/ko.ts +15 -0
  161. package/src/languages/ms.ts +16 -0
  162. package/src/languages/pl.ts +18 -0
  163. package/src/languages/pt.ts +15 -0
  164. package/src/languages/qu.ts +15 -0
  165. package/src/languages/ru.ts +26 -0
  166. package/src/languages/sw.ts +15 -0
  167. package/src/languages/th.ts +16 -0
  168. package/src/languages/tl.ts +16 -0
  169. package/src/languages/tr.ts +15 -0
  170. package/src/languages/uk.ts +26 -0
  171. package/src/languages/vi.ts +16 -0
  172. package/src/languages/zh.ts +15 -0
  173. package/src/parser/index.ts +15 -0
  174. package/src/parser/pattern-matcher.ts +1181 -0
  175. package/src/parser/semantic-parser.ts +573 -0
  176. package/src/parser/utils/index.ts +35 -0
  177. package/src/parser/utils/marker-resolution.ts +111 -0
  178. package/src/parser/utils/possessive-keywords.ts +43 -0
  179. package/src/parser/utils/role-positioning.ts +70 -0
  180. package/src/parser/utils/type-validation.ts +134 -0
  181. package/src/patterns/add/ar.ts +71 -0
  182. package/src/patterns/add/bn.ts +70 -0
  183. package/src/patterns/add/hi.ts +69 -0
  184. package/src/patterns/add/index.ts +87 -0
  185. package/src/patterns/add/it.ts +61 -0
  186. package/src/patterns/add/ja.ts +93 -0
  187. package/src/patterns/add/ko.ts +74 -0
  188. package/src/patterns/add/ms.ts +30 -0
  189. package/src/patterns/add/pl.ts +62 -0
  190. package/src/patterns/add/ru.ts +62 -0
  191. package/src/patterns/add/th.ts +49 -0
  192. package/src/patterns/add/tl.ts +30 -0
  193. package/src/patterns/add/tr.ts +71 -0
  194. package/src/patterns/add/uk.ts +62 -0
  195. package/src/patterns/add/vi.ts +61 -0
  196. package/src/patterns/add/zh.ts +71 -0
  197. package/src/patterns/builders.ts +207 -0
  198. package/src/patterns/decrement/bn.ts +70 -0
  199. package/src/patterns/decrement/de.ts +42 -0
  200. package/src/patterns/decrement/hi.ts +68 -0
  201. package/src/patterns/decrement/index.ts +79 -0
  202. package/src/patterns/decrement/it.ts +69 -0
  203. package/src/patterns/decrement/ms.ts +30 -0
  204. package/src/patterns/decrement/pl.ts +58 -0
  205. package/src/patterns/decrement/ru.ts +58 -0
  206. package/src/patterns/decrement/th.ts +49 -0
  207. package/src/patterns/decrement/tl.ts +30 -0
  208. package/src/patterns/decrement/tr.ts +48 -0
  209. package/src/patterns/decrement/uk.ts +58 -0
  210. package/src/patterns/decrement/vi.ts +61 -0
  211. package/src/patterns/decrement/zh.ts +32 -0
  212. package/src/patterns/en.ts +302 -0
  213. package/src/patterns/event-handler/ar.ts +151 -0
  214. package/src/patterns/event-handler/bn.ts +72 -0
  215. package/src/patterns/event-handler/de.ts +117 -0
  216. package/src/patterns/event-handler/en.ts +117 -0
  217. package/src/patterns/event-handler/es.ts +136 -0
  218. package/src/patterns/event-handler/fr.ts +117 -0
  219. package/src/patterns/event-handler/hi.ts +64 -0
  220. package/src/patterns/event-handler/id.ts +117 -0
  221. package/src/patterns/event-handler/index.ts +119 -0
  222. package/src/patterns/event-handler/it.ts +54 -0
  223. package/src/patterns/event-handler/ja.ts +118 -0
  224. package/src/patterns/event-handler/ko.ts +133 -0
  225. package/src/patterns/event-handler/ms.ts +30 -0
  226. package/src/patterns/event-handler/pl.ts +62 -0
  227. package/src/patterns/event-handler/pt.ts +117 -0
  228. package/src/patterns/event-handler/qu.ts +66 -0
  229. package/src/patterns/event-handler/ru.ts +62 -0
  230. package/src/patterns/event-handler/shared.ts +270 -0
  231. package/src/patterns/event-handler/sw.ts +117 -0
  232. package/src/patterns/event-handler/th.ts +53 -0
  233. package/src/patterns/event-handler/tl.ts +30 -0
  234. package/src/patterns/event-handler/tr.ts +170 -0
  235. package/src/patterns/event-handler/uk.ts +62 -0
  236. package/src/patterns/event-handler/vi.ts +61 -0
  237. package/src/patterns/event-handler/zh.ts +150 -0
  238. package/src/patterns/get/ar.ts +49 -0
  239. package/src/patterns/get/bn.ts +47 -0
  240. package/src/patterns/get/de.ts +32 -0
  241. package/src/patterns/get/hi.ts +52 -0
  242. package/src/patterns/get/index.ts +83 -0
  243. package/src/patterns/get/it.ts +56 -0
  244. package/src/patterns/get/ja.ts +53 -0
  245. package/src/patterns/get/ko.ts +53 -0
  246. package/src/patterns/get/ms.ts +30 -0
  247. package/src/patterns/get/pl.ts +57 -0
  248. package/src/patterns/get/ru.ts +57 -0
  249. package/src/patterns/get/th.ts +29 -0
  250. package/src/patterns/get/tl.ts +30 -0
  251. package/src/patterns/get/uk.ts +57 -0
  252. package/src/patterns/get/vi.ts +48 -0
  253. package/src/patterns/grammar-transformed/index.ts +39 -0
  254. package/src/patterns/grammar-transformed/ja.ts +1713 -0
  255. package/src/patterns/grammar-transformed/ko.ts +1311 -0
  256. package/src/patterns/grammar-transformed/tr.ts +1067 -0
  257. package/src/patterns/hide/ar.ts +67 -0
  258. package/src/patterns/hide/bn.ts +47 -0
  259. package/src/patterns/hide/de.ts +36 -0
  260. package/src/patterns/hide/hi.ts +61 -0
  261. package/src/patterns/hide/index.ts +91 -0
  262. package/src/patterns/hide/it.ts +56 -0
  263. package/src/patterns/hide/ja.ts +69 -0
  264. package/src/patterns/hide/ko.ts +69 -0
  265. package/src/patterns/hide/ms.ts +30 -0
  266. package/src/patterns/hide/pl.ts +57 -0
  267. package/src/patterns/hide/ru.ts +57 -0
  268. package/src/patterns/hide/th.ts +29 -0
  269. package/src/patterns/hide/tl.ts +30 -0
  270. package/src/patterns/hide/tr.ts +65 -0
  271. package/src/patterns/hide/uk.ts +57 -0
  272. package/src/patterns/hide/vi.ts +56 -0
  273. package/src/patterns/hide/zh.ts +68 -0
  274. package/src/patterns/increment/bn.ts +70 -0
  275. package/src/patterns/increment/de.ts +36 -0
  276. package/src/patterns/increment/hi.ts +68 -0
  277. package/src/patterns/increment/index.ts +79 -0
  278. package/src/patterns/increment/it.ts +69 -0
  279. package/src/patterns/increment/ms.ts +30 -0
  280. package/src/patterns/increment/pl.ts +58 -0
  281. package/src/patterns/increment/ru.ts +58 -0
  282. package/src/patterns/increment/th.ts +49 -0
  283. package/src/patterns/increment/tl.ts +30 -0
  284. package/src/patterns/increment/tr.ts +52 -0
  285. package/src/patterns/increment/uk.ts +58 -0
  286. package/src/patterns/increment/vi.ts +61 -0
  287. package/src/patterns/increment/zh.ts +32 -0
  288. package/src/patterns/index.ts +84 -0
  289. package/src/patterns/languages/en/control-flow.ts +93 -0
  290. package/src/patterns/languages/en/fetch.ts +62 -0
  291. package/src/patterns/languages/en/index.ts +42 -0
  292. package/src/patterns/languages/en/repeat.ts +67 -0
  293. package/src/patterns/languages/en/set.ts +48 -0
  294. package/src/patterns/languages/en/swap.ts +38 -0
  295. package/src/patterns/languages/en/temporal.ts +57 -0
  296. package/src/patterns/put/ar.ts +74 -0
  297. package/src/patterns/put/bn.ts +53 -0
  298. package/src/patterns/put/en.ts +74 -0
  299. package/src/patterns/put/es.ts +74 -0
  300. package/src/patterns/put/hi.ts +69 -0
  301. package/src/patterns/put/id.ts +96 -0
  302. package/src/patterns/put/index.ts +99 -0
  303. package/src/patterns/put/it.ts +56 -0
  304. package/src/patterns/put/ja.ts +75 -0
  305. package/src/patterns/put/ko.ts +67 -0
  306. package/src/patterns/put/ms.ts +30 -0
  307. package/src/patterns/put/pl.ts +81 -0
  308. package/src/patterns/put/ru.ts +85 -0
  309. package/src/patterns/put/th.ts +32 -0
  310. package/src/patterns/put/tl.ts +30 -0
  311. package/src/patterns/put/tr.ts +67 -0
  312. package/src/patterns/put/uk.ts +85 -0
  313. package/src/patterns/put/vi.ts +72 -0
  314. package/src/patterns/put/zh.ts +62 -0
  315. package/src/patterns/registry.ts +163 -0
  316. package/src/patterns/remove/ar.ts +71 -0
  317. package/src/patterns/remove/bn.ts +68 -0
  318. package/src/patterns/remove/hi.ts +69 -0
  319. package/src/patterns/remove/index.ts +87 -0
  320. package/src/patterns/remove/it.ts +69 -0
  321. package/src/patterns/remove/ja.ts +74 -0
  322. package/src/patterns/remove/ko.ts +78 -0
  323. package/src/patterns/remove/ms.ts +30 -0
  324. package/src/patterns/remove/pl.ts +62 -0
  325. package/src/patterns/remove/ru.ts +62 -0
  326. package/src/patterns/remove/th.ts +49 -0
  327. package/src/patterns/remove/tl.ts +30 -0
  328. package/src/patterns/remove/tr.ts +78 -0
  329. package/src/patterns/remove/uk.ts +62 -0
  330. package/src/patterns/remove/vi.ts +61 -0
  331. package/src/patterns/remove/zh.ts +72 -0
  332. package/src/patterns/set/ar.ts +84 -0
  333. package/src/patterns/set/bn.ts +53 -0
  334. package/src/patterns/set/de.ts +84 -0
  335. package/src/patterns/set/es.ts +92 -0
  336. package/src/patterns/set/fr.ts +88 -0
  337. package/src/patterns/set/hi.ts +56 -0
  338. package/src/patterns/set/id.ts +84 -0
  339. package/src/patterns/set/index.ts +107 -0
  340. package/src/patterns/set/it.ts +56 -0
  341. package/src/patterns/set/ja.ts +86 -0
  342. package/src/patterns/set/ko.ts +85 -0
  343. package/src/patterns/set/ms.ts +30 -0
  344. package/src/patterns/set/pl.ts +57 -0
  345. package/src/patterns/set/pt.ts +84 -0
  346. package/src/patterns/set/ru.ts +57 -0
  347. package/src/patterns/set/th.ts +31 -0
  348. package/src/patterns/set/tl.ts +30 -0
  349. package/src/patterns/set/tr.ts +107 -0
  350. package/src/patterns/set/uk.ts +57 -0
  351. package/src/patterns/set/vi.ts +53 -0
  352. package/src/patterns/set/zh.ts +84 -0
  353. package/src/patterns/show/ar.ts +67 -0
  354. package/src/patterns/show/bn.ts +47 -0
  355. package/src/patterns/show/de.ts +32 -0
  356. package/src/patterns/show/fr.ts +32 -0
  357. package/src/patterns/show/hi.ts +61 -0
  358. package/src/patterns/show/index.ts +95 -0
  359. package/src/patterns/show/it.ts +56 -0
  360. package/src/patterns/show/ja.ts +69 -0
  361. package/src/patterns/show/ko.ts +73 -0
  362. package/src/patterns/show/ms.ts +30 -0
  363. package/src/patterns/show/pl.ts +57 -0
  364. package/src/patterns/show/ru.ts +57 -0
  365. package/src/patterns/show/th.ts +29 -0
  366. package/src/patterns/show/tl.ts +30 -0
  367. package/src/patterns/show/tr.ts +65 -0
  368. package/src/patterns/show/uk.ts +57 -0
  369. package/src/patterns/show/vi.ts +56 -0
  370. package/src/patterns/show/zh.ts +68 -0
  371. package/src/patterns/take/ar.ts +51 -0
  372. package/src/patterns/take/index.ts +31 -0
  373. package/src/patterns/toggle/ar.ts +61 -0
  374. package/src/patterns/toggle/bn.ts +70 -0
  375. package/src/patterns/toggle/en.ts +61 -0
  376. package/src/patterns/toggle/es.ts +61 -0
  377. package/src/patterns/toggle/hi.ts +80 -0
  378. package/src/patterns/toggle/index.ts +95 -0
  379. package/src/patterns/toggle/it.ts +69 -0
  380. package/src/patterns/toggle/ja.ts +156 -0
  381. package/src/patterns/toggle/ko.ts +113 -0
  382. package/src/patterns/toggle/ms.ts +30 -0
  383. package/src/patterns/toggle/pl.ts +62 -0
  384. package/src/patterns/toggle/ru.ts +62 -0
  385. package/src/patterns/toggle/th.ts +50 -0
  386. package/src/patterns/toggle/tl.ts +30 -0
  387. package/src/patterns/toggle/tr.ts +88 -0
  388. package/src/patterns/toggle/uk.ts +62 -0
  389. package/src/patterns/toggle/vi.ts +61 -0
  390. package/src/patterns/toggle/zh.ts +99 -0
  391. package/src/public-api.ts +286 -0
  392. package/src/registry.ts +441 -0
  393. package/src/tokenizers/arabic.ts +723 -0
  394. package/src/tokenizers/base.ts +1300 -0
  395. package/src/tokenizers/bengali.ts +289 -0
  396. package/src/tokenizers/chinese.ts +481 -0
  397. package/src/tokenizers/english.ts +416 -0
  398. package/src/tokenizers/french.ts +326 -0
  399. package/src/tokenizers/german.ts +324 -0
  400. package/src/tokenizers/hindi.ts +319 -0
  401. package/src/tokenizers/index.ts +127 -0
  402. package/src/tokenizers/indonesian.ts +306 -0
  403. package/src/tokenizers/italian.ts +458 -0
  404. package/src/tokenizers/japanese.ts +447 -0
  405. package/src/tokenizers/korean.ts +642 -0
  406. package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
  407. package/src/tokenizers/morphology/french-normalizer.ts +268 -0
  408. package/src/tokenizers/morphology/german-normalizer.ts +256 -0
  409. package/src/tokenizers/morphology/index.ts +46 -0
  410. package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
  411. package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
  412. package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
  413. package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
  414. package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
  415. package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
  416. package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
  417. package/src/tokenizers/morphology/types.ts +211 -0
  418. package/src/tokenizers/ms.ts +198 -0
  419. package/src/tokenizers/polish.ts +354 -0
  420. package/src/tokenizers/portuguese.ts +304 -0
  421. package/src/tokenizers/quechua.ts +339 -0
  422. package/src/tokenizers/russian.ts +375 -0
  423. package/src/tokenizers/spanish.ts +403 -0
  424. package/src/tokenizers/swahili.ts +303 -0
  425. package/src/tokenizers/thai.ts +236 -0
  426. package/src/tokenizers/tl.ts +198 -0
  427. package/src/tokenizers/turkish.ts +411 -0
  428. package/src/tokenizers/ukrainian.ts +369 -0
  429. package/src/tokenizers/vietnamese.ts +410 -0
  430. package/src/types/grammar-types.ts +617 -0
  431. package/src/types/unified-profile.ts +267 -0
  432. package/src/types.ts +709 -0
  433. package/src/utils/confidence-calculator.ts +147 -0
  434. package/src/validators/command-validator.ts +380 -0
  435. package/src/validators/index.ts +15 -0
@@ -0,0 +1,723 @@
1
+ /**
2
+ * Arabic Tokenizer
3
+ *
4
+ * Tokenizes Arabic hyperscript input.
5
+ * Arabic is challenging because:
6
+ * - Right-to-left (RTL) text direction
7
+ * - Prefix prepositions that attach to words (بـ, لـ, كـ)
8
+ * - Root-pattern morphology
9
+ * - CSS selectors are LTR islands within RTL text
10
+ */
11
+
12
+ import type { LanguageToken, TokenKind, TokenStream } from '../types';
13
+ import {
14
+ BaseTokenizer,
15
+ TokenStreamImpl,
16
+ createToken,
17
+ createPosition,
18
+ createUnicodeRangeClassifier,
19
+ isWhitespace,
20
+ isSelectorStart,
21
+ isQuote,
22
+ isDigit,
23
+ isAsciiIdentifierChar,
24
+ isUrlStart,
25
+ type KeywordEntry,
26
+ type TimeUnitMapping,
27
+ } from './base';
28
+ import { ArabicMorphologicalNormalizer } from './morphology/arabic-normalizer';
29
+ import { arabicProfile } from '../generators/profiles/arabic';
30
+
31
+ // =============================================================================
32
+ // Arabic Character Classification
33
+ // =============================================================================
34
+
35
/**
 * Character classifier for Arabic script.
 *
 * Built by `createUnicodeRangeClassifier` from the code-point ranges below
 * (each pair is a [start, end] range; presumably inclusive, since the pairs
 * match the first and last code points of the named Unicode blocks).
 *
 * NOTE(review): only the five blocks listed here are covered; other
 * Arabic-related blocks (e.g. Arabic Extended-B) are not in the list.
 */
const isArabic = createUnicodeRangeClassifier([
  // Arabic
  [0x0600, 0x06ff],
  // Arabic Supplement
  [0x0750, 0x077f],
  // Arabic Extended-A
  [0x08a0, 0x08ff],
  // Arabic Presentation Forms-A
  [0xfb50, 0xfdff],
  // Arabic Presentation Forms-B
  [0xfe70, 0xfeff],
]);
43
+
44
+ // =============================================================================
45
+ // Arabic Prefixes and Prepositions
46
+ // =============================================================================
47
+
48
/**
 * Prefix particles that are written attached to the word that follows them.
 *
 * Every entry is the particle letter followed by a tatweel (ـ, U+0640); the
 * tatweel is the marker used in patterns to show that the particle attaches
 * to the next word.
 */
const ATTACHED_PREFIXES = new Set<string>([
  // bi- (with, by)
  'بـ',
  // li- (to, for)
  'لـ',
  // ka- (like, as)
  'كـ',
  // wa- (and)
  'وـ',
]);
58
+
59
/**
 * Lookup table of Arabic proclitics: short particles written directly onto
 * the front of the following word.
 *
 * Each key is the bare proclitic (no tatweel); each value carries the
 * normalized English gloss and the token type to use for it:
 *
 * - single-letter conjunctions (و, ف), intended to become separate
 *   conjunction tokens so polysyndetic coordination (A وB وC) can be handled;
 * - single-letter attached prepositions (ب, ل, ك);
 * - two-letter conjunction + preposition sequences (ول, وب, فل, ...), glossed
 *   as `<conjunction>-<preposition>` and typed as 'conjunction'.
 *
 * The code that consumes this table is outside this section.
 *
 * @see NATIVE_REVIEW_NEEDED.md for implementation details
 */
const PROCLITICS = new Map<string, { normalized: string; type: 'conjunction' | 'preposition' }>(
  (
    [
      // Conjunctions (single character)
      ['و', 'and', 'conjunction'], // wa - "and"
      ['ف', 'then', 'conjunction'], // fa - "then/so"

      // Attached prefix prepositions
      ['ب', 'with', 'preposition'], // bi- (with, by)
      ['ل', 'to', 'preposition'], // li- (to, for)
      ['ك', 'like', 'preposition'], // ka- (like, as)

      // Multi-proclitic sequences (conjunction + preposition)
      ['ول', 'and-to', 'conjunction'], // wa + li-
      ['وب', 'and-with', 'conjunction'], // wa + bi-
      ['وك', 'and-like', 'conjunction'], // wa + ka-
      ['فل', 'then-to', 'conjunction'], // fa + li-
      ['فب', 'then-with', 'conjunction'], // fa + bi-
      ['فك', 'then-like', 'conjunction'], // fa + ka-
    ] as const
  ).map(([proclitic, normalized, type]) => [proclitic, { normalized, type }] as const),
);
89
+
90
/**
 * Metadata recorded for each Arabic temporal marker (event-trigger keyword).
 *
 * - `normalized`: the keyword the marker maps to (every entry below uses 'on').
 * - `formality`: register of the marker:
 *     'formal'    - Modern Standard Arabic (MSA), written/formal contexts;
 *     'neutral'   - common in both MSA and dialects;
 *     'dialectal' - informal/colloquial, common in spoken Arabic.
 * - `confidence`: how reliably the marker signals an event trigger ("on"
 *   event). Formal markers are given higher values than dialectal ones.
 * - `description`: short English gloss.
 */
interface TemporalMarkerMetadata {
  readonly normalized: string;
  readonly formality: 'formal' | 'neutral' | 'dialectal';
  readonly confidence: number;
  readonly description: string;
}

/**
 * Temporal markers keyed by their Arabic spelling.
 *
 * Written as rows of [marker, formality, confidence, description]; every row
 * is expanded into a `TemporalMarkerMetadata` whose `normalized` is 'on'.
 */
const TEMPORAL_MARKERS = new Map<string, TemporalMarkerMetadata>(
  (
    [
      ['عندما', 'formal', 0.95, 'when (formal MSA)'],
      ['حينما', 'formal', 0.93, 'when/whenever (formal)'],
      ['عند', 'neutral', 0.88, 'at/when (neutral)'],
      ['حين', 'neutral', 0.85, 'when/time (neutral)'],
      ['لمّا', 'dialectal', 0.7, 'when (dialectal, with shadda)'],
      ['لما', 'dialectal', 0.68, 'when (dialectal, no diacritic)'],
      ['لدى', 'neutral', 0.82, 'at/with (temporal)'],
    ] as const
  ).map(
    ([marker, formality, confidence, description]): [string, TemporalMarkerMetadata] => [
      marker,
      { normalized: 'on', formality, confidence, description },
    ],
  ),
);
173
+
174
/**
 * Standalone (separately written) Arabic prepositions.
 *
 * Temporal markers such as عند and لدى are deliberately absent: they live in
 * TEMPORAL_MARKERS, where they carry formality metadata.
 */
const PREPOSITIONS = new Set<string>([
  // fī (in)
  'في',
  // ʿalā (on)
  'على',
  // min (from)
  'من',
  // ilā (to)
  'إلى',
  // ilā (alternative spelling without hamza)
  'الى',
  // ('عند' is intentionally not listed - it is a temporal marker with metadata)
  // maʿa (with)
  'مع',
  // ʿan (about, from)
  'عن',
  // qabl (before)
  'قبل',
  // baʿd (after)
  'بعد',
  // bayn (between)
  'بين',
]);
192
+
193
+ // =============================================================================
194
+ // Arabic Extras (keywords not in profile)
195
+ // =============================================================================
196
+
197
/**
 * Keyword entries supplied in addition to the language profile:
 * - literals (true, false, null, undefined)
 * - positional words
 * - event names
 * - references (me, it, result, ...)
 * - time units
 * - extra synonyms and spelling variants of commands
 * - control-flow helpers and modifiers
 *
 * Written as rows of [native, normalized]; each row becomes one
 * `{ native, normalized }` entry, in the order listed.
 *
 * Temporal markers (عندما, حينما, etc.) are not listed here; they are kept in
 * the TEMPORAL_MARKERS map together with their formality metadata.
 */
const ARABIC_EXTRAS: KeywordEntry[] = (
  [
    // Values/Literals
    ['صحيح', 'true'],
    ['خطأ', 'false'],
    ['null', 'null'],
    ['فارغ', 'null'],
    ['غير معرف', 'undefined'],

    // Positional
    ['الأول', 'first'],
    ['أول', 'first'],
    ['الأخير', 'last'],
    ['آخر', 'last'],
    ['التالي', 'next'],
    ['السابق', 'previous'],
    ['الأقرب', 'closest'],
    ['الأب', 'parent'],

    // Events
    ['النقر', 'click'],
    ['نقر', 'click'],
    ['الإدخال', 'input'],
    ['إدخال', 'input'],
    ['التغيير', 'change'],
    ['تغيير', 'change'],
    ['الإرسال', 'submit'],
    ['إرسال', 'submit'],
    ['التركيز', 'focus'],
    ['فقدان التركيز', 'blur'],
    ['ضغط', 'keydown'],
    ['رفع', 'keyup'],
    ['تمرير الفأرة', 'mouseover'],
    ['مغادرة الفأرة', 'mouseout'],
    ['تحميل', 'load'],
    ['تمرير', 'scroll'],

    // References
    ['أنا', 'me'],
    ['هو', 'it'],
    ['هي', 'it'],
    ['النتيجة', 'result'],
    ['الحدث', 'event'],
    ['الهدف', 'target'],

    // Time units
    ['ثانية', 's'],
    ['ثواني', 's'],
    ['ملي ثانية', 'ms'],
    ['دقيقة', 'm'],
    ['دقائق', 'm'],
    ['ساعة', 'h'],
    ['ساعات', 'h'],

    // Additional spelling variants (without diacritics)
    ['بدل', 'toggle'],
    ['غير', 'toggle'],
    ['اضف', 'add'],
    ['ازل', 'remove'],
    ['اضع', 'put'],
    ['يضع', 'put'],
    ['اجعل', 'put'],
    ['عين', 'set'],
    ['زد', 'increment'],
    ['ارفع', 'increment'],
    ['انقص', 'decrement'],
    ['قلل', 'decrement'],
    ['سجل', 'log'],
    ['اظهر', 'show'],
    ['اعرض', 'show'],
    ['اخف', 'hide'],
    ['اخفي', 'hide'],
    ['شغل', 'trigger'],
    ['ارسل', 'send'],
    ['ركز', 'focus'],
    ['شوش', 'blur'],
    ['اذا', 'if'],
    ['لو', 'if'],
    ['والا', 'else'],
    ['توقف', 'halt'],
    ['انسخ', 'clone'],

    // Control flow helpers
    ['إذن', 'then'],
    ['فإن', 'then'],
    ['نهاية', 'end'],

    // Modifiers
    ['قبل', 'before'],
    ['بعد', 'after'],
  ] as const
).map(([native, normalized]) => ({ native, normalized }));
299
+
300
+ // =============================================================================
301
+ // Arabic Time Units
302
+ // =============================================================================
303
+
304
/**
 * Arabic time-unit spellings that may follow a numeric literal, each mapped to
 * its canonical suffix (`ms`, `m`, `s`, `h`).
 *
 * - Entries are ordered longest pattern first so that a longer spelling is
 *   tried before any shorter one.
 * - The millisecond unit is itself written as two words; both the
 *   space-separated (`ملي ثانية`) and underscore-joined (`ملي_ثانية`) spellings
 *   are listed.
 * - `length` must equal the pattern's length in UTF-16 code units
 *   (`pattern.length`). Both millisecond spellings are 9 code units long.
 *   NOTE(review): `length` is presumably how many characters the shared number
 *   parser consumes for the unit — confirm against `tryNumberWithTimeUnits`.
 */
const ARABIC_TIME_UNITS: readonly TimeUnitMapping[] = [
  { pattern: 'ملي ثانية', suffix: 'ms', length: 9, caseInsensitive: false },
  { pattern: 'ملي_ثانية', suffix: 'ms', length: 9, caseInsensitive: false },
  { pattern: 'دقائق', suffix: 'm', length: 5, caseInsensitive: false },
  { pattern: 'دقيقة', suffix: 'm', length: 5, caseInsensitive: false },
  { pattern: 'ثواني', suffix: 's', length: 5, caseInsensitive: false },
  { pattern: 'ثانية', suffix: 's', length: 5, caseInsensitive: false },
  { pattern: 'ساعات', suffix: 'h', length: 5, caseInsensitive: false },
  { pattern: 'ساعة', suffix: 'h', length: 4, caseInsensitive: false },
];
319
+
320
+ // =============================================================================
321
+ // Arabic Tokenizer Implementation
322
+ // =============================================================================
323
+
324
/**
 * Tokenizer for Arabic input.
 *
 * Extends `BaseTokenizer`, registers keywords from `arabicProfile` plus
 * `ARABIC_EXTRAS`, and installs an `ArabicMorphologicalNormalizer`. The token
 * stream it produces is tagged with the language code `'ar'`.
 */
export class ArabicTokenizer extends BaseTokenizer {
  readonly language = 'ar';
  readonly direction = 'rtl' as const;

  /**
   * Prepositions ordered longest first, so a longer (e.g. multi-word)
   * preposition is tried before any shorter one that is a prefix of it.
   *
   * Computed once per instance instead of on every `tryPreposition` call.
   * Assumes `PREPOSITIONS` is not mutated after the tokenizer is constructed.
   */
  private readonly sortedPrepositions = Array.from(PREPOSITIONS).sort(
    (a, b) => b.length - a.length
  );

  constructor() {
    super();
    this.initializeKeywordsFromProfile(arabicProfile, ARABIC_EXTRAS);
    // Morphological normalizer used for prefix/suffix stripping.
    this.normalizer = new ArabicMorphologicalNormalizer();
  }

  /**
   * Split `input` into a token stream.
   *
   * At each position the recognisers below are tried in a fixed order and the
   * first one that yields a token wins. The order is significant (for example,
   * prepositions are tried before generic Arabic words). Characters that no
   * recogniser accepts are skipped silently.
   *
   * @param input Raw source text.
   * @returns A `TokenStreamImpl` over the recognised tokens, tagged `'ar'`.
   */
  tokenize(input: string): TokenStream {
    const tokens: LanguageToken[] = [];
    let pos = 0;

    while (pos < input.length) {
      // Whitespace only separates tokens.
      if (isWhitespace(input[pos])) {
        pos++;
        continue;
      }

      // CSS selectors are LTR islands inside the RTL text.
      if (isSelectorStart(input[pos])) {
        // Event modifiers (.once, .debounce(), ...) share the selector start
        // character, so they must be tried before plain selectors.
        const modifierToken = this.tryEventModifier(input, pos);
        if (modifierToken) {
          tokens.push(modifierToken);
          pos = modifierToken.position.end;
          continue;
        }

        const selectorToken = this.trySelector(input, pos);
        if (selectorToken) {
          tokens.push(selectorToken);
          pos = selectorToken.position.end;
          continue;
        }
      }

      // String literal.
      if (isQuote(input[pos])) {
        const stringToken = this.tryString(input, pos);
        if (stringToken) {
          tokens.push(stringToken);
          pos = stringToken.position.end;
          continue;
        }
      }

      // URL (/path, ./path, http://, ...).
      if (isUrlStart(input, pos)) {
        const urlToken = this.tryUrl(input, pos);
        if (urlToken) {
          tokens.push(urlToken);
          pos = urlToken.position.end;
          continue;
        }
      }

      // Number, optionally followed by an Arabic time unit.
      if (isDigit(input[pos])) {
        const numberToken = this.extractArabicNumber(input, pos);
        if (numberToken) {
          tokens.push(numberToken);
          pos = numberToken.position.end;
          continue;
        }
      }

      // Variable reference (:varname).
      const varToken = this.tryVariableRef(input, pos);
      if (varToken) {
        tokens.push(varToken);
        pos = varToken.position.end;
        continue;
      }

      // Standalone preposition (longest candidate first).
      const prepToken = this.tryPreposition(input, pos);
      if (prepToken) {
        tokens.push(prepToken);
        pos = prepToken.position.end;
        continue;
      }

      // Arabic word, possibly with an attached proclitic.
      if (isArabic(input[pos])) {
        const procliticResult = this.tryProclitic(input, pos);
        if (procliticResult) {
          tokens.push(procliticResult.conjunction);
          pos = procliticResult.conjunction.position.end;
          // The remainder of the word is picked up by the next iteration.
          continue;
        }

        const wordToken = this.extractArabicWord(input, pos);
        if (wordToken) {
          tokens.push(wordToken);
          pos = wordToken.position.end;
          continue;
        }
      }

      // ASCII word (mixed-language content).
      if (isAsciiIdentifierChar(input[pos])) {
        const asciiToken = this.extractAsciiWord(input, pos);
        if (asciiToken) {
          tokens.push(asciiToken);
          pos = asciiToken.position.end;
          continue;
        }
      }

      // Nothing matched: drop the character and move on.
      pos++;
    }

    return new TokenStreamImpl(tokens, 'ar');
  }

  /**
   * Classify an already-extracted token string.
   *
   * Checks run in priority order: preposition, keyword, selector prefix
   * (`#`, `.`, `[`), quoted literal, leading digit; anything else is an
   * identifier.
   *
   * @param token Token text.
   * @returns The token kind.
   */
  classifyToken(token: string): TokenKind {
    if (PREPOSITIONS.has(token)) return 'particle';
    if (this.isKeyword(token)) return 'keyword';
    if (token.startsWith('#') || token.startsWith('.') || token.startsWith('[')) return 'selector';
    if (token.startsWith('"') || token.startsWith("'")) return 'literal';
    if (/^\d/.test(token)) return 'literal';

    return 'identifier';
  }

  /**
   * Try to match a standalone Arabic preposition starting at `pos`.
   *
   * A candidate only matches when it is not immediately followed by another
   * Arabic character, i.e. it is at end of input or followed by whitespace or
   * any non-Arabic character. The returned token carries
   * `metadata.prepositionValue` for later disambiguation.
   *
   * @param input Source text.
   * @param pos Index at which the preposition must start.
   * @returns A `particle` token, or `null` when no preposition matches.
   */
  private tryPreposition(input: string, pos: number): LanguageToken | null {
    for (const prep of this.sortedPrepositions) {
      if (!input.startsWith(prep, pos)) continue;

      const nextPos = pos + prep.length;
      const isStandalone =
        nextPos >= input.length || isWhitespace(input[nextPos]) || !isArabic(input[nextPos]);
      if (!isStandalone) continue;

      const token = createToken(prep, 'particle', createPosition(pos, nextPos));
      return {
        ...token,
        metadata: {
          prepositionValue: prep,
        },
      };
    }
    return null;
  }

  /**
   * Try to split off a proclitic (conjunction or preposition) that is attached
   * to the word starting at `pos`.
   *
   * Arabic proclitics attach directly to words without a space:
   * - والنقر → و + النقر (and + the-click)
   * - فالتبديل → ف + التبديل (then + the-toggle)
   * - بالنقر → ب + النقر (with + the-click)
   * - ولالنقر → و + ل + النقر (and + to + the-click)
   *
   * Only the proclitic token is returned; the caller resumes tokenizing right
   * after it, so the rest of the word is handled on the next iteration.
   *
   * Returns `null` when:
   * - the whole word is a keyword, temporal marker or preposition
   *   (e.g. بدل must NOT be split into ب + دل);
   * - the text at `pos` is not a known proclitic;
   * - the proclitic is not directly followed by an Arabic character;
   * - fewer than 2 Arabic characters follow the proclitic (guards against
   *   false positives on very short words).
   *
   * A two-character proclitic is preferred, unless splitting off only the
   * first character would leave a keyword (e.g. "وبدل" is "و" + "بدل", not
   * "وب" + "دل").
   *
   * @see NATIVE_REVIEW_NEEDED.md for implementation rationale
   *
   * @param input Source text.
   * @param pos Index of the first character of the word.
   * @returns The proclitic token wrapped as `{ conjunction }`, or `null`.
   */
  private tryProclitic(input: string, pos: number): { conjunction: LanguageToken } | null {
    // Find the end of the whole word (tatweel counts as part of the word).
    let wordEnd = pos;
    while (wordEnd < input.length && (isArabic(input[wordEnd]) || input[wordEnd] === 'ـ')) {
      wordEnd++;
    }
    const fullWord = input.slice(pos, wordEnd);

    // Never split a word that is itself recognised as a unit; the regular
    // word/preposition extraction handles it.
    if (
      this.lookupKeyword(fullWord) ||
      TEMPORAL_MARKERS.has(fullWord) ||
      PREPOSITIONS.has(fullWord)
    ) {
      return null;
    }

    const singleCharEntry = PROCLITICS.get(input[pos]);

    // Longest match first: two-character sequences (ول, وب, فل, فب, ...).
    if (pos + 2 <= input.length) {
      const twoChar = input.slice(pos, pos + 2);
      const twoCharEntry = PROCLITICS.get(twoChar);

      if (twoCharEntry && this.countArabicRun(input, pos + 2) >= 2) {
        // If peeling off just the first character leaves a keyword, skip the
        // two-character match and fall through to the single-character split.
        const singleCharLeavesKeyword =
          singleCharEntry && this.lookupKeyword(input.slice(pos + 1, wordEnd));
        if (!singleCharLeavesKeyword) {
          return { conjunction: this.createProcliticToken(twoChar, pos, twoCharEntry) };
        }
      }
    }

    // Single-character proclitic: must be attached to at least 2 further
    // Arabic characters (e.g. وو could be a typo, and short roots need
    // protection).
    if (!singleCharEntry || this.countArabicRun(input, pos + 1) < 2) {
      return null;
    }

    return { conjunction: this.createProcliticToken(input[pos], pos, singleCharEntry) };
  }

  /**
   * Count consecutive characters accepted by `isArabic` starting at `start`.
   * Returns 0 when `start` is past the end of input or is not on such a
   * character.
   */
  private countArabicRun(input: string, start: number): number {
    let end = start;
    while (end < input.length && isArabic(input[end])) {
      end++;
    }
    return end - start;
  }

  /**
   * Build the token for a proclitic `text` found at `start`. Entries typed
   * `'conjunction'` become `conjunction` tokens; every other entry type
   * becomes a `particle`.
   */
  private createProcliticToken(
    text: string,
    start: number,
    entry: NonNullable<ReturnType<typeof PROCLITICS.get>>
  ): LanguageToken {
    const tokenKind =
      entry.type === 'conjunction' ? ('conjunction' as const) : ('particle' as const);
    return createToken(
      text,
      tokenKind,
      createPosition(start, start + text.length),
      entry.normalized
    );
  }

  /**
   * Extract the Arabic word starting at `startPos` and classify it.
   *
   * Classification order: temporal marker (keyword carrying formality and
   * confidence metadata), direct keyword, preposition (particle carrying
   * `prepositionValue` metadata), keyword reached through morphological
   * normalization, and finally plain identifier.
   *
   * @param input Source text.
   * @param startPos Index of the first character of the word.
   * @returns The token, or `null` when no Arabic character is at `startPos`.
   */
  private extractArabicWord(input: string, startPos: number): LanguageToken | null {
    let pos = startPos;

    // NOTE(review): this loop has no effect — a matching attached prefix is
    // detected but deliberately left inside the word. It is kept because
    // ATTACHED_PREFIXES is declared outside this block; drop the loop and the
    // constant together if prefix extraction is never going to be added here.
    for (const prefix of ATTACHED_PREFIXES) {
      const basePrefix = prefix.replace('ـ', '');
      if (input.slice(pos, pos + basePrefix.length) === basePrefix) {
        // This is a prefix - extract it separately
        // For now, include it in the word
      }
    }

    // Consume the run of Arabic characters (tatweel included).
    while (pos < input.length && (isArabic(input[pos]) || input[pos] === 'ـ')) {
      pos++;
    }
    const word = input.slice(startPos, pos);

    if (!word) return null;

    // Temporal markers carry formality/confidence metadata.
    const temporalMarker = TEMPORAL_MARKERS.get(word);
    if (temporalMarker) {
      const token = createToken(
        word,
        'keyword',
        createPosition(startPos, pos),
        temporalMarker.normalized
      );
      return {
        ...token,
        metadata: {
          temporalFormality: temporalMarker.formality,
          temporalConfidence: temporalMarker.confidence,
        },
      };
    }

    // Direct keyword lookup.
    const keywordEntry = this.lookupKeyword(word);
    if (keywordEntry) {
      return createToken(word, 'keyword', createPosition(startPos, pos), keywordEntry.normalized);
    }

    // Preposition, with metadata for disambiguation.
    if (PREPOSITIONS.has(word)) {
      const token = createToken(word, 'particle', createPosition(startPos, pos));
      return {
        ...token,
        metadata: {
          prepositionValue: word,
        },
      };
    }

    // Conjugated/inflected forms via morphological normalization.
    const morphToken = this.tryMorphKeywordMatch(word, startPos, pos);
    if (morphToken) return morphToken;

    // Unrecognised word: plain identifier.
    return createToken(word, 'identifier', createPosition(startPos, pos));
  }

  /**
   * Extract a run of ASCII identifier characters starting at `startPos`.
   *
   * @param input Source text.
   * @param startPos Index of the first character of the word.
   * @returns An `identifier` token, or `null` when the run is empty.
   */
  private extractAsciiWord(input: string, startPos: number): LanguageToken | null {
    let pos = startPos;
    while (pos < input.length && isAsciiIdentifierChar(input[pos])) {
      pos++;
    }
    const word = input.slice(startPos, pos);

    if (!word) return null;

    return createToken(word, 'identifier', createPosition(startPos, pos));
  }

  /**
   * Extract a number, including an optional Arabic time-unit suffix.
   *
   * Delegates to `tryNumberWithTimeUnits` with `ARABIC_TIME_UNITS`, with sign
   * handling disabled and whitespace skipping enabled, since Arabic allows a
   * space between the number and its unit.
   *
   * @param input Source text.
   * @param startPos Index of the first digit.
   * @returns Whatever `tryNumberWithTimeUnits` yields: a token, or `null`.
   */
  private extractArabicNumber(input: string, startPos: number): LanguageToken | null {
    return this.tryNumberWithTimeUnits(input, startPos, ARABIC_TIME_UNITS, {
      allowSign: false,
      skipWhitespace: true,
    });
  }
}
719
+
720
/**
 * Shared `ArabicTokenizer` instance.
 *
 * Constructed eagerly when this module is evaluated, so the constructor's
 * keyword registration runs once at import time.
 */
export const arabicTokenizer = new ArabicTokenizer();