@lokascript/semantic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +686 -0
  3. package/dist/browser-ar.ar.global.js +2 -0
  4. package/dist/browser-core.core.global.js +2 -0
  5. package/dist/browser-de.de.global.js +2 -0
  6. package/dist/browser-east-asian.east-asian.global.js +2 -0
  7. package/dist/browser-en-tr.en-tr.global.js +2 -0
  8. package/dist/browser-en.en.global.js +2 -0
  9. package/dist/browser-es-en.es-en.global.js +2 -0
  10. package/dist/browser-es.es.global.js +2 -0
  11. package/dist/browser-fr.fr.global.js +2 -0
  12. package/dist/browser-id.id.global.js +2 -0
  13. package/dist/browser-ja.ja.global.js +2 -0
  14. package/dist/browser-ko.ko.global.js +2 -0
  15. package/dist/browser-lazy.lazy.global.js +2 -0
  16. package/dist/browser-priority.priority.global.js +2 -0
  17. package/dist/browser-pt.pt.global.js +2 -0
  18. package/dist/browser-qu.qu.global.js +2 -0
  19. package/dist/browser-sw.sw.global.js +2 -0
  20. package/dist/browser-tr.tr.global.js +2 -0
  21. package/dist/browser-western.western.global.js +2 -0
  22. package/dist/browser-zh.zh.global.js +2 -0
  23. package/dist/browser.global.js +3 -0
  24. package/dist/browser.global.js.map +1 -0
  25. package/dist/index.cjs +35051 -0
  26. package/dist/index.cjs.map +1 -0
  27. package/dist/index.d.cts +3426 -0
  28. package/dist/index.d.ts +3426 -0
  29. package/dist/index.js +34890 -0
  30. package/dist/index.js.map +1 -0
  31. package/dist/languages/ar.d.ts +78 -0
  32. package/dist/languages/ar.js +1622 -0
  33. package/dist/languages/ar.js.map +1 -0
  34. package/dist/languages/de.d.ts +38 -0
  35. package/dist/languages/de.js +1168 -0
  36. package/dist/languages/de.js.map +1 -0
  37. package/dist/languages/en.d.ts +44 -0
  38. package/dist/languages/en.js +3491 -0
  39. package/dist/languages/en.js.map +1 -0
  40. package/dist/languages/es.d.ts +52 -0
  41. package/dist/languages/es.js +1493 -0
  42. package/dist/languages/es.js.map +1 -0
  43. package/dist/languages/fr.d.ts +37 -0
  44. package/dist/languages/fr.js +1159 -0
  45. package/dist/languages/fr.js.map +1 -0
  46. package/dist/languages/id.d.ts +35 -0
  47. package/dist/languages/id.js +1152 -0
  48. package/dist/languages/id.js.map +1 -0
  49. package/dist/languages/ja.d.ts +53 -0
  50. package/dist/languages/ja.js +1430 -0
  51. package/dist/languages/ja.js.map +1 -0
  52. package/dist/languages/ko.d.ts +51 -0
  53. package/dist/languages/ko.js +1729 -0
  54. package/dist/languages/ko.js.map +1 -0
  55. package/dist/languages/pt.d.ts +37 -0
  56. package/dist/languages/pt.js +1127 -0
  57. package/dist/languages/pt.js.map +1 -0
  58. package/dist/languages/qu.d.ts +36 -0
  59. package/dist/languages/qu.js +1143 -0
  60. package/dist/languages/qu.js.map +1 -0
  61. package/dist/languages/sw.d.ts +35 -0
  62. package/dist/languages/sw.js +1147 -0
  63. package/dist/languages/sw.js.map +1 -0
  64. package/dist/languages/tr.d.ts +45 -0
  65. package/dist/languages/tr.js +1529 -0
  66. package/dist/languages/tr.js.map +1 -0
  67. package/dist/languages/zh.d.ts +58 -0
  68. package/dist/languages/zh.js +1257 -0
  69. package/dist/languages/zh.js.map +1 -0
  70. package/dist/types-C4dcj53L.d.ts +600 -0
  71. package/package.json +202 -0
  72. package/src/__test-utils__/index.ts +7 -0
  73. package/src/__test-utils__/test-helpers.ts +8 -0
  74. package/src/__types__/test-helpers.ts +122 -0
  75. package/src/analysis/index.ts +479 -0
  76. package/src/ast-builder/command-mappers.ts +1133 -0
  77. package/src/ast-builder/expression-parser/index.ts +41 -0
  78. package/src/ast-builder/expression-parser/parser.ts +563 -0
  79. package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
  80. package/src/ast-builder/expression-parser/types.ts +208 -0
  81. package/src/ast-builder/index.ts +536 -0
  82. package/src/ast-builder/value-converters.ts +172 -0
  83. package/src/bridge.ts +275 -0
  84. package/src/browser-ar.ts +162 -0
  85. package/src/browser-core.ts +231 -0
  86. package/src/browser-de.ts +162 -0
  87. package/src/browser-east-asian.ts +173 -0
  88. package/src/browser-en-tr.ts +165 -0
  89. package/src/browser-en.ts +157 -0
  90. package/src/browser-es-en.ts +200 -0
  91. package/src/browser-es.ts +170 -0
  92. package/src/browser-fr.ts +162 -0
  93. package/src/browser-id.ts +162 -0
  94. package/src/browser-ja.ts +162 -0
  95. package/src/browser-ko.ts +162 -0
  96. package/src/browser-lazy.ts +189 -0
  97. package/src/browser-priority.ts +214 -0
  98. package/src/browser-pt.ts +162 -0
  99. package/src/browser-qu.ts +162 -0
  100. package/src/browser-sw.ts +162 -0
  101. package/src/browser-tr.ts +162 -0
  102. package/src/browser-western.ts +181 -0
  103. package/src/browser-zh.ts +162 -0
  104. package/src/browser.ts +268 -0
  105. package/src/cache/index.ts +14 -0
  106. package/src/cache/semantic-cache.ts +344 -0
  107. package/src/core-bridge.ts +372 -0
  108. package/src/explicit/converter.ts +258 -0
  109. package/src/explicit/index.ts +18 -0
  110. package/src/explicit/parser.ts +236 -0
  111. package/src/explicit/renderer.ts +424 -0
  112. package/src/generators/command-schemas.ts +1636 -0
  113. package/src/generators/event-handler-generator.ts +109 -0
  114. package/src/generators/index.ts +117 -0
  115. package/src/generators/language-profiles.ts +139 -0
  116. package/src/generators/pattern-generator.ts +537 -0
  117. package/src/generators/profiles/arabic.ts +131 -0
  118. package/src/generators/profiles/bengali.ts +132 -0
  119. package/src/generators/profiles/chinese.ts +124 -0
  120. package/src/generators/profiles/english.ts +113 -0
  121. package/src/generators/profiles/french.ts +125 -0
  122. package/src/generators/profiles/german.ts +126 -0
  123. package/src/generators/profiles/hindi.ts +146 -0
  124. package/src/generators/profiles/index.ts +46 -0
  125. package/src/generators/profiles/indonesian.ts +125 -0
  126. package/src/generators/profiles/italian.ts +139 -0
  127. package/src/generators/profiles/japanese.ts +149 -0
  128. package/src/generators/profiles/korean.ts +127 -0
  129. package/src/generators/profiles/marker-templates.ts +288 -0
  130. package/src/generators/profiles/ms.ts +130 -0
  131. package/src/generators/profiles/polish.ts +249 -0
  132. package/src/generators/profiles/portuguese.ts +115 -0
  133. package/src/generators/profiles/quechua.ts +113 -0
  134. package/src/generators/profiles/russian.ts +260 -0
  135. package/src/generators/profiles/spanish.ts +130 -0
  136. package/src/generators/profiles/swahili.ts +129 -0
  137. package/src/generators/profiles/thai.ts +132 -0
  138. package/src/generators/profiles/tl.ts +128 -0
  139. package/src/generators/profiles/turkish.ts +124 -0
  140. package/src/generators/profiles/types.ts +165 -0
  141. package/src/generators/profiles/ukrainian.ts +270 -0
  142. package/src/generators/profiles/vietnamese.ts +133 -0
  143. package/src/generators/schema-error-codes.ts +160 -0
  144. package/src/generators/schema-validator.ts +391 -0
  145. package/src/index.ts +429 -0
  146. package/src/language-building-schema.ts +3170 -0
  147. package/src/language-loader.ts +394 -0
  148. package/src/languages/_all.ts +65 -0
  149. package/src/languages/ar.ts +15 -0
  150. package/src/languages/bn.ts +16 -0
  151. package/src/languages/de.ts +15 -0
  152. package/src/languages/en.ts +29 -0
  153. package/src/languages/es.ts +15 -0
  154. package/src/languages/fr.ts +15 -0
  155. package/src/languages/hi.ts +26 -0
  156. package/src/languages/id.ts +15 -0
  157. package/src/languages/index.ts +18 -0
  158. package/src/languages/it.ts +15 -0
  159. package/src/languages/ja.ts +15 -0
  160. package/src/languages/ko.ts +15 -0
  161. package/src/languages/ms.ts +16 -0
  162. package/src/languages/pl.ts +18 -0
  163. package/src/languages/pt.ts +15 -0
  164. package/src/languages/qu.ts +15 -0
  165. package/src/languages/ru.ts +26 -0
  166. package/src/languages/sw.ts +15 -0
  167. package/src/languages/th.ts +16 -0
  168. package/src/languages/tl.ts +16 -0
  169. package/src/languages/tr.ts +15 -0
  170. package/src/languages/uk.ts +26 -0
  171. package/src/languages/vi.ts +16 -0
  172. package/src/languages/zh.ts +15 -0
  173. package/src/parser/index.ts +15 -0
  174. package/src/parser/pattern-matcher.ts +1181 -0
  175. package/src/parser/semantic-parser.ts +573 -0
  176. package/src/parser/utils/index.ts +35 -0
  177. package/src/parser/utils/marker-resolution.ts +111 -0
  178. package/src/parser/utils/possessive-keywords.ts +43 -0
  179. package/src/parser/utils/role-positioning.ts +70 -0
  180. package/src/parser/utils/type-validation.ts +134 -0
  181. package/src/patterns/add/ar.ts +71 -0
  182. package/src/patterns/add/bn.ts +70 -0
  183. package/src/patterns/add/hi.ts +69 -0
  184. package/src/patterns/add/index.ts +87 -0
  185. package/src/patterns/add/it.ts +61 -0
  186. package/src/patterns/add/ja.ts +93 -0
  187. package/src/patterns/add/ko.ts +74 -0
  188. package/src/patterns/add/ms.ts +30 -0
  189. package/src/patterns/add/pl.ts +62 -0
  190. package/src/patterns/add/ru.ts +62 -0
  191. package/src/patterns/add/th.ts +49 -0
  192. package/src/patterns/add/tl.ts +30 -0
  193. package/src/patterns/add/tr.ts +71 -0
  194. package/src/patterns/add/uk.ts +62 -0
  195. package/src/patterns/add/vi.ts +61 -0
  196. package/src/patterns/add/zh.ts +71 -0
  197. package/src/patterns/builders.ts +207 -0
  198. package/src/patterns/decrement/bn.ts +70 -0
  199. package/src/patterns/decrement/de.ts +42 -0
  200. package/src/patterns/decrement/hi.ts +68 -0
  201. package/src/patterns/decrement/index.ts +79 -0
  202. package/src/patterns/decrement/it.ts +69 -0
  203. package/src/patterns/decrement/ms.ts +30 -0
  204. package/src/patterns/decrement/pl.ts +58 -0
  205. package/src/patterns/decrement/ru.ts +58 -0
  206. package/src/patterns/decrement/th.ts +49 -0
  207. package/src/patterns/decrement/tl.ts +30 -0
  208. package/src/patterns/decrement/tr.ts +48 -0
  209. package/src/patterns/decrement/uk.ts +58 -0
  210. package/src/patterns/decrement/vi.ts +61 -0
  211. package/src/patterns/decrement/zh.ts +32 -0
  212. package/src/patterns/en.ts +302 -0
  213. package/src/patterns/event-handler/ar.ts +151 -0
  214. package/src/patterns/event-handler/bn.ts +72 -0
  215. package/src/patterns/event-handler/de.ts +117 -0
  216. package/src/patterns/event-handler/en.ts +117 -0
  217. package/src/patterns/event-handler/es.ts +136 -0
  218. package/src/patterns/event-handler/fr.ts +117 -0
  219. package/src/patterns/event-handler/hi.ts +64 -0
  220. package/src/patterns/event-handler/id.ts +117 -0
  221. package/src/patterns/event-handler/index.ts +119 -0
  222. package/src/patterns/event-handler/it.ts +54 -0
  223. package/src/patterns/event-handler/ja.ts +118 -0
  224. package/src/patterns/event-handler/ko.ts +133 -0
  225. package/src/patterns/event-handler/ms.ts +30 -0
  226. package/src/patterns/event-handler/pl.ts +62 -0
  227. package/src/patterns/event-handler/pt.ts +117 -0
  228. package/src/patterns/event-handler/qu.ts +66 -0
  229. package/src/patterns/event-handler/ru.ts +62 -0
  230. package/src/patterns/event-handler/shared.ts +270 -0
  231. package/src/patterns/event-handler/sw.ts +117 -0
  232. package/src/patterns/event-handler/th.ts +53 -0
  233. package/src/patterns/event-handler/tl.ts +30 -0
  234. package/src/patterns/event-handler/tr.ts +170 -0
  235. package/src/patterns/event-handler/uk.ts +62 -0
  236. package/src/patterns/event-handler/vi.ts +61 -0
  237. package/src/patterns/event-handler/zh.ts +150 -0
  238. package/src/patterns/get/ar.ts +49 -0
  239. package/src/patterns/get/bn.ts +47 -0
  240. package/src/patterns/get/de.ts +32 -0
  241. package/src/patterns/get/hi.ts +52 -0
  242. package/src/patterns/get/index.ts +83 -0
  243. package/src/patterns/get/it.ts +56 -0
  244. package/src/patterns/get/ja.ts +53 -0
  245. package/src/patterns/get/ko.ts +53 -0
  246. package/src/patterns/get/ms.ts +30 -0
  247. package/src/patterns/get/pl.ts +57 -0
  248. package/src/patterns/get/ru.ts +57 -0
  249. package/src/patterns/get/th.ts +29 -0
  250. package/src/patterns/get/tl.ts +30 -0
  251. package/src/patterns/get/uk.ts +57 -0
  252. package/src/patterns/get/vi.ts +48 -0
  253. package/src/patterns/grammar-transformed/index.ts +39 -0
  254. package/src/patterns/grammar-transformed/ja.ts +1713 -0
  255. package/src/patterns/grammar-transformed/ko.ts +1311 -0
  256. package/src/patterns/grammar-transformed/tr.ts +1067 -0
  257. package/src/patterns/hide/ar.ts +67 -0
  258. package/src/patterns/hide/bn.ts +47 -0
  259. package/src/patterns/hide/de.ts +36 -0
  260. package/src/patterns/hide/hi.ts +61 -0
  261. package/src/patterns/hide/index.ts +91 -0
  262. package/src/patterns/hide/it.ts +56 -0
  263. package/src/patterns/hide/ja.ts +69 -0
  264. package/src/patterns/hide/ko.ts +69 -0
  265. package/src/patterns/hide/ms.ts +30 -0
  266. package/src/patterns/hide/pl.ts +57 -0
  267. package/src/patterns/hide/ru.ts +57 -0
  268. package/src/patterns/hide/th.ts +29 -0
  269. package/src/patterns/hide/tl.ts +30 -0
  270. package/src/patterns/hide/tr.ts +65 -0
  271. package/src/patterns/hide/uk.ts +57 -0
  272. package/src/patterns/hide/vi.ts +56 -0
  273. package/src/patterns/hide/zh.ts +68 -0
  274. package/src/patterns/increment/bn.ts +70 -0
  275. package/src/patterns/increment/de.ts +36 -0
  276. package/src/patterns/increment/hi.ts +68 -0
  277. package/src/patterns/increment/index.ts +79 -0
  278. package/src/patterns/increment/it.ts +69 -0
  279. package/src/patterns/increment/ms.ts +30 -0
  280. package/src/patterns/increment/pl.ts +58 -0
  281. package/src/patterns/increment/ru.ts +58 -0
  282. package/src/patterns/increment/th.ts +49 -0
  283. package/src/patterns/increment/tl.ts +30 -0
  284. package/src/patterns/increment/tr.ts +52 -0
  285. package/src/patterns/increment/uk.ts +58 -0
  286. package/src/patterns/increment/vi.ts +61 -0
  287. package/src/patterns/increment/zh.ts +32 -0
  288. package/src/patterns/index.ts +84 -0
  289. package/src/patterns/languages/en/control-flow.ts +93 -0
  290. package/src/patterns/languages/en/fetch.ts +62 -0
  291. package/src/patterns/languages/en/index.ts +42 -0
  292. package/src/patterns/languages/en/repeat.ts +67 -0
  293. package/src/patterns/languages/en/set.ts +48 -0
  294. package/src/patterns/languages/en/swap.ts +38 -0
  295. package/src/patterns/languages/en/temporal.ts +57 -0
  296. package/src/patterns/put/ar.ts +74 -0
  297. package/src/patterns/put/bn.ts +53 -0
  298. package/src/patterns/put/en.ts +74 -0
  299. package/src/patterns/put/es.ts +74 -0
  300. package/src/patterns/put/hi.ts +69 -0
  301. package/src/patterns/put/id.ts +96 -0
  302. package/src/patterns/put/index.ts +99 -0
  303. package/src/patterns/put/it.ts +56 -0
  304. package/src/patterns/put/ja.ts +75 -0
  305. package/src/patterns/put/ko.ts +67 -0
  306. package/src/patterns/put/ms.ts +30 -0
  307. package/src/patterns/put/pl.ts +81 -0
  308. package/src/patterns/put/ru.ts +85 -0
  309. package/src/patterns/put/th.ts +32 -0
  310. package/src/patterns/put/tl.ts +30 -0
  311. package/src/patterns/put/tr.ts +67 -0
  312. package/src/patterns/put/uk.ts +85 -0
  313. package/src/patterns/put/vi.ts +72 -0
  314. package/src/patterns/put/zh.ts +62 -0
  315. package/src/patterns/registry.ts +163 -0
  316. package/src/patterns/remove/ar.ts +71 -0
  317. package/src/patterns/remove/bn.ts +68 -0
  318. package/src/patterns/remove/hi.ts +69 -0
  319. package/src/patterns/remove/index.ts +87 -0
  320. package/src/patterns/remove/it.ts +69 -0
  321. package/src/patterns/remove/ja.ts +74 -0
  322. package/src/patterns/remove/ko.ts +78 -0
  323. package/src/patterns/remove/ms.ts +30 -0
  324. package/src/patterns/remove/pl.ts +62 -0
  325. package/src/patterns/remove/ru.ts +62 -0
  326. package/src/patterns/remove/th.ts +49 -0
  327. package/src/patterns/remove/tl.ts +30 -0
  328. package/src/patterns/remove/tr.ts +78 -0
  329. package/src/patterns/remove/uk.ts +62 -0
  330. package/src/patterns/remove/vi.ts +61 -0
  331. package/src/patterns/remove/zh.ts +72 -0
  332. package/src/patterns/set/ar.ts +84 -0
  333. package/src/patterns/set/bn.ts +53 -0
  334. package/src/patterns/set/de.ts +84 -0
  335. package/src/patterns/set/es.ts +92 -0
  336. package/src/patterns/set/fr.ts +88 -0
  337. package/src/patterns/set/hi.ts +56 -0
  338. package/src/patterns/set/id.ts +84 -0
  339. package/src/patterns/set/index.ts +107 -0
  340. package/src/patterns/set/it.ts +56 -0
  341. package/src/patterns/set/ja.ts +86 -0
  342. package/src/patterns/set/ko.ts +85 -0
  343. package/src/patterns/set/ms.ts +30 -0
  344. package/src/patterns/set/pl.ts +57 -0
  345. package/src/patterns/set/pt.ts +84 -0
  346. package/src/patterns/set/ru.ts +57 -0
  347. package/src/patterns/set/th.ts +31 -0
  348. package/src/patterns/set/tl.ts +30 -0
  349. package/src/patterns/set/tr.ts +107 -0
  350. package/src/patterns/set/uk.ts +57 -0
  351. package/src/patterns/set/vi.ts +53 -0
  352. package/src/patterns/set/zh.ts +84 -0
  353. package/src/patterns/show/ar.ts +67 -0
  354. package/src/patterns/show/bn.ts +47 -0
  355. package/src/patterns/show/de.ts +32 -0
  356. package/src/patterns/show/fr.ts +32 -0
  357. package/src/patterns/show/hi.ts +61 -0
  358. package/src/patterns/show/index.ts +95 -0
  359. package/src/patterns/show/it.ts +56 -0
  360. package/src/patterns/show/ja.ts +69 -0
  361. package/src/patterns/show/ko.ts +73 -0
  362. package/src/patterns/show/ms.ts +30 -0
  363. package/src/patterns/show/pl.ts +57 -0
  364. package/src/patterns/show/ru.ts +57 -0
  365. package/src/patterns/show/th.ts +29 -0
  366. package/src/patterns/show/tl.ts +30 -0
  367. package/src/patterns/show/tr.ts +65 -0
  368. package/src/patterns/show/uk.ts +57 -0
  369. package/src/patterns/show/vi.ts +56 -0
  370. package/src/patterns/show/zh.ts +68 -0
  371. package/src/patterns/take/ar.ts +51 -0
  372. package/src/patterns/take/index.ts +31 -0
  373. package/src/patterns/toggle/ar.ts +61 -0
  374. package/src/patterns/toggle/bn.ts +70 -0
  375. package/src/patterns/toggle/en.ts +61 -0
  376. package/src/patterns/toggle/es.ts +61 -0
  377. package/src/patterns/toggle/hi.ts +80 -0
  378. package/src/patterns/toggle/index.ts +95 -0
  379. package/src/patterns/toggle/it.ts +69 -0
  380. package/src/patterns/toggle/ja.ts +156 -0
  381. package/src/patterns/toggle/ko.ts +113 -0
  382. package/src/patterns/toggle/ms.ts +30 -0
  383. package/src/patterns/toggle/pl.ts +62 -0
  384. package/src/patterns/toggle/ru.ts +62 -0
  385. package/src/patterns/toggle/th.ts +50 -0
  386. package/src/patterns/toggle/tl.ts +30 -0
  387. package/src/patterns/toggle/tr.ts +88 -0
  388. package/src/patterns/toggle/uk.ts +62 -0
  389. package/src/patterns/toggle/vi.ts +61 -0
  390. package/src/patterns/toggle/zh.ts +99 -0
  391. package/src/public-api.ts +286 -0
  392. package/src/registry.ts +441 -0
  393. package/src/tokenizers/arabic.ts +723 -0
  394. package/src/tokenizers/base.ts +1300 -0
  395. package/src/tokenizers/bengali.ts +289 -0
  396. package/src/tokenizers/chinese.ts +481 -0
  397. package/src/tokenizers/english.ts +416 -0
  398. package/src/tokenizers/french.ts +326 -0
  399. package/src/tokenizers/german.ts +324 -0
  400. package/src/tokenizers/hindi.ts +319 -0
  401. package/src/tokenizers/index.ts +127 -0
  402. package/src/tokenizers/indonesian.ts +306 -0
  403. package/src/tokenizers/italian.ts +458 -0
  404. package/src/tokenizers/japanese.ts +447 -0
  405. package/src/tokenizers/korean.ts +642 -0
  406. package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
  407. package/src/tokenizers/morphology/french-normalizer.ts +268 -0
  408. package/src/tokenizers/morphology/german-normalizer.ts +256 -0
  409. package/src/tokenizers/morphology/index.ts +46 -0
  410. package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
  411. package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
  412. package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
  413. package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
  414. package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
  415. package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
  416. package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
  417. package/src/tokenizers/morphology/types.ts +211 -0
  418. package/src/tokenizers/ms.ts +198 -0
  419. package/src/tokenizers/polish.ts +354 -0
  420. package/src/tokenizers/portuguese.ts +304 -0
  421. package/src/tokenizers/quechua.ts +339 -0
  422. package/src/tokenizers/russian.ts +375 -0
  423. package/src/tokenizers/spanish.ts +403 -0
  424. package/src/tokenizers/swahili.ts +303 -0
  425. package/src/tokenizers/thai.ts +236 -0
  426. package/src/tokenizers/tl.ts +198 -0
  427. package/src/tokenizers/turkish.ts +411 -0
  428. package/src/tokenizers/ukrainian.ts +369 -0
  429. package/src/tokenizers/vietnamese.ts +410 -0
  430. package/src/types/grammar-types.ts +617 -0
  431. package/src/types/unified-profile.ts +267 -0
  432. package/src/types.ts +709 -0
  433. package/src/utils/confidence-calculator.ts +147 -0
  434. package/src/validators/command-validator.ts +380 -0
  435. package/src/validators/index.ts +15 -0
@@ -0,0 +1,310 @@
1
+ /**
2
+ * Portuguese Morphological Normalizer
3
+ *
4
+ * Reduces Portuguese verb conjugations to their infinitive forms.
5
+ * Portuguese has three verb conjugation classes (-ar, -er, -ir) and
6
+ * supports reflexive verbs (verbs with -se suffix).
7
+ *
8
+ * Key features:
9
+ * - Reflexive verb handling: mostrar-se → mostrar, esconder-se → esconder
10
+ * - Regular conjugation patterns for -ar, -er, -ir verbs
11
+ * - Suffix-based heuristics only; irregular verbs are not special-cased
12
+ * - Brazilian Portuguese variants
13
+ *
14
+ * Examples:
15
+ * mostrar-se → mostrar (reflexive infinitive)
16
+ * alternando → alternar (gerund)
17
+ * escondido → esconder (past participle)
18
+ * mostra → mostrar (3rd person present)
19
+ * clicou → clicar (3rd person preterite)
20
+ */
21
+
22
+ import type { MorphologicalNormalizer, NormalizationResult, ConjugationType } from './types';
23
+ import { noChange, normalized } from './types';
24
+
25
/**
 * Reports whether the given text contains one of the accented vowels or the
 * cedilla (ç/Ç) enumerated in the character class below.
 *
 * @param char - Intended to be a single character. Longer strings are accepted
 *   and match as soon as any one of their characters is in the class.
 * @returns `true` when at least one listed letter occurs, otherwise `false`.
 */
function isPortugueseSpecificLetter(char: string): boolean {
  const portugueseOnlyLetters = /[áàâãéêíóôõúüçÁÀÂÃÉÊÍÓÔÕÚÜÇ]/;
  // `search` yields -1 when nothing in the class is found.
  return char.search(portugueseOnlyLetters) !== -1;
}
31
+
32
/**
 * Cheap surface check for whether a word could be a Portuguese verb form.
 *
 * The word is accepted when, case-insensitively, it
 *  - ends in an infinitive ending (-ar, -er, -ir),
 *  - ends in a gerund ending (-ando, -endo, -indo),
 *  - ends in a masculine singular participle ending (-ado, -ido),
 *  - is an infinitive followed by a hyphenated clitic pronoun
 *    (-se, -me, -te, -nos, -vos), e.g. "mostrar-se" or "mostrar-me", or
 *  - contains any Portuguese-specific letter (accented vowel or ç).
 *
 * Other finite forms without such a letter (e.g. "mostra") are not recognised
 * by this check.
 *
 * @param word - The token to inspect; casing is ignored for the ending checks.
 * @returns `true` when any of the conditions above holds.
 */
function looksLikePortugueseVerb(word: string): boolean {
  const lower = word.toLowerCase();
  // Check for infinitive endings
  if (lower.endsWith('ar') || lower.endsWith('er') || lower.endsWith('ir')) return true;
  // Check for common conjugation endings
  if (lower.endsWith('ando') || lower.endsWith('endo') || lower.endsWith('indo')) return true;
  if (lower.endsWith('ado') || lower.endsWith('ido')) return true;
  // Check for an infinitive carrying any of the hyphenated clitic pronouns.
  // Previously only "-se" was recognised here, so forms such as "mostrar-me"
  // were rejected even though the normalizer's reflexive handling accepts
  // -se, -me, -te, -nos and -vos.
  if (/(?:ar|er|ir)-(?:se|me|te|nos|vos)$/.test(lower)) return true;
  // Check for Portuguese-specific characters
  for (const char of word) {
    if (isPortugueseSpecificLetter(char)) return true;
  }
  return false;
}
51
+
52
/**
 * Hyphenated clitic pronouns that may trail a verb form, as in
 * "mostrar-se" or "esconder-me". Each entry includes its leading hyphen.
 */
const REFLEXIVE_SUFFIXES = [
  '-se',
  '-me',
  '-te',
  '-nos',
  '-vos',
];
57
+
58
/**
 * Builds one -AR rule: a word ending in `ending` is rewritten by replacing
 * that ending with "ar".
 */
const arRule = (ending: string, confidence: number, type: ConjugationType) => ({
  ending,
  stem: 'ar',
  confidence,
  type,
});

/**
 * Suffix rules for first-conjugation (-AR) verbs.
 *
 * Each rule names a conjugated ending, the infinitive ending that replaces it,
 * a heuristic confidence score, and the conjugation category reported for it.
 * Declaration order matters: rules of equal ending length are tried in the
 * order they appear here.
 */
const AR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = [
  // Gerund
  arRule('ando', 0.88, 'gerund'),
  // Past participle (masculine/feminine, singular/plural)
  arRule('ado', 0.88, 'participle'),
  arRule('ada', 0.88, 'participle'),
  arRule('ados', 0.88, 'participle'),
  arRule('adas', 0.88, 'participle'),
  // Present indicative: eu, tu, ele/ela/você, nós, vós, eles/elas/vocês
  arRule('o', 0.75, 'present'),
  arRule('as', 0.82, 'present'),
  arRule('a', 0.75, 'present'),
  arRule('amos', 0.85, 'present'),
  arRule('ais', 0.85, 'present'),
  arRule('am', 0.8, 'present'),
  // Preterite: eu, tu, ele/ela/você, nós (accented), nós (Brazilian), vós, eles
  arRule('ei', 0.88, 'past'),
  arRule('aste', 0.88, 'past'),
  arRule('ou', 0.88, 'past'),
  arRule('ámos', 0.88, 'past'),
  arRule('amos', 0.85, 'past'),
  arRule('astes', 0.88, 'past'),
  arRule('aram', 0.88, 'past'),
  // Imperfect: eu/ele, tu, nós (with/without accent), vós (with/without accent), eles
  arRule('ava', 0.88, 'past'),
  arRule('avas', 0.88, 'past'),
  arRule('ávamos', 0.88, 'past'),
  arRule('avamos', 0.85, 'past'),
  arRule('áveis', 0.88, 'past'),
  arRule('aveis', 0.85, 'past'),
  arRule('avam', 0.88, 'past'),
  // Subjunctive: eu/ele (ambiguous), tu, nós, vós, eles
  arRule('e', 0.72, 'subjunctive'),
  arRule('es', 0.78, 'subjunctive'),
  arRule('emos', 0.82, 'subjunctive'),
  arRule('eis', 0.82, 'subjunctive'),
  arRule('em', 0.78, 'subjunctive'),
  // Imperative: tu/você, vós
  arRule('a', 0.75, 'imperative'),
  arRule('ai', 0.85, 'imperative'),
  // Infinitive (dictionary form)
  arRule('ar', 0.92, 'dictionary'),
];
109
+
110
/**
 * Builds one -ER rule: a word ending in `ending` is rewritten by replacing
 * that ending with "er".
 */
const erRule = (ending: string, confidence: number, type: ConjugationType) => ({
  ending,
  stem: 'er',
  confidence,
  type,
});

/**
 * Suffix rules for second-conjugation (-ER) verbs.
 *
 * Same shape as the other ending tables: conjugated ending, replacement
 * infinitive ending, heuristic confidence score, and conjugation category.
 * Declaration order matters for rules whose endings have equal length.
 */
const ER_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = [
  // Gerund
  erRule('endo', 0.88, 'gerund'),
  // Past participle (masculine/feminine, singular/plural)
  erRule('ido', 0.85, 'participle'),
  erRule('ida', 0.85, 'participle'),
  erRule('idos', 0.85, 'participle'),
  erRule('idas', 0.85, 'participle'),
  // Present indicative: eu, tu, ele, nós, vós, eles
  erRule('o', 0.72, 'present'),
  erRule('es', 0.78, 'present'),
  erRule('e', 0.72, 'present'),
  erRule('emos', 0.85, 'present'),
  erRule('eis', 0.82, 'present'),
  erRule('em', 0.78, 'present'),
  // Preterite: eu, tu, ele, nós, vós, eles
  erRule('i', 0.85, 'past'),
  erRule('este', 0.88, 'past'),
  erRule('eu', 0.88, 'past'),
  erRule('emos', 0.85, 'past'),
  erRule('estes', 0.88, 'past'),
  erRule('eram', 0.88, 'past'),
  // Imperfect: eu/ele, tu, nós (with/without accent), vós (with/without accent), eles
  erRule('ia', 0.85, 'past'),
  erRule('ias', 0.85, 'past'),
  erRule('íamos', 0.88, 'past'),
  erRule('iamos', 0.85, 'past'),
  erRule('íeis', 0.88, 'past'),
  erRule('ieis', 0.85, 'past'),
  erRule('iam', 0.85, 'past'),
  // Infinitive (dictionary form)
  erRule('er', 0.92, 'dictionary'),
];
151
+
152
/**
 * Builds one -IR rule: a word ending in `ending` is rewritten by replacing
 * that ending with "ir".
 */
const irRule = (ending: string, confidence: number, type: ConjugationType) => ({
  ending,
  stem: 'ir',
  confidence,
  type,
});

/**
 * Suffix rules for third-conjugation (-IR) verbs.
 *
 * Same shape as the other ending tables: conjugated ending, replacement
 * infinitive ending, heuristic confidence score, and conjugation category.
 * Several endings here (participles, imperfect, some present forms) are
 * spelled identically to entries in the -ER table.
 */
const IR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = [
  // Gerund
  irRule('indo', 0.88, 'gerund'),
  // Past participle (masculine/feminine, singular/plural)
  irRule('ido', 0.85, 'participle'),
  irRule('ida', 0.85, 'participle'),
  irRule('idos', 0.85, 'participle'),
  irRule('idas', 0.85, 'participle'),
  // Present indicative: eu, tu, ele, nós, vós, eles
  irRule('o', 0.72, 'present'),
  irRule('es', 0.78, 'present'),
  irRule('e', 0.72, 'present'),
  irRule('imos', 0.85, 'present'),
  irRule('is', 0.82, 'present'),
  irRule('em', 0.78, 'present'),
  // Preterite: eu, tu, ele, nós, vós, eles
  irRule('i', 0.85, 'past'),
  irRule('iste', 0.88, 'past'),
  irRule('iu', 0.88, 'past'),
  irRule('imos', 0.85, 'past'),
  irRule('istes', 0.88, 'past'),
  irRule('iram', 0.88, 'past'),
  // Imperfect (spelled like the -ER imperfect endings)
  irRule('ia', 0.85, 'past'),
  irRule('ias', 0.85, 'past'),
  irRule('íamos', 0.88, 'past'),
  irRule('iamos', 0.85, 'past'),
  irRule('íeis', 0.88, 'past'),
  irRule('ieis', 0.85, 'past'),
  irRule('iam', 0.85, 'past'),
  // Infinitive (dictionary form)
  irRule('ir', 0.92, 'dictionary'),
];
193
+
194
/**
 * Every -AR, -ER and -IR rule in a single list, ordered so that longer
 * endings are tried before shorter ones.
 *
 * `Array.prototype.sort` is stable, so rules whose endings have the same
 * length keep their concatenation order: -AR rules first, then -ER, then -IR.
 * The source tables are not mutated; sorting happens on the concatenated copy.
 */
const ALL_ENDINGS = AR_ENDINGS.concat(ER_ENDINGS, IR_ENDINGS).sort(
  function longestEndingFirst(left, right) {
    return right.ending.length - left.ending.length;
  }
);
200
+
201
/**
 * Suffix-driven normalizer that maps Portuguese verb forms to an infinitive.
 *
 * Normalization lower-cases the input, strips a trailing hyphenated clitic
 * pronoun when present, and then rewrites the first matching conjugation
 * ending from `ALL_ENDINGS` to its infinitive ending.
 */
export class PortugueseMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'pt';

  /**
   * Whether `text` ends in one of the three infinitive endings.
   */
  private static hasInfinitiveEnding(text: string): boolean {
    return ['ar', 'er', 'ir'].some(ending => text.endsWith(ending));
  }

  /**
   * Quick pre-check: words shorter than three characters are rejected,
   * everything else is delegated to `looksLikePortugueseVerb`.
   */
  isNormalizable(word: string): boolean {
    return word.length >= 3 && looksLikePortugueseVerb(word);
  }

  /**
   * Reduce a Portuguese word to its infinitive.
   *
   * @param word - The token to normalize; matching is done on its lower-cased form.
   * @returns An unchanged result for words already ending in -ar/-er/-ir or
   *   for which no rule applies; otherwise the reflexive or conjugation result.
   */
  normalize(word: string): NormalizationResult {
    const lowered = word.toLowerCase();

    // A word already ending like an infinitive (and not in a clitic) is left alone.
    const isBareInfinitive =
      PortugueseMorphologicalNormalizer.hasInfinitiveEnding(lowered) &&
      REFLEXIVE_SUFFIXES.every(pronoun => !lowered.endsWith(pronoun));
    if (isBareInfinitive) {
      return noChange(word);
    }

    // Reflexive handling takes priority over plain conjugation rules;
    // falling through both means there is nothing to normalize.
    return (
      this.tryReflexiveNormalization(lowered) ??
      this.tryConjugationNormalization(lowered) ??
      noChange(word)
    );
  }

  /**
   * Handle forms that end in a hyphenated clitic pronoun.
   *
   * Examples:
   *   mostrar-se → mostrar
   *   esconder-se → esconder
   *   exibir-se → exibir
   *
   * @param word - Lower-cased token.
   * @returns The normalized result, or `null` when the word carries no
   *   recognised clitic or the remainder cannot be normalized.
   */
  private tryReflexiveNormalization(word: string): NormalizationResult | null {
    for (const pronoun of REFLEXIVE_SUFFIXES) {
      if (!word.endsWith(pronoun)) continue;

      const base = word.slice(0, word.length - pronoun.length);

      // Clitic attached directly to an infinitive (e.g., mostrar-se → mostrar).
      if (PortugueseMorphologicalNormalizer.hasInfinitiveEnding(base)) {
        return normalized(base, 0.88, {
          removedSuffixes: [pronoun],
          conjugationType: 'reflexive',
        });
      }

      // Otherwise the remainder may be a conjugated form; accept it only
      // when the conjugation rules actually changed it.
      const inner = this.tryConjugationNormalization(base);
      if (!inner || inner.stem === base) continue;

      const innerRemoved = inner.metadata?.removedSuffixes || [];
      // Confidence is scaled down slightly for the combined clitic + conjugation case.
      return normalized(inner.stem, inner.confidence * 0.95, {
        removedSuffixes: [pronoun, ...innerRemoved],
        conjugationType: 'reflexive',
      });
    }

    return null;
  }

  /**
   * Rewrite a conjugated form to its infinitive using the first applicable
   * rule in `ALL_ENDINGS`.
   *
   * A rule applies when the word ends with the rule's ending and at least two
   * characters remain once that ending is removed.
   *
   * @param word - Lower-cased token.
   * @returns The normalized result, or `null` when no rule applies.
   */
  private tryConjugationNormalization(word: string): NormalizationResult | null {
    const rule = ALL_ENDINGS.find(
      candidate =>
        word.endsWith(candidate.ending) && word.length - candidate.ending.length >= 2
    );
    if (rule === undefined) {
      return null;
    }

    const stemBase = word.slice(0, word.length - rule.ending.length);
    return normalized(stemBase + rule.stem, rule.confidence, {
      removedSuffixes: [rule.ending],
      conjugationType: rule.type,
    });
  }
}
308
+
309
/**
 * Shared, ready-made instance of the Portuguese normalizer, so callers can
 * import it directly instead of constructing their own.
 */
export const portugueseMorphologicalNormalizer: PortugueseMorphologicalNormalizer =
  new PortugueseMorphologicalNormalizer();
@@ -0,0 +1,327 @@
1
+ /**
2
+ * Spanish Morphological Normalizer
3
+ *
4
+ * Reduces Spanish verb conjugations to their infinitive forms.
5
+ * Spanish has three verb conjugation classes (-ar, -er, -ir) and
6
+ * supports reflexive verbs (verbs with -se suffix).
7
+ *
8
+ * Key features:
9
+ * - Reflexive verb handling: mostrarse → mostrar, ocultarse → ocultar
10
+ * - Regular conjugation patterns for -ar, -er, -ir verbs
11
+ * - Suffix-based only: irregular and stem-changing forms get no special handling
12
+ *
13
+ * Examples:
14
+ * mostrarse → mostrar (reflexive infinitive)
15
+ * alternando → alternar (gerund)
16
+ * escondido → esconder (past participle)
17
+ * oculta → ocultar (3rd person present)
18
+ */
19
+
20
+ import type { MorphologicalNormalizer, NormalizationResult, ConjugationType } from './types';
21
+ import { noChange, normalized } from './types';
22
+
23
/**
 * Report whether `char` contains a letter Spanish uses beyond plain a–z:
 * an acute-accented vowel, ü, or ñ, in either case.
 */
function isSpanishSpecificLetter(char: string): boolean {
  const spanishOnlyLetters = /[áéíóúüñÁÉÍÓÚÜÑ]/;
  return char.search(spanishOnlyLetters) !== -1;
}
29
+
30
/**
 * Heuristic: does `word` look like a Spanish verb form?
 *
 * True when the lowercased word ends in an infinitive, gerund, past-participle
 * or reflexive-infinitive ending, or when any character of the word is a
 * Spanish-specific letter.
 */
function looksLikeSpanishVerb(word: string): boolean {
  const lower = word.toLowerCase();

  const verbLikeEndings = [
    // infinitives
    'ar',
    'er',
    'ir',
    // gerunds
    'ando',
    'iendo',
    // past participles
    'ado',
    'ido',
    // reflexive infinitives
    'arse',
    'erse',
    'irse',
  ];
  if (verbLikeEndings.some(ending => lower.endsWith(ending))) return true;

  // Fall back to scanning the original word for accented letters / ñ.
  return Array.from(word).some(letter => isSpanishSpecificLetter(letter));
}
49
+
50
/**
 * Reflexive pronoun patterns that can be attached to the end of a verb.
 *
 * Declared readonly, like the conjugation tables in this file, because it is
 * a lookup table that is only ever iterated. 'nos' is listed before 'os' so
 * that code scanning in order tries the longer pronoun first.
 */
const REFLEXIVE_SUFFIXES: readonly string[] = ['se', 'me', 'te', 'nos', 'os'];
54
+
55
+ /**
56
+ * Reflexive pronouns that appear before conjugated verbs.
57
+ * Note: These are handled at the tokenizer level, not here.
58
+ */
59
+ // const REFLEXIVE_PREFIXES = ['me', 'te', 'se', 'nos', 'os'];
60
+
61
/**
 * Conjugation endings of -AR verbs.
 *
 * Each row below is [ending, confidence, conjugation type]; every row is
 * expanded into a rule whose `stem` ('ar') is what gets appended after the
 * ending is removed to rebuild the infinitive.
 */
const AR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (
  [
    // Gerund (-ando)
    ['ando', 0.88, 'gerund'],
    // Past participle (-ado), all gender/number variants
    ['ado', 0.88, 'participle'],
    ['ada', 0.88, 'participle'],
    ['ados', 0.88, 'participle'],
    ['adas', 0.88, 'participle'],
    // Present indicative
    ['o', 0.75, 'present'], // yo
    ['as', 0.82, 'present'], // tú
    ['a', 0.75, 'present'], // él/ella
    ['amos', 0.85, 'present'], // nosotros
    ['áis', 0.85, 'present'], // vosotros
    ['ais', 0.82, 'present'], // vosotros (no accent)
    ['an', 0.8, 'present'], // ellos
    // Preterite
    ['é', 0.85, 'past'], // yo
    ['aste', 0.88, 'past'], // tú
    ['ó', 0.82, 'past'], // él/ella
    ['amos', 0.85, 'past'], // nosotros (same as present)
    ['asteis', 0.88, 'past'], // vosotros
    ['aron', 0.88, 'past'], // ellos
    // Imperfect
    ['aba', 0.88, 'past'], // yo/él
    ['abas', 0.88, 'past'], // tú
    ['ábamos', 0.88, 'past'], // nosotros
    ['abamos', 0.85, 'past'], // nosotros (no accent)
    ['abais', 0.88, 'past'], // vosotros
    ['aban', 0.88, 'past'], // ellos
    // Subjunctive
    ['e', 0.72, 'subjunctive'], // yo/él (ambiguous)
    ['es', 0.78, 'subjunctive'], // tú
    ['emos', 0.82, 'subjunctive'], // nosotros
    ['éis', 0.85, 'subjunctive'], // vosotros
    ['eis', 0.82, 'subjunctive'], // vosotros (no accent)
    ['en', 0.78, 'subjunctive'], // ellos
    // Imperative
    ['a', 0.75, 'imperative'], // tú (same as 3rd present)
    ['ad', 0.85, 'imperative'], // vosotros
    // Infinitive
    ['ar', 0.92, 'dictionary'],
  ] as const
).map(([ending, confidence, type]) => ({ ending, stem: 'ar', confidence, type }));
112
+
113
/**
 * Conjugation endings of -ER verbs.
 *
 * Each row below is [ending, confidence, conjugation type]; every row is
 * expanded into a rule whose `stem` ('er') is what gets appended after the
 * ending is removed to rebuild the infinitive.
 */
const ER_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (
  [
    // Gerund (-iendo)
    ['iendo', 0.88, 'gerund'],
    // Past participle (-ido), all gender/number variants
    ['ido', 0.85, 'participle'],
    ['ida', 0.85, 'participle'],
    ['idos', 0.85, 'participle'],
    ['idas', 0.85, 'participle'],
    // Present indicative
    ['o', 0.72, 'present'], // yo
    ['es', 0.78, 'present'], // tú
    ['e', 0.72, 'present'], // él/ella
    ['emos', 0.85, 'present'], // nosotros
    ['éis', 0.85, 'present'], // vosotros
    ['eis', 0.82, 'present'], // vosotros (no accent)
    ['en', 0.78, 'present'], // ellos
    // Preterite
    ['í', 0.85, 'past'], // yo
    ['iste', 0.88, 'past'], // tú
    ['ió', 0.85, 'past'], // él/ella
    ['io', 0.82, 'past'], // él/ella (no accent)
    ['imos', 0.85, 'past'], // nosotros
    ['isteis', 0.88, 'past'], // vosotros
    ['ieron', 0.88, 'past'], // ellos
    // Imperfect
    ['ía', 0.88, 'past'], // yo/él
    ['ia', 0.85, 'past'], // yo/él (no accent)
    ['ías', 0.88, 'past'], // tú
    ['ias', 0.85, 'past'], // tú (no accent)
    ['íamos', 0.88, 'past'], // nosotros
    ['iamos', 0.85, 'past'], // nosotros (no accent)
    ['íais', 0.88, 'past'], // vosotros
    ['iais', 0.85, 'past'], // vosotros (no accent)
    ['ían', 0.88, 'past'], // ellos
    ['ian', 0.85, 'past'], // ellos (no accent)
    // Infinitive
    ['er', 0.92, 'dictionary'],
  ] as const
).map(([ending, confidence, type]) => ({ ending, stem: 'er', confidence, type }));
159
+
160
/**
 * Conjugation endings of -IR verbs.
 *
 * Each row below is [ending, confidence, conjugation type]; every row is
 * expanded into a rule whose `stem` ('ir') is what gets appended after the
 * ending is removed to rebuild the infinitive.
 */
const IR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (
  [
    // Gerund (-iendo)
    ['iendo', 0.88, 'gerund'],
    // Past participle (-ido), all gender/number variants
    ['ido', 0.85, 'participle'],
    ['ida', 0.85, 'participle'],
    ['idos', 0.85, 'participle'],
    ['idas', 0.85, 'participle'],
    // Present indicative
    ['o', 0.72, 'present'], // yo
    ['es', 0.78, 'present'], // tú
    ['e', 0.72, 'present'], // él/ella
    ['imos', 0.85, 'present'], // nosotros
    ['ís', 0.85, 'present'], // vosotros
    ['is', 0.82, 'present'], // vosotros (no accent)
    ['en', 0.78, 'present'], // ellos
    // Preterite (same as -er)
    ['í', 0.85, 'past'], // yo
    ['iste', 0.88, 'past'], // tú
    ['ió', 0.85, 'past'], // él/ella
    ['io', 0.82, 'past'], // él/ella (no accent)
    ['imos', 0.85, 'past'], // nosotros
    ['isteis', 0.88, 'past'], // vosotros
    ['ieron', 0.88, 'past'], // ellos
    // Imperfect (same as -er)
    ['ía', 0.88, 'past'],
    ['ia', 0.85, 'past'],
    ['ías', 0.88, 'past'],
    ['ias', 0.85, 'past'],
    ['íamos', 0.88, 'past'],
    ['iamos', 0.85, 'past'],
    ['íais', 0.88, 'past'],
    ['iais', 0.85, 'past'],
    ['ían', 0.88, 'past'],
    ['ian', 0.85, 'past'],
    // Infinitive
    ['ir', 0.92, 'dictionary'],
  ] as const
).map(([ending, confidence, type]) => ({ ending, stem: 'ir', confidence, type }));
206
+
207
/**
 * Every -ar, -er and -ir rule in a single list, ordered from the longest
 * ending to the shortest so that lookups scanning in order meet the most
 * specific ending first. Rules whose endings have the same length keep their
 * -ar, -er, -ir concatenation order (Array.prototype.sort is stable).
 */
const ALL_ENDINGS = AR_ENDINGS.concat(ER_ENDINGS, IR_ENDINGS);
ALL_ENDINGS.sort((left, right) => right.ending.length - left.ending.length);
213
+
214
/**
 * Spanish morphological normalizer.
 *
 * Purely suffix-driven: a word is matched against REFLEXIVE_SUFFIXES and the
 * rules in ALL_ENDINGS, and the first applicable rule decides the result.
 * There is no dictionary lookup, so stem-changing and irregular forms are
 * not recovered (e.g. "muestra" becomes "muestrar").
 */
export class SpanishMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'es';

  /**
   * Cheap pre-filter: words shorter than three characters are never
   * normalizable; anything else is delegated to looksLikeSpanishVerb.
   */
  isNormalizable(word: string): boolean {
    if (word.length < 3) return false;
    return looksLikeSpanishVerb(word);
  }

  /**
   * Normalize a Spanish word to its infinitive form.
   *
   * @param word - The word as written; matching is done on its lowercase form.
   * @returns `noChange(word)` (original casing) when the word already ends in
   *   -ar/-er/-ir or when no rule applies; otherwise the reflexive or
   *   conjugation result, whose stem is lowercase.
   */
  normalize(word: string): NormalizationResult {
    const lower = word.toLowerCase();

    // A word ending in -ar/-er/-ir is already in dictionary form. Such a word
    // cannot also end in an attached pronoun (se/me/te/nos/os), so there is
    // nothing reflexive to rule out here: "mostrarse" ends in "se" and is
    // handled by the reflexive pass below.
    if (this.endsWithInfinitive(lower)) return noChange(word);

    // Attached pronouns are peeled off before plain conjugation endings.
    const reflexiveResult = this.tryReflexiveNormalization(lower);
    if (reflexiveResult) return reflexiveResult;

    const conjugationResult = this.tryConjugationNormalization(lower);
    if (conjugationResult) return conjugationResult;

    // Nothing matched.
    return noChange(word);
  }

  /** True when `word` ends in one of the three infinitive endings. */
  private endsWithInfinitive(word: string): boolean {
    return word.endsWith('ar') || word.endsWith('er') || word.endsWith('ir');
  }

  /**
   * First rule in ALL_ENDINGS whose ending terminates `word` while leaving a
   * stem of at least two characters, or undefined when there is none.
   */
  private findEndingRule(word: string): (typeof ALL_ENDINGS)[number] | undefined {
    return ALL_ENDINGS.find(
      rule => word.endsWith(rule.ending) && word.length - rule.ending.length >= 2
    );
  }

  /**
   * Try to normalize a verb with an attached pronoun (-se, -me, -te, -nos, -os).
   *
   * Examples:
   *   mostrarse    → mostrar   (pronoun on an infinitive)
   *   levantandose → levantar  (pronoun on a conjugated form)
   *
   * A pronoun is only stripped from a conjugated form when no conjugation
   * ending longer than the pronoun matches the whole word. Otherwise endings
   * such as -aste, -iste, -ados or -idos, which merely end in "te"/"os",
   * would be cut in half ("comiste" → "comis" + "te") and yield a wrong stem
   * and a wrong 'reflexive' tag.
   *
   * @returns The normalization, or null when the word is not treated as reflexive.
   */
  private tryReflexiveNormalization(word: string): NormalizationResult | null {
    // Longest conjugation ending matching the word as a whole (loop-invariant).
    const wholeWordRule = this.findEndingRule(word);

    for (const suffix of REFLEXIVE_SUFFIXES) {
      if (!word.endsWith(suffix)) continue;

      const withoutReflexive = word.slice(0, -suffix.length);

      // Pronoun attached to an infinitive (e.g., mostrarse → mostrar).
      if (this.endsWithInfinitive(withoutReflexive)) {
        return normalized(withoutReflexive, 0.88, {
          removedSuffixes: [suffix],
          conjugationType: 'reflexive',
        });
      }

      // The "pronoun" is really the tail of a longer verbal ending: leave the
      // word to the plain conjugation pass.
      if (wholeWordRule && wholeWordRule.ending.length > suffix.length) continue;

      // Pronoun attached to a conjugated form: normalize what is left.
      const innerResult = this.tryConjugationNormalization(withoutReflexive);
      if (innerResult && innerResult.stem !== withoutReflexive) {
        // Two layers were removed, so the inner confidence is discounted.
        return normalized(innerResult.stem, innerResult.confidence * 0.95, {
          removedSuffixes: [suffix, ...(innerResult.metadata?.removedSuffixes ?? [])],
          conjugationType: 'reflexive',
        });
      }
    }

    return null;
  }

  /**
   * Try to normalize a conjugated verb to its infinitive.
   *
   * Uses the first matching rule of ALL_ENDINGS; the rule's `stem` replaces
   * the removed ending. Returns null when no rule matches.
   */
  private tryConjugationNormalization(word: string): NormalizationResult | null {
    const rule = this.findEndingRule(word);
    if (!rule) return null;

    const stemBase = word.slice(0, -rule.ending.length);

    return normalized(stemBase + rule.stem, rule.confidence, {
      removedSuffixes: [rule.ending],
      conjugationType: rule.type,
    });
  }
}

// Export singleton instance
export const spanishMorphologicalNormalizer = new SpanishMorphologicalNormalizer();