@lokascript/semantic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +686 -0
  3. package/dist/browser-ar.ar.global.js +2 -0
  4. package/dist/browser-core.core.global.js +2 -0
  5. package/dist/browser-de.de.global.js +2 -0
  6. package/dist/browser-east-asian.east-asian.global.js +2 -0
  7. package/dist/browser-en-tr.en-tr.global.js +2 -0
  8. package/dist/browser-en.en.global.js +2 -0
  9. package/dist/browser-es-en.es-en.global.js +2 -0
  10. package/dist/browser-es.es.global.js +2 -0
  11. package/dist/browser-fr.fr.global.js +2 -0
  12. package/dist/browser-id.id.global.js +2 -0
  13. package/dist/browser-ja.ja.global.js +2 -0
  14. package/dist/browser-ko.ko.global.js +2 -0
  15. package/dist/browser-lazy.lazy.global.js +2 -0
  16. package/dist/browser-priority.priority.global.js +2 -0
  17. package/dist/browser-pt.pt.global.js +2 -0
  18. package/dist/browser-qu.qu.global.js +2 -0
  19. package/dist/browser-sw.sw.global.js +2 -0
  20. package/dist/browser-tr.tr.global.js +2 -0
  21. package/dist/browser-western.western.global.js +2 -0
  22. package/dist/browser-zh.zh.global.js +2 -0
  23. package/dist/browser.global.js +3 -0
  24. package/dist/browser.global.js.map +1 -0
  25. package/dist/index.cjs +35051 -0
  26. package/dist/index.cjs.map +1 -0
  27. package/dist/index.d.cts +3426 -0
  28. package/dist/index.d.ts +3426 -0
  29. package/dist/index.js +34890 -0
  30. package/dist/index.js.map +1 -0
  31. package/dist/languages/ar.d.ts +78 -0
  32. package/dist/languages/ar.js +1622 -0
  33. package/dist/languages/ar.js.map +1 -0
  34. package/dist/languages/de.d.ts +38 -0
  35. package/dist/languages/de.js +1168 -0
  36. package/dist/languages/de.js.map +1 -0
  37. package/dist/languages/en.d.ts +44 -0
  38. package/dist/languages/en.js +3491 -0
  39. package/dist/languages/en.js.map +1 -0
  40. package/dist/languages/es.d.ts +52 -0
  41. package/dist/languages/es.js +1493 -0
  42. package/dist/languages/es.js.map +1 -0
  43. package/dist/languages/fr.d.ts +37 -0
  44. package/dist/languages/fr.js +1159 -0
  45. package/dist/languages/fr.js.map +1 -0
  46. package/dist/languages/id.d.ts +35 -0
  47. package/dist/languages/id.js +1152 -0
  48. package/dist/languages/id.js.map +1 -0
  49. package/dist/languages/ja.d.ts +53 -0
  50. package/dist/languages/ja.js +1430 -0
  51. package/dist/languages/ja.js.map +1 -0
  52. package/dist/languages/ko.d.ts +51 -0
  53. package/dist/languages/ko.js +1729 -0
  54. package/dist/languages/ko.js.map +1 -0
  55. package/dist/languages/pt.d.ts +37 -0
  56. package/dist/languages/pt.js +1127 -0
  57. package/dist/languages/pt.js.map +1 -0
  58. package/dist/languages/qu.d.ts +36 -0
  59. package/dist/languages/qu.js +1143 -0
  60. package/dist/languages/qu.js.map +1 -0
  61. package/dist/languages/sw.d.ts +35 -0
  62. package/dist/languages/sw.js +1147 -0
  63. package/dist/languages/sw.js.map +1 -0
  64. package/dist/languages/tr.d.ts +45 -0
  65. package/dist/languages/tr.js +1529 -0
  66. package/dist/languages/tr.js.map +1 -0
  67. package/dist/languages/zh.d.ts +58 -0
  68. package/dist/languages/zh.js +1257 -0
  69. package/dist/languages/zh.js.map +1 -0
  70. package/dist/types-C4dcj53L.d.ts +600 -0
  71. package/package.json +202 -0
  72. package/src/__test-utils__/index.ts +7 -0
  73. package/src/__test-utils__/test-helpers.ts +8 -0
  74. package/src/__types__/test-helpers.ts +122 -0
  75. package/src/analysis/index.ts +479 -0
  76. package/src/ast-builder/command-mappers.ts +1133 -0
  77. package/src/ast-builder/expression-parser/index.ts +41 -0
  78. package/src/ast-builder/expression-parser/parser.ts +563 -0
  79. package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
  80. package/src/ast-builder/expression-parser/types.ts +208 -0
  81. package/src/ast-builder/index.ts +536 -0
  82. package/src/ast-builder/value-converters.ts +172 -0
  83. package/src/bridge.ts +275 -0
  84. package/src/browser-ar.ts +162 -0
  85. package/src/browser-core.ts +231 -0
  86. package/src/browser-de.ts +162 -0
  87. package/src/browser-east-asian.ts +173 -0
  88. package/src/browser-en-tr.ts +165 -0
  89. package/src/browser-en.ts +157 -0
  90. package/src/browser-es-en.ts +200 -0
  91. package/src/browser-es.ts +170 -0
  92. package/src/browser-fr.ts +162 -0
  93. package/src/browser-id.ts +162 -0
  94. package/src/browser-ja.ts +162 -0
  95. package/src/browser-ko.ts +162 -0
  96. package/src/browser-lazy.ts +189 -0
  97. package/src/browser-priority.ts +214 -0
  98. package/src/browser-pt.ts +162 -0
  99. package/src/browser-qu.ts +162 -0
  100. package/src/browser-sw.ts +162 -0
  101. package/src/browser-tr.ts +162 -0
  102. package/src/browser-western.ts +181 -0
  103. package/src/browser-zh.ts +162 -0
  104. package/src/browser.ts +268 -0
  105. package/src/cache/index.ts +14 -0
  106. package/src/cache/semantic-cache.ts +344 -0
  107. package/src/core-bridge.ts +372 -0
  108. package/src/explicit/converter.ts +258 -0
  109. package/src/explicit/index.ts +18 -0
  110. package/src/explicit/parser.ts +236 -0
  111. package/src/explicit/renderer.ts +424 -0
  112. package/src/generators/command-schemas.ts +1636 -0
  113. package/src/generators/event-handler-generator.ts +109 -0
  114. package/src/generators/index.ts +117 -0
  115. package/src/generators/language-profiles.ts +139 -0
  116. package/src/generators/pattern-generator.ts +537 -0
  117. package/src/generators/profiles/arabic.ts +131 -0
  118. package/src/generators/profiles/bengali.ts +132 -0
  119. package/src/generators/profiles/chinese.ts +124 -0
  120. package/src/generators/profiles/english.ts +113 -0
  121. package/src/generators/profiles/french.ts +125 -0
  122. package/src/generators/profiles/german.ts +126 -0
  123. package/src/generators/profiles/hindi.ts +146 -0
  124. package/src/generators/profiles/index.ts +46 -0
  125. package/src/generators/profiles/indonesian.ts +125 -0
  126. package/src/generators/profiles/italian.ts +139 -0
  127. package/src/generators/profiles/japanese.ts +149 -0
  128. package/src/generators/profiles/korean.ts +127 -0
  129. package/src/generators/profiles/marker-templates.ts +288 -0
  130. package/src/generators/profiles/ms.ts +130 -0
  131. package/src/generators/profiles/polish.ts +249 -0
  132. package/src/generators/profiles/portuguese.ts +115 -0
  133. package/src/generators/profiles/quechua.ts +113 -0
  134. package/src/generators/profiles/russian.ts +260 -0
  135. package/src/generators/profiles/spanish.ts +130 -0
  136. package/src/generators/profiles/swahili.ts +129 -0
  137. package/src/generators/profiles/thai.ts +132 -0
  138. package/src/generators/profiles/tl.ts +128 -0
  139. package/src/generators/profiles/turkish.ts +124 -0
  140. package/src/generators/profiles/types.ts +165 -0
  141. package/src/generators/profiles/ukrainian.ts +270 -0
  142. package/src/generators/profiles/vietnamese.ts +133 -0
  143. package/src/generators/schema-error-codes.ts +160 -0
  144. package/src/generators/schema-validator.ts +391 -0
  145. package/src/index.ts +429 -0
  146. package/src/language-building-schema.ts +3170 -0
  147. package/src/language-loader.ts +394 -0
  148. package/src/languages/_all.ts +65 -0
  149. package/src/languages/ar.ts +15 -0
  150. package/src/languages/bn.ts +16 -0
  151. package/src/languages/de.ts +15 -0
  152. package/src/languages/en.ts +29 -0
  153. package/src/languages/es.ts +15 -0
  154. package/src/languages/fr.ts +15 -0
  155. package/src/languages/hi.ts +26 -0
  156. package/src/languages/id.ts +15 -0
  157. package/src/languages/index.ts +18 -0
  158. package/src/languages/it.ts +15 -0
  159. package/src/languages/ja.ts +15 -0
  160. package/src/languages/ko.ts +15 -0
  161. package/src/languages/ms.ts +16 -0
  162. package/src/languages/pl.ts +18 -0
  163. package/src/languages/pt.ts +15 -0
  164. package/src/languages/qu.ts +15 -0
  165. package/src/languages/ru.ts +26 -0
  166. package/src/languages/sw.ts +15 -0
  167. package/src/languages/th.ts +16 -0
  168. package/src/languages/tl.ts +16 -0
  169. package/src/languages/tr.ts +15 -0
  170. package/src/languages/uk.ts +26 -0
  171. package/src/languages/vi.ts +16 -0
  172. package/src/languages/zh.ts +15 -0
  173. package/src/parser/index.ts +15 -0
  174. package/src/parser/pattern-matcher.ts +1181 -0
  175. package/src/parser/semantic-parser.ts +573 -0
  176. package/src/parser/utils/index.ts +35 -0
  177. package/src/parser/utils/marker-resolution.ts +111 -0
  178. package/src/parser/utils/possessive-keywords.ts +43 -0
  179. package/src/parser/utils/role-positioning.ts +70 -0
  180. package/src/parser/utils/type-validation.ts +134 -0
  181. package/src/patterns/add/ar.ts +71 -0
  182. package/src/patterns/add/bn.ts +70 -0
  183. package/src/patterns/add/hi.ts +69 -0
  184. package/src/patterns/add/index.ts +87 -0
  185. package/src/patterns/add/it.ts +61 -0
  186. package/src/patterns/add/ja.ts +93 -0
  187. package/src/patterns/add/ko.ts +74 -0
  188. package/src/patterns/add/ms.ts +30 -0
  189. package/src/patterns/add/pl.ts +62 -0
  190. package/src/patterns/add/ru.ts +62 -0
  191. package/src/patterns/add/th.ts +49 -0
  192. package/src/patterns/add/tl.ts +30 -0
  193. package/src/patterns/add/tr.ts +71 -0
  194. package/src/patterns/add/uk.ts +62 -0
  195. package/src/patterns/add/vi.ts +61 -0
  196. package/src/patterns/add/zh.ts +71 -0
  197. package/src/patterns/builders.ts +207 -0
  198. package/src/patterns/decrement/bn.ts +70 -0
  199. package/src/patterns/decrement/de.ts +42 -0
  200. package/src/patterns/decrement/hi.ts +68 -0
  201. package/src/patterns/decrement/index.ts +79 -0
  202. package/src/patterns/decrement/it.ts +69 -0
  203. package/src/patterns/decrement/ms.ts +30 -0
  204. package/src/patterns/decrement/pl.ts +58 -0
  205. package/src/patterns/decrement/ru.ts +58 -0
  206. package/src/patterns/decrement/th.ts +49 -0
  207. package/src/patterns/decrement/tl.ts +30 -0
  208. package/src/patterns/decrement/tr.ts +48 -0
  209. package/src/patterns/decrement/uk.ts +58 -0
  210. package/src/patterns/decrement/vi.ts +61 -0
  211. package/src/patterns/decrement/zh.ts +32 -0
  212. package/src/patterns/en.ts +302 -0
  213. package/src/patterns/event-handler/ar.ts +151 -0
  214. package/src/patterns/event-handler/bn.ts +72 -0
  215. package/src/patterns/event-handler/de.ts +117 -0
  216. package/src/patterns/event-handler/en.ts +117 -0
  217. package/src/patterns/event-handler/es.ts +136 -0
  218. package/src/patterns/event-handler/fr.ts +117 -0
  219. package/src/patterns/event-handler/hi.ts +64 -0
  220. package/src/patterns/event-handler/id.ts +117 -0
  221. package/src/patterns/event-handler/index.ts +119 -0
  222. package/src/patterns/event-handler/it.ts +54 -0
  223. package/src/patterns/event-handler/ja.ts +118 -0
  224. package/src/patterns/event-handler/ko.ts +133 -0
  225. package/src/patterns/event-handler/ms.ts +30 -0
  226. package/src/patterns/event-handler/pl.ts +62 -0
  227. package/src/patterns/event-handler/pt.ts +117 -0
  228. package/src/patterns/event-handler/qu.ts +66 -0
  229. package/src/patterns/event-handler/ru.ts +62 -0
  230. package/src/patterns/event-handler/shared.ts +270 -0
  231. package/src/patterns/event-handler/sw.ts +117 -0
  232. package/src/patterns/event-handler/th.ts +53 -0
  233. package/src/patterns/event-handler/tl.ts +30 -0
  234. package/src/patterns/event-handler/tr.ts +170 -0
  235. package/src/patterns/event-handler/uk.ts +62 -0
  236. package/src/patterns/event-handler/vi.ts +61 -0
  237. package/src/patterns/event-handler/zh.ts +150 -0
  238. package/src/patterns/get/ar.ts +49 -0
  239. package/src/patterns/get/bn.ts +47 -0
  240. package/src/patterns/get/de.ts +32 -0
  241. package/src/patterns/get/hi.ts +52 -0
  242. package/src/patterns/get/index.ts +83 -0
  243. package/src/patterns/get/it.ts +56 -0
  244. package/src/patterns/get/ja.ts +53 -0
  245. package/src/patterns/get/ko.ts +53 -0
  246. package/src/patterns/get/ms.ts +30 -0
  247. package/src/patterns/get/pl.ts +57 -0
  248. package/src/patterns/get/ru.ts +57 -0
  249. package/src/patterns/get/th.ts +29 -0
  250. package/src/patterns/get/tl.ts +30 -0
  251. package/src/patterns/get/uk.ts +57 -0
  252. package/src/patterns/get/vi.ts +48 -0
  253. package/src/patterns/grammar-transformed/index.ts +39 -0
  254. package/src/patterns/grammar-transformed/ja.ts +1713 -0
  255. package/src/patterns/grammar-transformed/ko.ts +1311 -0
  256. package/src/patterns/grammar-transformed/tr.ts +1067 -0
  257. package/src/patterns/hide/ar.ts +67 -0
  258. package/src/patterns/hide/bn.ts +47 -0
  259. package/src/patterns/hide/de.ts +36 -0
  260. package/src/patterns/hide/hi.ts +61 -0
  261. package/src/patterns/hide/index.ts +91 -0
  262. package/src/patterns/hide/it.ts +56 -0
  263. package/src/patterns/hide/ja.ts +69 -0
  264. package/src/patterns/hide/ko.ts +69 -0
  265. package/src/patterns/hide/ms.ts +30 -0
  266. package/src/patterns/hide/pl.ts +57 -0
  267. package/src/patterns/hide/ru.ts +57 -0
  268. package/src/patterns/hide/th.ts +29 -0
  269. package/src/patterns/hide/tl.ts +30 -0
  270. package/src/patterns/hide/tr.ts +65 -0
  271. package/src/patterns/hide/uk.ts +57 -0
  272. package/src/patterns/hide/vi.ts +56 -0
  273. package/src/patterns/hide/zh.ts +68 -0
  274. package/src/patterns/increment/bn.ts +70 -0
  275. package/src/patterns/increment/de.ts +36 -0
  276. package/src/patterns/increment/hi.ts +68 -0
  277. package/src/patterns/increment/index.ts +79 -0
  278. package/src/patterns/increment/it.ts +69 -0
  279. package/src/patterns/increment/ms.ts +30 -0
  280. package/src/patterns/increment/pl.ts +58 -0
  281. package/src/patterns/increment/ru.ts +58 -0
  282. package/src/patterns/increment/th.ts +49 -0
  283. package/src/patterns/increment/tl.ts +30 -0
  284. package/src/patterns/increment/tr.ts +52 -0
  285. package/src/patterns/increment/uk.ts +58 -0
  286. package/src/patterns/increment/vi.ts +61 -0
  287. package/src/patterns/increment/zh.ts +32 -0
  288. package/src/patterns/index.ts +84 -0
  289. package/src/patterns/languages/en/control-flow.ts +93 -0
  290. package/src/patterns/languages/en/fetch.ts +62 -0
  291. package/src/patterns/languages/en/index.ts +42 -0
  292. package/src/patterns/languages/en/repeat.ts +67 -0
  293. package/src/patterns/languages/en/set.ts +48 -0
  294. package/src/patterns/languages/en/swap.ts +38 -0
  295. package/src/patterns/languages/en/temporal.ts +57 -0
  296. package/src/patterns/put/ar.ts +74 -0
  297. package/src/patterns/put/bn.ts +53 -0
  298. package/src/patterns/put/en.ts +74 -0
  299. package/src/patterns/put/es.ts +74 -0
  300. package/src/patterns/put/hi.ts +69 -0
  301. package/src/patterns/put/id.ts +96 -0
  302. package/src/patterns/put/index.ts +99 -0
  303. package/src/patterns/put/it.ts +56 -0
  304. package/src/patterns/put/ja.ts +75 -0
  305. package/src/patterns/put/ko.ts +67 -0
  306. package/src/patterns/put/ms.ts +30 -0
  307. package/src/patterns/put/pl.ts +81 -0
  308. package/src/patterns/put/ru.ts +85 -0
  309. package/src/patterns/put/th.ts +32 -0
  310. package/src/patterns/put/tl.ts +30 -0
  311. package/src/patterns/put/tr.ts +67 -0
  312. package/src/patterns/put/uk.ts +85 -0
  313. package/src/patterns/put/vi.ts +72 -0
  314. package/src/patterns/put/zh.ts +62 -0
  315. package/src/patterns/registry.ts +163 -0
  316. package/src/patterns/remove/ar.ts +71 -0
  317. package/src/patterns/remove/bn.ts +68 -0
  318. package/src/patterns/remove/hi.ts +69 -0
  319. package/src/patterns/remove/index.ts +87 -0
  320. package/src/patterns/remove/it.ts +69 -0
  321. package/src/patterns/remove/ja.ts +74 -0
  322. package/src/patterns/remove/ko.ts +78 -0
  323. package/src/patterns/remove/ms.ts +30 -0
  324. package/src/patterns/remove/pl.ts +62 -0
  325. package/src/patterns/remove/ru.ts +62 -0
  326. package/src/patterns/remove/th.ts +49 -0
  327. package/src/patterns/remove/tl.ts +30 -0
  328. package/src/patterns/remove/tr.ts +78 -0
  329. package/src/patterns/remove/uk.ts +62 -0
  330. package/src/patterns/remove/vi.ts +61 -0
  331. package/src/patterns/remove/zh.ts +72 -0
  332. package/src/patterns/set/ar.ts +84 -0
  333. package/src/patterns/set/bn.ts +53 -0
  334. package/src/patterns/set/de.ts +84 -0
  335. package/src/patterns/set/es.ts +92 -0
  336. package/src/patterns/set/fr.ts +88 -0
  337. package/src/patterns/set/hi.ts +56 -0
  338. package/src/patterns/set/id.ts +84 -0
  339. package/src/patterns/set/index.ts +107 -0
  340. package/src/patterns/set/it.ts +56 -0
  341. package/src/patterns/set/ja.ts +86 -0
  342. package/src/patterns/set/ko.ts +85 -0
  343. package/src/patterns/set/ms.ts +30 -0
  344. package/src/patterns/set/pl.ts +57 -0
  345. package/src/patterns/set/pt.ts +84 -0
  346. package/src/patterns/set/ru.ts +57 -0
  347. package/src/patterns/set/th.ts +31 -0
  348. package/src/patterns/set/tl.ts +30 -0
  349. package/src/patterns/set/tr.ts +107 -0
  350. package/src/patterns/set/uk.ts +57 -0
  351. package/src/patterns/set/vi.ts +53 -0
  352. package/src/patterns/set/zh.ts +84 -0
  353. package/src/patterns/show/ar.ts +67 -0
  354. package/src/patterns/show/bn.ts +47 -0
  355. package/src/patterns/show/de.ts +32 -0
  356. package/src/patterns/show/fr.ts +32 -0
  357. package/src/patterns/show/hi.ts +61 -0
  358. package/src/patterns/show/index.ts +95 -0
  359. package/src/patterns/show/it.ts +56 -0
  360. package/src/patterns/show/ja.ts +69 -0
  361. package/src/patterns/show/ko.ts +73 -0
  362. package/src/patterns/show/ms.ts +30 -0
  363. package/src/patterns/show/pl.ts +57 -0
  364. package/src/patterns/show/ru.ts +57 -0
  365. package/src/patterns/show/th.ts +29 -0
  366. package/src/patterns/show/tl.ts +30 -0
  367. package/src/patterns/show/tr.ts +65 -0
  368. package/src/patterns/show/uk.ts +57 -0
  369. package/src/patterns/show/vi.ts +56 -0
  370. package/src/patterns/show/zh.ts +68 -0
  371. package/src/patterns/take/ar.ts +51 -0
  372. package/src/patterns/take/index.ts +31 -0
  373. package/src/patterns/toggle/ar.ts +61 -0
  374. package/src/patterns/toggle/bn.ts +70 -0
  375. package/src/patterns/toggle/en.ts +61 -0
  376. package/src/patterns/toggle/es.ts +61 -0
  377. package/src/patterns/toggle/hi.ts +80 -0
  378. package/src/patterns/toggle/index.ts +95 -0
  379. package/src/patterns/toggle/it.ts +69 -0
  380. package/src/patterns/toggle/ja.ts +156 -0
  381. package/src/patterns/toggle/ko.ts +113 -0
  382. package/src/patterns/toggle/ms.ts +30 -0
  383. package/src/patterns/toggle/pl.ts +62 -0
  384. package/src/patterns/toggle/ru.ts +62 -0
  385. package/src/patterns/toggle/th.ts +50 -0
  386. package/src/patterns/toggle/tl.ts +30 -0
  387. package/src/patterns/toggle/tr.ts +88 -0
  388. package/src/patterns/toggle/uk.ts +62 -0
  389. package/src/patterns/toggle/vi.ts +61 -0
  390. package/src/patterns/toggle/zh.ts +99 -0
  391. package/src/public-api.ts +286 -0
  392. package/src/registry.ts +441 -0
  393. package/src/tokenizers/arabic.ts +723 -0
  394. package/src/tokenizers/base.ts +1300 -0
  395. package/src/tokenizers/bengali.ts +289 -0
  396. package/src/tokenizers/chinese.ts +481 -0
  397. package/src/tokenizers/english.ts +416 -0
  398. package/src/tokenizers/french.ts +326 -0
  399. package/src/tokenizers/german.ts +324 -0
  400. package/src/tokenizers/hindi.ts +319 -0
  401. package/src/tokenizers/index.ts +127 -0
  402. package/src/tokenizers/indonesian.ts +306 -0
  403. package/src/tokenizers/italian.ts +458 -0
  404. package/src/tokenizers/japanese.ts +447 -0
  405. package/src/tokenizers/korean.ts +642 -0
  406. package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
  407. package/src/tokenizers/morphology/french-normalizer.ts +268 -0
  408. package/src/tokenizers/morphology/german-normalizer.ts +256 -0
  409. package/src/tokenizers/morphology/index.ts +46 -0
  410. package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
  411. package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
  412. package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
  413. package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
  414. package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
  415. package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
  416. package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
  417. package/src/tokenizers/morphology/types.ts +211 -0
  418. package/src/tokenizers/ms.ts +198 -0
  419. package/src/tokenizers/polish.ts +354 -0
  420. package/src/tokenizers/portuguese.ts +304 -0
  421. package/src/tokenizers/quechua.ts +339 -0
  422. package/src/tokenizers/russian.ts +375 -0
  423. package/src/tokenizers/spanish.ts +403 -0
  424. package/src/tokenizers/swahili.ts +303 -0
  425. package/src/tokenizers/thai.ts +236 -0
  426. package/src/tokenizers/tl.ts +198 -0
  427. package/src/tokenizers/turkish.ts +411 -0
  428. package/src/tokenizers/ukrainian.ts +369 -0
  429. package/src/tokenizers/vietnamese.ts +410 -0
  430. package/src/types/grammar-types.ts +617 -0
  431. package/src/types/unified-profile.ts +267 -0
  432. package/src/types.ts +709 -0
  433. package/src/utils/confidence-calculator.ts +147 -0
  434. package/src/validators/command-validator.ts +380 -0
  435. package/src/validators/index.ts +15 -0
@@ -0,0 +1,412 @@
1
+ /**
2
+ * Turkish Morphological Normalizer
3
+ *
4
+ * Turkish is a highly agglutinative language with strict vowel harmony.
5
+ * Suffixes attach in sequence and their vowels change based on the last
6
+ * vowel of the stem (front/back, rounded/unrounded).
7
+ *
8
+ * Vowel Harmony Rules:
9
+ * - Back vowels (a, ı, o, u) take back vowel suffixes
10
+ * - Front vowels (e, i, ö, ü) take front vowel suffixes
11
+ *
12
+ * Common verb suffixes:
13
+ * - Infinitive: -mak/-mek (değiştirmek = to change)
14
+ * - Present continuous: -iyor/-ıyor/-üyor/-uyor (değiştiriyor = is changing)
15
+ * - Past: -di/-dı/-dü/-du (değiştirdi = changed)
16
+ * - Reported past: -miş/-mış/-müş/-muş (değiştirmiş = apparently changed)
17
+ * - Future: -ecek/-acak (değiştirecek = will change)
18
+ * - Negation: -me/-ma before tense (değiştirmiyor = is not changing)
19
+ * - Passive: -il/-ıl/-ül/-ul (değiştirildi = was changed)
20
+ * - Causative: -tir/-tır/-tür/-tur (değiştirtmek = to make change)
21
+ *
22
+ * Person suffixes (after tense):
23
+ * - 1sg: -im/-ım/-üm/-um or -m (yapıyorum = I am doing)
24
+ * - 2sg: -sin/-sın/-sün/-sun (yapıyorsun = you are doing)
25
+ * - 3sg: (no suffix) (yapıyor = he/she is doing)
26
+ * - 1pl: -iz/-ız/-üz/-uz (yapıyoruz = we are doing)
27
+ * - 2pl: -siniz/-sınız/-sünüz/-sunuz (yapıyorsunuz = you all are doing)
28
+ * - 3pl: -ler/-lar (yapıyorlar = they are doing)
29
+ *
30
+ * Examples:
31
+ * değiştiriyorum → değiştir (I am changing)
32
+ * değiştirmek → değiştir (to change)
33
+ * gösterdi → göster (showed)
34
+ * gizleniyor → gizle (is being hidden)
35
+ */
36
+
37
+ import type {
38
+ MorphologicalNormalizer,
39
+ NormalizationResult,
40
+ SuffixRule,
41
+ ConjugationType,
42
+ } from './types';
43
+ import { noChange, normalized } from './types';
44
+
45
/**
 * Check whether `char` is a single letter of the Latin-based Turkish script.
 *
 * Accepted letters are the basic Latin range (A-Z, a-z) plus the
 * Turkish-specific letters ç, ğ, ı, ö, ş, ü in both cases (including the
 * dotted capital İ).
 *
 * @param char - The string to test; must be exactly one UTF-16 code unit.
 * @returns `true` only when `char` is one letter from the set above;
 *          `false` for the empty string and for multi-character input.
 */
function isTurkishLetter(char: string): boolean {
  // `String.prototype.includes` reports true for '' and for any
  // multi-character substring (e.g. 'çÇ'), so reject anything that is not
  // exactly one code unit before doing the membership checks.
  if (char.length !== 1) {
    return false;
  }

  const code = char.charCodeAt(0);
  // Basic Latin letters
  if ((code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a)) {
    return true;
  }
  // Turkish special characters
  const turkishChars = 'çÇğĞıİöÖşŞüÜ';
  return turkishChars.includes(char);
}
59
+
60
/**
 * Report whether at least one character of `word` is accepted by
 * `isTurkishLetter` (basic Latin letters or Turkish special letters).
 *
 * The word is walked by Unicode code point, matching string iteration.
 *
 * @param word - Text to scan.
 * @returns `true` as soon as any character qualifies, otherwise `false`
 *          (including for the empty string).
 */
function containsTurkish(word: string): boolean {
  return Array.from(word).some(symbol => isTurkishLetter(symbol));
}
69
+
70
/**
 * Check whether `char` is a single Turkish back vowel: a, ı, o, u
 * (or their capitals A, I, O, U — note that capital `I` is the dotless ı).
 *
 * @param char - The string to test; must be exactly one UTF-16 code unit.
 * @returns `true` only for one of the eight back-vowel letters; `false` for
 *          the empty string and for multi-character input.
 */
function isBackVowel(char: string): boolean {
  // Guard first: `'...'.includes('')` is always true and a multi-character
  // argument such as 'ou' would match as a substring.
  if (char.length !== 1) {
    return false;
  }
  return 'aıouAIOU'.includes(char);
}
76
+
77
/**
 * Check whether `char` is a single Turkish front vowel: e, i, ö, ü
 * (or their capitals E, İ, Ö, Ü — note that the capital of `i` is dotted İ).
 *
 * @param char - The string to test; must be exactly one UTF-16 code unit.
 * @returns `true` only for one of the eight front-vowel letters; `false` for
 *          the empty string and for multi-character input.
 */
function isFrontVowel(char: string): boolean {
  // Guard first: `'...'.includes('')` is always true and a multi-character
  // argument such as 'ei' would match as a substring.
  if (char.length !== 1) {
    return false;
  }
  return 'eiöüEİÖÜ'.includes(char);
}
83
+
84
/**
 * Decide whether `char` is a Turkish vowel of either harmony class.
 *
 * @param char - Character to classify.
 * @returns `true` when `isBackVowel` or `isFrontVowel` accepts it.
 */
function isVowel(char: string): boolean {
  if (isBackVowel(char)) {
    return true;
  }
  return isFrontVowel(char);
}
90
+
91
/**
 * Find the vowel closest to the end of `word`.
 *
 * The scan runs right-to-left over UTF-16 code units and stops at the first
 * one that `isVowel` accepts.
 *
 * @param word - Word (or stem) to inspect.
 * @returns The last vowel character, or `null` when the word has no vowel.
 */
function getLastVowel(word: string): string | null {
  let index = word.length;
  // Post-decrement: compare first, then step down to the next index to inspect.
  while (index-- > 0) {
    const candidate = word.charAt(index);
    if (isVowel(candidate)) {
      return candidate;
    }
  }
  return null;
}
102
+
103
/**
 * Test whether `suffix` agrees with `stem` under front/back vowel harmony.
 *
 * The last vowel of the stem is compared with the first vowel of the suffix:
 * both must be back vowels, or both must be front vowels. Used to sanity-check
 * that a candidate suffix plausibly belongs to the stem.
 *
 * @param stem - The candidate stem.
 * @param suffix - The candidate suffix attached to that stem.
 * @returns `true` when the two vowels are in the same harmony class, and also
 *          when either side has no vowel (nothing to validate against).
 */
function matchesVowelHarmony(stem: string, suffix: string): boolean {
  const stemVowel = getLastVowel(stem);
  if (!stemVowel) {
    // No vowel in stem, can't validate
    return true;
  }

  // Locate the first vowel of the suffix, scanning code unit by code unit.
  let suffixVowel = '';
  for (let pos = 0; pos < suffix.length; pos += 1) {
    const unit = suffix.charAt(pos);
    if (isVowel(unit)) {
      suffixVowel = unit;
      break;
    }
  }
  if (!suffixVowel) {
    // No vowel in suffix, can't validate
    return true;
  }

  // Back-vowel stems need a back-vowel suffix; otherwise a front-vowel one.
  return isBackVowel(stemVowel) ? isBackVowel(suffixVowel) : isFrontVowel(suffixVowel);
}
121
+
122
+ /**
123
+ * Suffix rules for Turkish verb conjugation.
124
+ * Each pattern includes all vowel harmony variants.
125
+ * Ordered by length (longest first) to ensure greedy matching.
126
+ */
127
+ const TURKISH_SUFFIX_RULES: readonly SuffixRule[] = [
128
+ // Compound tense + person (longest patterns first)
129
+ // Present continuous + person
130
+ { pattern: 'iyorsunuz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
131
+ { pattern: 'ıyorsunuz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
132
+ { pattern: 'üyorsunuz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
133
+ { pattern: 'uyorsunuz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
134
+ { pattern: 'iyorsun', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
135
+ { pattern: 'ıyorsun', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
136
+ { pattern: 'üyorsun', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
137
+ { pattern: 'uyorsun', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
138
+ { pattern: 'iyoruz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
139
+ { pattern: 'ıyoruz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
140
+ { pattern: 'üyoruz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
141
+ { pattern: 'uyoruz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
142
+ { pattern: 'iyorum', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
143
+ { pattern: 'ıyorum', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
144
+ { pattern: 'üyorum', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
145
+ { pattern: 'uyorum', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
146
+ { pattern: 'iyorlar', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
147
+ { pattern: 'ıyorlar', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
148
+ { pattern: 'üyorlar', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
149
+ { pattern: 'uyorlar', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
150
+
151
+ // Future tense + person
152
+ { pattern: 'eceksiniz', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
153
+ { pattern: 'acaksınız', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
154
+ { pattern: 'eceksin', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
155
+ { pattern: 'acaksın', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
156
+ { pattern: 'eceğiz', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
157
+ { pattern: 'acağız', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
158
+ { pattern: 'eceğim', confidence: 0.85, conjugationType: 'future', minStemLength: 2 },
159
+ { pattern: 'acağım', confidence: 0.85, conjugationType: 'future', minStemLength: 2 },
160
+ { pattern: 'ecekler', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
161
+ { pattern: 'acaklar', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
162
+
163
+ // Reported past + person
164
+ { pattern: 'mişsiniz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
165
+ { pattern: 'mışsınız', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
166
+ { pattern: 'müşsünüz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
167
+ { pattern: 'muşsunuz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
168
+ { pattern: 'mişsin', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
169
+ { pattern: 'mışsın', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
170
+ { pattern: 'müşsün', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
171
+ { pattern: 'muşsun', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
172
+ { pattern: 'mişiz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
173
+ { pattern: 'mışız', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
174
+ { pattern: 'müşüz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
175
+ { pattern: 'muşuz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
176
+ { pattern: 'mişim', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
177
+ { pattern: 'mışım', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
178
+ { pattern: 'müşüm', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
179
+ { pattern: 'muşum', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
180
+ { pattern: 'mişler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
181
+ { pattern: 'mışlar', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
182
+ { pattern: 'müşler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
183
+ { pattern: 'muşlar', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
184
+
185
+ // Past tense + person
186
+ { pattern: 'diniz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
187
+ { pattern: 'dınız', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
188
+ { pattern: 'dünüz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
189
+ { pattern: 'dunuz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
190
+ { pattern: 'tiniz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
191
+ { pattern: 'tınız', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
192
+ { pattern: 'tünüz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
193
+ { pattern: 'tunuz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
194
+ { pattern: 'diler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
195
+ { pattern: 'dılar', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
196
+ { pattern: 'düler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
197
+ { pattern: 'dular', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
198
+ { pattern: 'tiler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
199
+ { pattern: 'tılar', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
200
+ { pattern: 'tüler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
201
+ { pattern: 'tular', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
202
+ { pattern: 'din', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
203
+ { pattern: 'dın', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
204
+ { pattern: 'dün', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
205
+ { pattern: 'dun', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
206
+ { pattern: 'tin', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
207
+ { pattern: 'tın', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
208
+ { pattern: 'tün', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
209
+ { pattern: 'tun', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
210
+ { pattern: 'dik', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
211
+ { pattern: 'dık', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
212
+ { pattern: 'dük', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
213
+ { pattern: 'duk', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
214
+ { pattern: 'tik', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
215
+ { pattern: 'tık', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
216
+ { pattern: 'tük', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
217
+ { pattern: 'tuk', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
218
+ { pattern: 'dim', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
219
+ { pattern: 'dım', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
220
+ { pattern: 'düm', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
221
+ { pattern: 'dum', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
222
+ { pattern: 'tim', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
223
+ { pattern: 'tım', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
224
+ { pattern: 'tüm', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
225
+ { pattern: 'tum', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
226
+
227
+ // Present continuous (no person - 3rd person singular)
228
+ { pattern: 'iyor', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
229
+ { pattern: 'ıyor', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
230
+ { pattern: 'üyor', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
231
+ { pattern: 'uyor', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
232
+
233
+ // Future (no person - 3rd person singular)
234
+ { pattern: 'ecek', confidence: 0.85, conjugationType: 'future', minStemLength: 2 },
235
+ { pattern: 'acak', confidence: 0.85, conjugationType: 'future', minStemLength: 2 },
236
+
237
+ // Reported past (no person - 3rd person singular)
238
+ { pattern: 'miş', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
239
+ { pattern: 'mış', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
240
+ { pattern: 'müş', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
241
+ { pattern: 'muş', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
242
+
243
+ // Simple past (no person - 3rd person singular)
244
+ { pattern: 'di', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
245
+ { pattern: 'dı', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
246
+ { pattern: 'dü', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
247
+ { pattern: 'du', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
248
+ { pattern: 'ti', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
249
+ { pattern: 'tı', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
250
+ { pattern: 'tü', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
251
+ { pattern: 'tu', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
252
+
253
+ // Infinitive
254
+ { pattern: 'mek', confidence: 0.88, conjugationType: 'dictionary', minStemLength: 2 },
255
+ { pattern: 'mak', confidence: 0.88, conjugationType: 'dictionary', minStemLength: 2 },
256
+
257
+ // Optative mood (let me/us...) - -eyim/-ayım/-elim/-alım
258
+ { pattern: 'eyelim', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
259
+ { pattern: 'ayalım', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
260
+ { pattern: 'eyim', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
261
+ { pattern: 'ayım', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
262
+ { pattern: 'elim', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
263
+ { pattern: 'alım', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
264
+
265
+ // Necessitative (must/should) - -meli/-malı
266
+ { pattern: 'melisiniz', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
267
+ { pattern: 'malısınız', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
268
+ { pattern: 'melisin', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
269
+ { pattern: 'malısın', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
270
+ { pattern: 'meliyiz', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
271
+ { pattern: 'malıyız', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
272
+ { pattern: 'meliyim', confidence: 0.85, conjugationType: 'necessitative', minStemLength: 2 },
273
+ { pattern: 'malıyım', confidence: 0.85, conjugationType: 'necessitative', minStemLength: 2 },
274
+ { pattern: 'meliler', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
275
+ { pattern: 'malılar', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
276
+ { pattern: 'meli', confidence: 0.85, conjugationType: 'necessitative', minStemLength: 2 },
277
+ { pattern: 'malı', confidence: 0.85, conjugationType: 'necessitative', minStemLength: 2 },
278
+
279
+ // Ability (can) - -ebil/-abil + tense suffixes
280
+ { pattern: 'ebiliyor', confidence: 0.82, conjugationType: 'potential', minStemLength: 2 },
281
+ { pattern: 'abiliyor', confidence: 0.82, conjugationType: 'potential', minStemLength: 2 },
282
+ { pattern: 'ebilir', confidence: 0.85, conjugationType: 'potential', minStemLength: 2 },
283
+ { pattern: 'abilir', confidence: 0.85, conjugationType: 'potential', minStemLength: 2 },
284
+ { pattern: 'ebildi', confidence: 0.82, conjugationType: 'potential', minStemLength: 2 },
285
+ { pattern: 'abildi', confidence: 0.82, conjugationType: 'potential', minStemLength: 2 },
286
+ { pattern: 'ebilmek', confidence: 0.85, conjugationType: 'potential', minStemLength: 2 },
287
+ { pattern: 'abilmek', confidence: 0.85, conjugationType: 'potential', minStemLength: 2 },
288
+
289
+ // Imperative (2nd person singular is just stem, 2nd person plural has suffix)
290
+ { pattern: 'iniz', confidence: 0.82, conjugationType: 'imperative', minStemLength: 2 },
291
+ { pattern: 'ınız', confidence: 0.82, conjugationType: 'imperative', minStemLength: 2 },
292
+ { pattern: 'ünüz', confidence: 0.82, conjugationType: 'imperative', minStemLength: 2 },
293
+ { pattern: 'unuz', confidence: 0.82, conjugationType: 'imperative', minStemLength: 2 },
294
+ { pattern: 'in', confidence: 0.8, conjugationType: 'imperative', minStemLength: 2 },
295
+ { pattern: 'ın', confidence: 0.8, conjugationType: 'imperative', minStemLength: 2 },
296
+ { pattern: 'ün', confidence: 0.8, conjugationType: 'imperative', minStemLength: 2 },
297
+ { pattern: 'un', confidence: 0.8, conjugationType: 'imperative', minStemLength: 2 },
298
+
299
+ // Passive voice
300
+ { pattern: 'ildi', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
301
+ { pattern: 'ıldı', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
302
+ { pattern: 'üldü', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
303
+ { pattern: 'uldu', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
304
+ { pattern: 'ilir', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
305
+ { pattern: 'ılır', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
306
+ { pattern: 'ülür', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
307
+ { pattern: 'ulur', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
308
+
309
+ // Causative
310
+ { pattern: 'tirmek', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
311
+ { pattern: 'tırmak', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
312
+ { pattern: 'türmek', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
313
+ { pattern: 'turmak', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
314
+ { pattern: 'dirmek', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
315
+ { pattern: 'dırmak', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
316
+ { pattern: 'dürmek', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
317
+ { pattern: 'durmak', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
318
+
319
+ // Negation + tense combinations (very common)
320
+ { pattern: 'miyorsunuz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
321
+ { pattern: 'mıyorsunuz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
322
+ { pattern: 'müyorsunuz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
323
+ { pattern: 'muyorsunuz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
324
+ { pattern: 'miyorsun', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
325
+ { pattern: 'mıyorsun', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
326
+ { pattern: 'müyorsun', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
327
+ { pattern: 'muyorsun', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
328
+ { pattern: 'miyoruz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
329
+ { pattern: 'mıyoruz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
330
+ { pattern: 'müyoruz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
331
+ { pattern: 'muyoruz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
332
+ { pattern: 'miyorum', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
333
+ { pattern: 'mıyorum', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
334
+ { pattern: 'müyorum', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
335
+ { pattern: 'muyorum', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
336
+ { pattern: 'miyor', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
337
+ { pattern: 'mıyor', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
338
+ { pattern: 'müyor', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
339
+ { pattern: 'muyor', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
340
+ { pattern: 'medi', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
341
+ { pattern: 'madı', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
342
+ { pattern: 'me', confidence: 0.75, conjugationType: 'negative', minStemLength: 3 },
343
+ { pattern: 'ma', confidence: 0.75, conjugationType: 'negative', minStemLength: 3 },
344
+ ];
345
+
346
/**
 * Turkish morphological normalizer.
 *
 * Strips at most one suffix from a word: TURKISH_SUFFIX_RULES is scanned in
 * order and the first rule whose pattern terminates the word (and leaves a
 * long enough stem) decides the result.
 */
export class TurkishMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'tr';

  /**
   * Cheap pre-filter for tokens worth passing to {@link normalize}.
   *
   * @param word - The token to inspect
   * @returns true when `containsTurkish(word)` holds and the word has at
   *          least 3 characters
   */
  isNormalizable(word: string): boolean {
    // Reject anything containsTurkish() does not recognise.
    if (!containsTurkish(word)) return false;

    // Words shorter than 3 characters are never normalized.
    if (word.length < 3) return false;

    return true;
  }

  /**
   * Normalize a Turkish word to its stem form.
   *
   * @param word - The word to normalize (any letter case)
   * @returns The lower-cased stem with the rule's confidence (scaled by 0.9
   *          when the suffix fails the vowel-harmony check), or an unchanged
   *          result for the original word when no rule applies
   */
  normalize(word: string): NormalizationResult {
    // Lower-case with Turkish casing rules. The locale-independent
    // toLowerCase() maps 'İ' to 'i' + U+0307 and 'I' to 'i' (not 'ı'), so
    // upper-case Turkish input would never end with any suffix pattern.
    const lowerWord = word.toLocaleLowerCase('tr-TR');

    for (const rule of TURKISH_SUFFIX_RULES) {
      if (!lowerWord.endsWith(rule.pattern)) continue;

      const stem = lowerWord.slice(0, -rule.pattern.length);

      // Skip rules that would leave too little of the word behind.
      const minLength = rule.minStemLength ?? 2;
      if (stem.length < minLength) continue;

      // A vowel-harmony mismatch does not reject the match (there are
      // exceptions); it only lowers the confidence by 10%.
      const confidence = matchesVowelHarmony(stem, rule.pattern)
        ? rule.confidence
        : rule.confidence * 0.9;

      const metadata: { removedSuffixes: string[]; conjugationType?: ConjugationType } = {
        removedSuffixes: [rule.pattern],
      };
      if (rule.conjugationType) {
        metadata.conjugationType = rule.conjugationType;
      }
      return normalized(stem, confidence, metadata);
    }

    // No rule matched: hand back the caller's original word untouched.
    return noChange(word);
  }
}
410
+
411
// Shared, ready-made instance of the Turkish normalizer.
export const turkishMorphologicalNormalizer = new TurkishMorphologicalNormalizer();
@@ -0,0 +1,211 @@
1
+ /**
2
+ * Morphological Normalizer Types
3
+ *
4
+ * Defines interfaces for language-specific morphological analysis.
5
+ * Normalizers reduce conjugated/inflected forms to canonical stems
6
+ * that can be matched against keyword dictionaries.
7
+ */
8
+
9
/**
 * Outcome of running a word through a morphological normalizer.
 */
export interface NormalizationResult {
  /** Stem/root form extracted from the input word. */
  readonly stem: string;

  /** How much the normalization can be trusted, from 0.0 to 1.0. */
  readonly confidence: number;

  /** Details of the transformation, when the normalizer supplies them. */
  readonly metadata?: NormalizationMetadata;
}
22
+
23
/**
 * Describes which morphological transformations produced a stem.
 * Every field is optional.
 */
export interface NormalizationMetadata {
  /** Prefixes stripped from the word. */
  readonly removedPrefixes?: readonly string[];

  /** Suffixes stripped from the word. */
  readonly removedSuffixes?: readonly string[];

  /** Conjugation/inflection category that was detected. */
  readonly conjugationType?: ConjugationType;

  /** Classification of the original (pre-normalization) form. */
  readonly originalForm?: string;

  /** Names of the transformation rules that fired (debugging aid). */
  readonly appliedRules?: readonly string[];
}
42
+
43
/**
 * Categories of verb conjugation/inflection a normalizer can report.
 *
 * Generic categories come first, followed by language-specific ones.
 */
export type ConjugationType =
  // --- Tense ---
  | 'present'
  | 'past'
  | 'future'
  | 'progressive'
  | 'perfect'
  // --- Mood ---
  | 'imperative'
  | 'subjunctive'
  | 'conditional'
  // --- Voice ---
  | 'passive'
  | 'causative'
  // --- Politeness levels (Japanese/Korean) ---
  | 'polite'
  | 'humble'
  | 'honorific'
  // --- Form ---
  | 'negative'
  | 'potential'
  | 'volitional'
  // --- Japanese conditionals ---
  | 'conditional-tara' // たら/したら: if/when (completed action)
  | 'conditional-to' // と/すると: when (habitual/expected)
  | 'conditional-ba' // ば/すれば: if (hypothetical)
  // --- Korean ---
  | 'connective' // 하고, 해서, etc.
  | 'conditional-myeon' // -(으)면: if/when (general conditional)
  | 'temporal-ttae' // -(으)ㄹ 때: when (at the time of)
  | 'causal-nikka' // -(으)니까: because/since
  // --- Korean honorifics (-시- infix) ---
  | 'honorific-conditional' // -하시면: if (honorific)
  | 'honorific-temporal' // -하실 때: when (honorific)
  | 'honorific-causal' // -하시니까: because (honorific)
  | 'honorific-past' // -하셨어요: past (honorific)
  | 'honorific-polite' // -하십니다: polite (honorific)
  // --- Korean sequencing ---
  | 'sequential-after' // -고 나서: after doing
  | 'sequential-before' // -기 전에: before doing
  | 'immediate' // -자마자: as soon as
  | 'obligation' // -아야/어야 해: must do, should do
  // --- Spanish ---
  | 'reflexive'
  | 'reflexive-imperative'
  | 'gerund'
  | 'participle'
  // --- Arabic ---
  | 'conditional-idha' // إذا: if/when (hypothetical)
  | 'temporal-indama' // عندما: when (temporal conjunction)
  | 'temporal-hina' // حين: at the time of
  | 'temporal-lamma' // لمّا: when (past emphasis)
  | 'past-verb' // فعل ماضي: past tense verb
  // --- Turkish ---
  | 'conditional-se' // -se/-sa: if (hypothetical)
  | 'temporal-ince' // -ince/-ınca/-unca/-ünce: when/as
  | 'temporal-dikce' // -dikçe/-dıkça/-dukça/-dükçe: as/while
  | 'aorist' // -ir/-ar: habitual/general
  | 'optative' // -eyim/-ayım/-elim/-alım: let me/us
  | 'necessitative' // -meli/-malı: must/should
  // --- Japanese requests and contractions ---
  | 'request' // てください/でください: polite request
  | 'casual-request' // てくれ/でくれ: casual request
  | 'contracted' // ちゃう/じゃう: contracted completion (てしまう)
  | 'contracted-past' // ちゃった/じゃった: contracted past completion
  // --- Compound / base forms ---
  | 'compound' // Multi-layer suffixes (ていなかった, 하고나서였어)
  | 'te-form' // Japanese て-form
  | 'dictionary'; // Base/infinitive form
115
+
116
/**
 * Contract implemented by each language-specific morphological normalizer.
 *
 * A normalizer tries to reduce an inflected word form to its canonical
 * stem, so that conjugated verbs can be matched against keyword
 * dictionaries that only list base forms.
 *
 * Example (Japanese):
 *   切り替えた (past) → { stem: '切り替え', confidence: 0.85 }
 *   切り替えます (polite) → { stem: '切り替え', confidence: 0.85 }
 *
 * Example (Spanish):
 *   mostrarse (reflexive infinitive) → { stem: 'mostrar', confidence: 0.85 }
 *   alternando (gerund) → { stem: 'alternar', confidence: 0.85 }
 */
export interface MorphologicalNormalizer {
  /** Code of the language this normalizer handles. */
  readonly language: string;

  /**
   * Reduce a word to its canonical stem form.
   *
   * @param word - Word to normalize
   * @returns The stem together with a confidence value
   */
  normalize(word: string): NormalizationResult;

  /**
   * Optional fast check that lets callers skip normalization for tokens
   * that do not look like verb forms.
   *
   * @param word - Word to check
   * @returns true if the word might be a normalizable verb form
   */
  isNormalizable?(word: string): boolean;
}
152
+
153
/**
 * One suffix-stripping rule.
 * Intended for agglutinative languages (Japanese, Korean, Turkish).
 */
export interface SuffixRule {
  /** Suffix text to look for at the end of a word. */
  readonly pattern: string;

  /** Confidence assigned when this suffix is stripped. */
  readonly confidence: number;

  /** Text substituted for the suffix (empty string means plain removal). */
  readonly replacement?: string;

  /** Conjugation type that this suffix signals. */
  readonly conjugationType?: ConjugationType;

  /** Smallest stem length allowed after stripping, to prevent over-stripping. */
  readonly minStemLength?: number;
}
173
+
174
/**
 * One prefix-stripping rule.
 * Mainly used by Arabic for article/conjunction prefixes.
 */
export interface PrefixRule {
  /** Prefix text to look for at the start of a word. */
  readonly pattern: string;

  /** Confidence penalty applied when this prefix is stripped. */
  readonly confidencePenalty: number;

  /** Grammatical role of the prefix, recorded as metadata. */
  readonly prefixType?: 'article' | 'conjunction' | 'preposition' | 'verb-marker';

  /** Fewest characters that must remain after stripping, to prevent over-stripping. */
  readonly minRemaining?: number;
}
191
+
192
/**
 * Build the result for a word that was left as-is: the stem is the word
 * itself, the confidence is 1.0, and no metadata is attached.
 */
export function noChange(word: string): NormalizationResult {
  const untouched: NormalizationResult = { stem: word, confidence: 1.0 };
  return untouched;
}
198
+
199
/**
 * Build a normalization result from a stem and a confidence value.
 * The `metadata` key is included only when a metadata object is passed, so
 * results created without metadata carry no such property at all.
 */
export function normalized(
  stem: string,
  confidence: number,
  metadata?: NormalizationMetadata
): NormalizationResult {
  return metadata ? { stem, confidence, metadata } : { stem, confidence };
}