@lokascript/semantic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +686 -0
  3. package/dist/browser-ar.ar.global.js +2 -0
  4. package/dist/browser-core.core.global.js +2 -0
  5. package/dist/browser-de.de.global.js +2 -0
  6. package/dist/browser-east-asian.east-asian.global.js +2 -0
  7. package/dist/browser-en-tr.en-tr.global.js +2 -0
  8. package/dist/browser-en.en.global.js +2 -0
  9. package/dist/browser-es-en.es-en.global.js +2 -0
  10. package/dist/browser-es.es.global.js +2 -0
  11. package/dist/browser-fr.fr.global.js +2 -0
  12. package/dist/browser-id.id.global.js +2 -0
  13. package/dist/browser-ja.ja.global.js +2 -0
  14. package/dist/browser-ko.ko.global.js +2 -0
  15. package/dist/browser-lazy.lazy.global.js +2 -0
  16. package/dist/browser-priority.priority.global.js +2 -0
  17. package/dist/browser-pt.pt.global.js +2 -0
  18. package/dist/browser-qu.qu.global.js +2 -0
  19. package/dist/browser-sw.sw.global.js +2 -0
  20. package/dist/browser-tr.tr.global.js +2 -0
  21. package/dist/browser-western.western.global.js +2 -0
  22. package/dist/browser-zh.zh.global.js +2 -0
  23. package/dist/browser.global.js +3 -0
  24. package/dist/browser.global.js.map +1 -0
  25. package/dist/index.cjs +35051 -0
  26. package/dist/index.cjs.map +1 -0
  27. package/dist/index.d.cts +3426 -0
  28. package/dist/index.d.ts +3426 -0
  29. package/dist/index.js +34890 -0
  30. package/dist/index.js.map +1 -0
  31. package/dist/languages/ar.d.ts +78 -0
  32. package/dist/languages/ar.js +1622 -0
  33. package/dist/languages/ar.js.map +1 -0
  34. package/dist/languages/de.d.ts +38 -0
  35. package/dist/languages/de.js +1168 -0
  36. package/dist/languages/de.js.map +1 -0
  37. package/dist/languages/en.d.ts +44 -0
  38. package/dist/languages/en.js +3491 -0
  39. package/dist/languages/en.js.map +1 -0
  40. package/dist/languages/es.d.ts +52 -0
  41. package/dist/languages/es.js +1493 -0
  42. package/dist/languages/es.js.map +1 -0
  43. package/dist/languages/fr.d.ts +37 -0
  44. package/dist/languages/fr.js +1159 -0
  45. package/dist/languages/fr.js.map +1 -0
  46. package/dist/languages/id.d.ts +35 -0
  47. package/dist/languages/id.js +1152 -0
  48. package/dist/languages/id.js.map +1 -0
  49. package/dist/languages/ja.d.ts +53 -0
  50. package/dist/languages/ja.js +1430 -0
  51. package/dist/languages/ja.js.map +1 -0
  52. package/dist/languages/ko.d.ts +51 -0
  53. package/dist/languages/ko.js +1729 -0
  54. package/dist/languages/ko.js.map +1 -0
  55. package/dist/languages/pt.d.ts +37 -0
  56. package/dist/languages/pt.js +1127 -0
  57. package/dist/languages/pt.js.map +1 -0
  58. package/dist/languages/qu.d.ts +36 -0
  59. package/dist/languages/qu.js +1143 -0
  60. package/dist/languages/qu.js.map +1 -0
  61. package/dist/languages/sw.d.ts +35 -0
  62. package/dist/languages/sw.js +1147 -0
  63. package/dist/languages/sw.js.map +1 -0
  64. package/dist/languages/tr.d.ts +45 -0
  65. package/dist/languages/tr.js +1529 -0
  66. package/dist/languages/tr.js.map +1 -0
  67. package/dist/languages/zh.d.ts +58 -0
  68. package/dist/languages/zh.js +1257 -0
  69. package/dist/languages/zh.js.map +1 -0
  70. package/dist/types-C4dcj53L.d.ts +600 -0
  71. package/package.json +202 -0
  72. package/src/__test-utils__/index.ts +7 -0
  73. package/src/__test-utils__/test-helpers.ts +8 -0
  74. package/src/__types__/test-helpers.ts +122 -0
  75. package/src/analysis/index.ts +479 -0
  76. package/src/ast-builder/command-mappers.ts +1133 -0
  77. package/src/ast-builder/expression-parser/index.ts +41 -0
  78. package/src/ast-builder/expression-parser/parser.ts +563 -0
  79. package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
  80. package/src/ast-builder/expression-parser/types.ts +208 -0
  81. package/src/ast-builder/index.ts +536 -0
  82. package/src/ast-builder/value-converters.ts +172 -0
  83. package/src/bridge.ts +275 -0
  84. package/src/browser-ar.ts +162 -0
  85. package/src/browser-core.ts +231 -0
  86. package/src/browser-de.ts +162 -0
  87. package/src/browser-east-asian.ts +173 -0
  88. package/src/browser-en-tr.ts +165 -0
  89. package/src/browser-en.ts +157 -0
  90. package/src/browser-es-en.ts +200 -0
  91. package/src/browser-es.ts +170 -0
  92. package/src/browser-fr.ts +162 -0
  93. package/src/browser-id.ts +162 -0
  94. package/src/browser-ja.ts +162 -0
  95. package/src/browser-ko.ts +162 -0
  96. package/src/browser-lazy.ts +189 -0
  97. package/src/browser-priority.ts +214 -0
  98. package/src/browser-pt.ts +162 -0
  99. package/src/browser-qu.ts +162 -0
  100. package/src/browser-sw.ts +162 -0
  101. package/src/browser-tr.ts +162 -0
  102. package/src/browser-western.ts +181 -0
  103. package/src/browser-zh.ts +162 -0
  104. package/src/browser.ts +268 -0
  105. package/src/cache/index.ts +14 -0
  106. package/src/cache/semantic-cache.ts +344 -0
  107. package/src/core-bridge.ts +372 -0
  108. package/src/explicit/converter.ts +258 -0
  109. package/src/explicit/index.ts +18 -0
  110. package/src/explicit/parser.ts +236 -0
  111. package/src/explicit/renderer.ts +424 -0
  112. package/src/generators/command-schemas.ts +1636 -0
  113. package/src/generators/event-handler-generator.ts +109 -0
  114. package/src/generators/index.ts +117 -0
  115. package/src/generators/language-profiles.ts +139 -0
  116. package/src/generators/pattern-generator.ts +537 -0
  117. package/src/generators/profiles/arabic.ts +131 -0
  118. package/src/generators/profiles/bengali.ts +132 -0
  119. package/src/generators/profiles/chinese.ts +124 -0
  120. package/src/generators/profiles/english.ts +113 -0
  121. package/src/generators/profiles/french.ts +125 -0
  122. package/src/generators/profiles/german.ts +126 -0
  123. package/src/generators/profiles/hindi.ts +146 -0
  124. package/src/generators/profiles/index.ts +46 -0
  125. package/src/generators/profiles/indonesian.ts +125 -0
  126. package/src/generators/profiles/italian.ts +139 -0
  127. package/src/generators/profiles/japanese.ts +149 -0
  128. package/src/generators/profiles/korean.ts +127 -0
  129. package/src/generators/profiles/marker-templates.ts +288 -0
  130. package/src/generators/profiles/ms.ts +130 -0
  131. package/src/generators/profiles/polish.ts +249 -0
  132. package/src/generators/profiles/portuguese.ts +115 -0
  133. package/src/generators/profiles/quechua.ts +113 -0
  134. package/src/generators/profiles/russian.ts +260 -0
  135. package/src/generators/profiles/spanish.ts +130 -0
  136. package/src/generators/profiles/swahili.ts +129 -0
  137. package/src/generators/profiles/thai.ts +132 -0
  138. package/src/generators/profiles/tl.ts +128 -0
  139. package/src/generators/profiles/turkish.ts +124 -0
  140. package/src/generators/profiles/types.ts +165 -0
  141. package/src/generators/profiles/ukrainian.ts +270 -0
  142. package/src/generators/profiles/vietnamese.ts +133 -0
  143. package/src/generators/schema-error-codes.ts +160 -0
  144. package/src/generators/schema-validator.ts +391 -0
  145. package/src/index.ts +429 -0
  146. package/src/language-building-schema.ts +3170 -0
  147. package/src/language-loader.ts +394 -0
  148. package/src/languages/_all.ts +65 -0
  149. package/src/languages/ar.ts +15 -0
  150. package/src/languages/bn.ts +16 -0
  151. package/src/languages/de.ts +15 -0
  152. package/src/languages/en.ts +29 -0
  153. package/src/languages/es.ts +15 -0
  154. package/src/languages/fr.ts +15 -0
  155. package/src/languages/hi.ts +26 -0
  156. package/src/languages/id.ts +15 -0
  157. package/src/languages/index.ts +18 -0
  158. package/src/languages/it.ts +15 -0
  159. package/src/languages/ja.ts +15 -0
  160. package/src/languages/ko.ts +15 -0
  161. package/src/languages/ms.ts +16 -0
  162. package/src/languages/pl.ts +18 -0
  163. package/src/languages/pt.ts +15 -0
  164. package/src/languages/qu.ts +15 -0
  165. package/src/languages/ru.ts +26 -0
  166. package/src/languages/sw.ts +15 -0
  167. package/src/languages/th.ts +16 -0
  168. package/src/languages/tl.ts +16 -0
  169. package/src/languages/tr.ts +15 -0
  170. package/src/languages/uk.ts +26 -0
  171. package/src/languages/vi.ts +16 -0
  172. package/src/languages/zh.ts +15 -0
  173. package/src/parser/index.ts +15 -0
  174. package/src/parser/pattern-matcher.ts +1181 -0
  175. package/src/parser/semantic-parser.ts +573 -0
  176. package/src/parser/utils/index.ts +35 -0
  177. package/src/parser/utils/marker-resolution.ts +111 -0
  178. package/src/parser/utils/possessive-keywords.ts +43 -0
  179. package/src/parser/utils/role-positioning.ts +70 -0
  180. package/src/parser/utils/type-validation.ts +134 -0
  181. package/src/patterns/add/ar.ts +71 -0
  182. package/src/patterns/add/bn.ts +70 -0
  183. package/src/patterns/add/hi.ts +69 -0
  184. package/src/patterns/add/index.ts +87 -0
  185. package/src/patterns/add/it.ts +61 -0
  186. package/src/patterns/add/ja.ts +93 -0
  187. package/src/patterns/add/ko.ts +74 -0
  188. package/src/patterns/add/ms.ts +30 -0
  189. package/src/patterns/add/pl.ts +62 -0
  190. package/src/patterns/add/ru.ts +62 -0
  191. package/src/patterns/add/th.ts +49 -0
  192. package/src/patterns/add/tl.ts +30 -0
  193. package/src/patterns/add/tr.ts +71 -0
  194. package/src/patterns/add/uk.ts +62 -0
  195. package/src/patterns/add/vi.ts +61 -0
  196. package/src/patterns/add/zh.ts +71 -0
  197. package/src/patterns/builders.ts +207 -0
  198. package/src/patterns/decrement/bn.ts +70 -0
  199. package/src/patterns/decrement/de.ts +42 -0
  200. package/src/patterns/decrement/hi.ts +68 -0
  201. package/src/patterns/decrement/index.ts +79 -0
  202. package/src/patterns/decrement/it.ts +69 -0
  203. package/src/patterns/decrement/ms.ts +30 -0
  204. package/src/patterns/decrement/pl.ts +58 -0
  205. package/src/patterns/decrement/ru.ts +58 -0
  206. package/src/patterns/decrement/th.ts +49 -0
  207. package/src/patterns/decrement/tl.ts +30 -0
  208. package/src/patterns/decrement/tr.ts +48 -0
  209. package/src/patterns/decrement/uk.ts +58 -0
  210. package/src/patterns/decrement/vi.ts +61 -0
  211. package/src/patterns/decrement/zh.ts +32 -0
  212. package/src/patterns/en.ts +302 -0
  213. package/src/patterns/event-handler/ar.ts +151 -0
  214. package/src/patterns/event-handler/bn.ts +72 -0
  215. package/src/patterns/event-handler/de.ts +117 -0
  216. package/src/patterns/event-handler/en.ts +117 -0
  217. package/src/patterns/event-handler/es.ts +136 -0
  218. package/src/patterns/event-handler/fr.ts +117 -0
  219. package/src/patterns/event-handler/hi.ts +64 -0
  220. package/src/patterns/event-handler/id.ts +117 -0
  221. package/src/patterns/event-handler/index.ts +119 -0
  222. package/src/patterns/event-handler/it.ts +54 -0
  223. package/src/patterns/event-handler/ja.ts +118 -0
  224. package/src/patterns/event-handler/ko.ts +133 -0
  225. package/src/patterns/event-handler/ms.ts +30 -0
  226. package/src/patterns/event-handler/pl.ts +62 -0
  227. package/src/patterns/event-handler/pt.ts +117 -0
  228. package/src/patterns/event-handler/qu.ts +66 -0
  229. package/src/patterns/event-handler/ru.ts +62 -0
  230. package/src/patterns/event-handler/shared.ts +270 -0
  231. package/src/patterns/event-handler/sw.ts +117 -0
  232. package/src/patterns/event-handler/th.ts +53 -0
  233. package/src/patterns/event-handler/tl.ts +30 -0
  234. package/src/patterns/event-handler/tr.ts +170 -0
  235. package/src/patterns/event-handler/uk.ts +62 -0
  236. package/src/patterns/event-handler/vi.ts +61 -0
  237. package/src/patterns/event-handler/zh.ts +150 -0
  238. package/src/patterns/get/ar.ts +49 -0
  239. package/src/patterns/get/bn.ts +47 -0
  240. package/src/patterns/get/de.ts +32 -0
  241. package/src/patterns/get/hi.ts +52 -0
  242. package/src/patterns/get/index.ts +83 -0
  243. package/src/patterns/get/it.ts +56 -0
  244. package/src/patterns/get/ja.ts +53 -0
  245. package/src/patterns/get/ko.ts +53 -0
  246. package/src/patterns/get/ms.ts +30 -0
  247. package/src/patterns/get/pl.ts +57 -0
  248. package/src/patterns/get/ru.ts +57 -0
  249. package/src/patterns/get/th.ts +29 -0
  250. package/src/patterns/get/tl.ts +30 -0
  251. package/src/patterns/get/uk.ts +57 -0
  252. package/src/patterns/get/vi.ts +48 -0
  253. package/src/patterns/grammar-transformed/index.ts +39 -0
  254. package/src/patterns/grammar-transformed/ja.ts +1713 -0
  255. package/src/patterns/grammar-transformed/ko.ts +1311 -0
  256. package/src/patterns/grammar-transformed/tr.ts +1067 -0
  257. package/src/patterns/hide/ar.ts +67 -0
  258. package/src/patterns/hide/bn.ts +47 -0
  259. package/src/patterns/hide/de.ts +36 -0
  260. package/src/patterns/hide/hi.ts +61 -0
  261. package/src/patterns/hide/index.ts +91 -0
  262. package/src/patterns/hide/it.ts +56 -0
  263. package/src/patterns/hide/ja.ts +69 -0
  264. package/src/patterns/hide/ko.ts +69 -0
  265. package/src/patterns/hide/ms.ts +30 -0
  266. package/src/patterns/hide/pl.ts +57 -0
  267. package/src/patterns/hide/ru.ts +57 -0
  268. package/src/patterns/hide/th.ts +29 -0
  269. package/src/patterns/hide/tl.ts +30 -0
  270. package/src/patterns/hide/tr.ts +65 -0
  271. package/src/patterns/hide/uk.ts +57 -0
  272. package/src/patterns/hide/vi.ts +56 -0
  273. package/src/patterns/hide/zh.ts +68 -0
  274. package/src/patterns/increment/bn.ts +70 -0
  275. package/src/patterns/increment/de.ts +36 -0
  276. package/src/patterns/increment/hi.ts +68 -0
  277. package/src/patterns/increment/index.ts +79 -0
  278. package/src/patterns/increment/it.ts +69 -0
  279. package/src/patterns/increment/ms.ts +30 -0
  280. package/src/patterns/increment/pl.ts +58 -0
  281. package/src/patterns/increment/ru.ts +58 -0
  282. package/src/patterns/increment/th.ts +49 -0
  283. package/src/patterns/increment/tl.ts +30 -0
  284. package/src/patterns/increment/tr.ts +52 -0
  285. package/src/patterns/increment/uk.ts +58 -0
  286. package/src/patterns/increment/vi.ts +61 -0
  287. package/src/patterns/increment/zh.ts +32 -0
  288. package/src/patterns/index.ts +84 -0
  289. package/src/patterns/languages/en/control-flow.ts +93 -0
  290. package/src/patterns/languages/en/fetch.ts +62 -0
  291. package/src/patterns/languages/en/index.ts +42 -0
  292. package/src/patterns/languages/en/repeat.ts +67 -0
  293. package/src/patterns/languages/en/set.ts +48 -0
  294. package/src/patterns/languages/en/swap.ts +38 -0
  295. package/src/patterns/languages/en/temporal.ts +57 -0
  296. package/src/patterns/put/ar.ts +74 -0
  297. package/src/patterns/put/bn.ts +53 -0
  298. package/src/patterns/put/en.ts +74 -0
  299. package/src/patterns/put/es.ts +74 -0
  300. package/src/patterns/put/hi.ts +69 -0
  301. package/src/patterns/put/id.ts +96 -0
  302. package/src/patterns/put/index.ts +99 -0
  303. package/src/patterns/put/it.ts +56 -0
  304. package/src/patterns/put/ja.ts +75 -0
  305. package/src/patterns/put/ko.ts +67 -0
  306. package/src/patterns/put/ms.ts +30 -0
  307. package/src/patterns/put/pl.ts +81 -0
  308. package/src/patterns/put/ru.ts +85 -0
  309. package/src/patterns/put/th.ts +32 -0
  310. package/src/patterns/put/tl.ts +30 -0
  311. package/src/patterns/put/tr.ts +67 -0
  312. package/src/patterns/put/uk.ts +85 -0
  313. package/src/patterns/put/vi.ts +72 -0
  314. package/src/patterns/put/zh.ts +62 -0
  315. package/src/patterns/registry.ts +163 -0
  316. package/src/patterns/remove/ar.ts +71 -0
  317. package/src/patterns/remove/bn.ts +68 -0
  318. package/src/patterns/remove/hi.ts +69 -0
  319. package/src/patterns/remove/index.ts +87 -0
  320. package/src/patterns/remove/it.ts +69 -0
  321. package/src/patterns/remove/ja.ts +74 -0
  322. package/src/patterns/remove/ko.ts +78 -0
  323. package/src/patterns/remove/ms.ts +30 -0
  324. package/src/patterns/remove/pl.ts +62 -0
  325. package/src/patterns/remove/ru.ts +62 -0
  326. package/src/patterns/remove/th.ts +49 -0
  327. package/src/patterns/remove/tl.ts +30 -0
  328. package/src/patterns/remove/tr.ts +78 -0
  329. package/src/patterns/remove/uk.ts +62 -0
  330. package/src/patterns/remove/vi.ts +61 -0
  331. package/src/patterns/remove/zh.ts +72 -0
  332. package/src/patterns/set/ar.ts +84 -0
  333. package/src/patterns/set/bn.ts +53 -0
  334. package/src/patterns/set/de.ts +84 -0
  335. package/src/patterns/set/es.ts +92 -0
  336. package/src/patterns/set/fr.ts +88 -0
  337. package/src/patterns/set/hi.ts +56 -0
  338. package/src/patterns/set/id.ts +84 -0
  339. package/src/patterns/set/index.ts +107 -0
  340. package/src/patterns/set/it.ts +56 -0
  341. package/src/patterns/set/ja.ts +86 -0
  342. package/src/patterns/set/ko.ts +85 -0
  343. package/src/patterns/set/ms.ts +30 -0
  344. package/src/patterns/set/pl.ts +57 -0
  345. package/src/patterns/set/pt.ts +84 -0
  346. package/src/patterns/set/ru.ts +57 -0
  347. package/src/patterns/set/th.ts +31 -0
  348. package/src/patterns/set/tl.ts +30 -0
  349. package/src/patterns/set/tr.ts +107 -0
  350. package/src/patterns/set/uk.ts +57 -0
  351. package/src/patterns/set/vi.ts +53 -0
  352. package/src/patterns/set/zh.ts +84 -0
  353. package/src/patterns/show/ar.ts +67 -0
  354. package/src/patterns/show/bn.ts +47 -0
  355. package/src/patterns/show/de.ts +32 -0
  356. package/src/patterns/show/fr.ts +32 -0
  357. package/src/patterns/show/hi.ts +61 -0
  358. package/src/patterns/show/index.ts +95 -0
  359. package/src/patterns/show/it.ts +56 -0
  360. package/src/patterns/show/ja.ts +69 -0
  361. package/src/patterns/show/ko.ts +73 -0
  362. package/src/patterns/show/ms.ts +30 -0
  363. package/src/patterns/show/pl.ts +57 -0
  364. package/src/patterns/show/ru.ts +57 -0
  365. package/src/patterns/show/th.ts +29 -0
  366. package/src/patterns/show/tl.ts +30 -0
  367. package/src/patterns/show/tr.ts +65 -0
  368. package/src/patterns/show/uk.ts +57 -0
  369. package/src/patterns/show/vi.ts +56 -0
  370. package/src/patterns/show/zh.ts +68 -0
  371. package/src/patterns/take/ar.ts +51 -0
  372. package/src/patterns/take/index.ts +31 -0
  373. package/src/patterns/toggle/ar.ts +61 -0
  374. package/src/patterns/toggle/bn.ts +70 -0
  375. package/src/patterns/toggle/en.ts +61 -0
  376. package/src/patterns/toggle/es.ts +61 -0
  377. package/src/patterns/toggle/hi.ts +80 -0
  378. package/src/patterns/toggle/index.ts +95 -0
  379. package/src/patterns/toggle/it.ts +69 -0
  380. package/src/patterns/toggle/ja.ts +156 -0
  381. package/src/patterns/toggle/ko.ts +113 -0
  382. package/src/patterns/toggle/ms.ts +30 -0
  383. package/src/patterns/toggle/pl.ts +62 -0
  384. package/src/patterns/toggle/ru.ts +62 -0
  385. package/src/patterns/toggle/th.ts +50 -0
  386. package/src/patterns/toggle/tl.ts +30 -0
  387. package/src/patterns/toggle/tr.ts +88 -0
  388. package/src/patterns/toggle/uk.ts +62 -0
  389. package/src/patterns/toggle/vi.ts +61 -0
  390. package/src/patterns/toggle/zh.ts +99 -0
  391. package/src/public-api.ts +286 -0
  392. package/src/registry.ts +441 -0
  393. package/src/tokenizers/arabic.ts +723 -0
  394. package/src/tokenizers/base.ts +1300 -0
  395. package/src/tokenizers/bengali.ts +289 -0
  396. package/src/tokenizers/chinese.ts +481 -0
  397. package/src/tokenizers/english.ts +416 -0
  398. package/src/tokenizers/french.ts +326 -0
  399. package/src/tokenizers/german.ts +324 -0
  400. package/src/tokenizers/hindi.ts +319 -0
  401. package/src/tokenizers/index.ts +127 -0
  402. package/src/tokenizers/indonesian.ts +306 -0
  403. package/src/tokenizers/italian.ts +458 -0
  404. package/src/tokenizers/japanese.ts +447 -0
  405. package/src/tokenizers/korean.ts +642 -0
  406. package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
  407. package/src/tokenizers/morphology/french-normalizer.ts +268 -0
  408. package/src/tokenizers/morphology/german-normalizer.ts +256 -0
  409. package/src/tokenizers/morphology/index.ts +46 -0
  410. package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
  411. package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
  412. package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
  413. package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
  414. package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
  415. package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
  416. package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
  417. package/src/tokenizers/morphology/types.ts +211 -0
  418. package/src/tokenizers/ms.ts +198 -0
  419. package/src/tokenizers/polish.ts +354 -0
  420. package/src/tokenizers/portuguese.ts +304 -0
  421. package/src/tokenizers/quechua.ts +339 -0
  422. package/src/tokenizers/russian.ts +375 -0
  423. package/src/tokenizers/spanish.ts +403 -0
  424. package/src/tokenizers/swahili.ts +303 -0
  425. package/src/tokenizers/thai.ts +236 -0
  426. package/src/tokenizers/tl.ts +198 -0
  427. package/src/tokenizers/turkish.ts +411 -0
  428. package/src/tokenizers/ukrainian.ts +369 -0
  429. package/src/tokenizers/vietnamese.ts +410 -0
  430. package/src/types/grammar-types.ts +617 -0
  431. package/src/types/unified-profile.ts +267 -0
  432. package/src/types.ts +709 -0
  433. package/src/utils/confidence-calculator.ts +147 -0
  434. package/src/validators/command-validator.ts +380 -0
  435. package/src/validators/index.ts +15 -0
@@ -0,0 +1,1729 @@
1
+ // src/registry.ts
2
/** Registered tokenizers, keyed by language code. */
var tokenizers = /* @__PURE__ */ new Map();
/** Registered language profiles, keyed by language code. */
var profiles = /* @__PURE__ */ new Map();
/** Cache keyed by language code; an entry is dropped whenever that language is (re)registered. */
var patternCache = /* @__PURE__ */ new Map();

/**
 * Register (or replace) the tokenizer and profile for a language.
 *
 * @param code - Language code used as the key in every registry map.
 * @param tokenizer - Tokenizer stored for that language.
 * @param profile - Profile stored for that language.
 */
function registerLanguage(code, tokenizer, profile) {
  // Whatever was cached for this language may no longer match the new
  // tokenizer/profile pair, so forget it.
  patternCache.delete(code);
  profiles.set(code, profile);
  tokenizers.set(code, tokenizer);
}
10
+
11
+ // src/tokenizers/base.ts
12
/**
 * Cursor over an array of tokens with lookahead and mark/reset backtracking.
 */
var TokenStreamImpl = class {
  /**
   * @param tokens - Tokens to walk over (the array is referenced, not copied).
   * @param language - Language value stored on the stream as `language`.
   */
  constructor(tokens, language) {
    this.pos = 0;
    this.tokens = tokens;
    this.language = language;
  }
  /**
   * Look at a token relative to the cursor without consuming it.
   * @returns The token at `cursor + offset`, or null when that index is out of range.
   */
  peek(offset = 0) {
    const target = this.pos + offset;
    const outOfRange = target < 0 || target >= this.tokens.length;
    return outOfRange ? null : this.tokens[target];
  }
  /**
   * Consume and return the token under the cursor.
   * @throws {Error} When the stream is already exhausted.
   */
  advance() {
    if (this.isAtEnd()) {
      throw new Error("Unexpected end of token stream");
    }
    const current = this.tokens[this.pos];
    this.pos++;
    return current;
  }
  /** True once the cursor has moved past the last token. */
  isAtEnd() {
    const { pos, tokens } = this;
    return pos >= tokens.length;
  }
  /** Snapshot the cursor so it can be restored with reset(). */
  mark() {
    const snapshot = { position: this.pos };
    return snapshot;
  }
  /** Move the cursor back (or forward) to a snapshot taken with mark(). */
  reset(mark) {
    const { position } = mark;
    this.pos = position;
  }
  /** Current cursor index. */
  position() {
    return this.pos;
  }
  /**
   * Copy of the tokens that have not been consumed yet.
   */
  remaining() {
    return this.tokens.slice(this.pos);
  }
  /**
   * Consume tokens for as long as the predicate accepts the next one,
   * returning the consumed tokens in order.
   */
  takeWhile(predicate) {
    const taken = [];
    while (true) {
      if (this.isAtEnd()) break;
      if (!predicate(this.peek())) break;
      taken.push(this.advance());
    }
    return taken;
  }
  /**
   * Discard tokens for as long as the predicate accepts the next one.
   */
  skipWhile(predicate) {
    while (true) {
      if (this.isAtEnd()) return;
      if (!predicate(this.peek())) return;
      this.advance();
    }
  }
};
68
/**
 * Build a position record from a start and an end offset.
 *
 * @param start - Value stored as `start`.
 * @param end - Value stored as `end`.
 * @returns A fresh `{ start, end }` object.
 */
function createPosition(start, end) {
  const span = { start, end };
  return span;
}
71
/**
 * Build a token object.
 *
 * The fourth argument is either the normalized form as a plain string, or an
 * options object with optional `normalized`, `stem` and `stemConfidence`
 * fields. Only fields that are actually provided end up on the token, and
 * `stemConfidence` is copied only when `stem` is present.
 *
 * @param value - Token text.
 * @param kind - Token kind.
 * @param position - Position record for the token.
 * @param normalizedOrOptions - Normalized string, options object, or nothing.
 * @returns The token object.
 */
function createToken(value, kind, position, normalizedOrOptions) {
  const token = { value, kind, position };

  // String shorthand: the argument is the normalized form itself.
  if (typeof normalizedOrOptions === "string") {
    token.normalized = normalizedOrOptions;
    return token;
  }
  if (!normalizedOrOptions) {
    return token;
  }

  const {
    normalized: normalizedForm,
    stem: stemForm,
    stemConfidence: confidence
  } = normalizedOrOptions;
  if (normalizedForm !== undefined) {
    token.normalized = normalizedForm;
  }
  if (stemForm !== undefined) {
    token.stem = stemForm;
    // A confidence value is meaningless without the stem it refers to.
    if (confidence !== undefined) {
      token.stemConfidence = confidence;
    }
  }
  return token;
}
91
/**
 * Report whether the given character is whitespace (regex `\s`).
 *
 * @param char - Character to classify.
 * @returns True when `char` contains a whitespace character.
 */
function isWhitespace(char) {
  const whitespacePattern = /\s/;
  return whitespacePattern.test(char);
}
94
/**
 * Report whether a character can open a selector token:
 * one of `#`, `.`, `[`, `@`, `*` or `<`.
 *
 * @param char - Character to classify.
 * @returns True only for an exact match with one of those characters.
 */
function isSelectorStart(char) {
  switch (char) {
    case "#":
    case ".":
    case "[":
    case "@":
    case "*":
    case "<":
      return true;
    default:
      return false;
  }
}
97
/**
 * Report whether a character is one of the recognised quote characters:
 * double quote, single quote, backtick, or the corner brackets
 * U+300C / U+300D.
 *
 * @param char - Character to classify.
 * @returns True only for an exact match with one of those characters.
 */
function isQuote(char) {
  const quoteChars = ['"', "'", "`", "\u300C", "\u300D"];
  return quoteChars.includes(char);
}
100
/**
 * Report whether the given character is an ASCII decimal digit.
 *
 * @param char - Character to classify.
 * @returns True when `char` contains a digit 0-9.
 */
function isDigit(char) {
  const digitPattern = /[0-9]/;
  return digitPattern.test(char);
}
103
/**
 * Report whether the given character is an ASCII letter (A-Z or a-z).
 *
 * @param char - Character to classify.
 * @returns True when `char` contains an ASCII letter.
 */
function isAsciiLetter(char) {
  const asciiLetterPattern = /[A-Za-z]/;
  return asciiLetterPattern.test(char);
}
106
/**
 * Report whether the given character may appear inside an ASCII identifier:
 * a letter, a digit, an underscore or a hyphen.
 *
 * @param char - Character to classify.
 * @returns True when `char` contains such a character.
 */
function isAsciiIdentifierChar(char) {
  // Without the i/u flags, \w is exactly [A-Za-z0-9_].
  const identifierCharPattern = /[\w-]/;
  return identifierCharPattern.test(char);
}
109
/**
 * Build a classifier that reports whether a character falls inside any of the
 * given inclusive code-point ranges.
 *
 * The first *code point* of the argument is inspected (not the first UTF-16
 * code unit), so ranges above U+FFFF match when the caller passes the whole
 * character. For single UTF-16 units the result is the same as a
 * `charCodeAt(0)` lookup.
 *
 * @param ranges - Array of `[start, end]` pairs, both ends inclusive.
 * @returns A function `(char) => boolean`; it returns false for an empty string.
 */
function createUnicodeRangeClassifier(ranges) {
  return (char) => {
    const code = char.codePointAt(0);
    if (code === undefined) {
      // Empty string: nothing to classify.
      return false;
    }
    return ranges.some(([start, end]) => code >= start && code <= end);
  };
}
115
/**
 * Combine several character classifiers into one that accepts a character as
 * soon as any of them does. Classifiers are tried in the order given and
 * evaluation stops at the first one that accepts.
 *
 * @param classifiers - Functions of the form `(char) => boolean`.
 * @returns A function `(char) => boolean`; always false when no classifiers are given.
 */
function combineClassifiers(...classifiers) {
  return (char) => {
    for (const classify of classifiers) {
      if (classify(char)) {
        return true;
      }
    }
    return false;
  };
}
118
/**
 * Extract a selector-like token that begins at `startPos`.
 *
 * Recognised shapes, chosen by the first character:
 *   - `#name`, `.name`, `@name`, `*name`: the lead character followed by at
 *     least one ASCII identifier character;
 *   - `[ ... ]`: a bracket group, with nested brackets, quoted sections
 *     (double quote, single quote, backtick) and backslash escapes honoured;
 *   - `<tag ...>`: a tag name followed by any number of `.name`, `#name` and
 *     `[ ... ]` parts, optional whitespace, an optional `/`, and a closing `>`.
 *
 * @param input - Text being tokenized.
 * @param startPos - Index at which the selector should start.
 * @returns The matched selector text, or null when nothing valid starts there.
 */
function extractCssSelector(input, startPos) {
  if (startPos >= input.length) return null;
  const lead = input[startPos];
  if (!isSelectorStart(lead)) return null;

  const total = input.length;

  // Index of the first character at or after `from` that is not an identifier character.
  const skipIdentifier = (from) => {
    let i = from;
    while (i < total && isAsciiIdentifierChar(input[i])) {
      i++;
    }
    return i;
  };

  // `from` is the index of a "[". Returns the index just past its matching
  // "]", or -1 when the group is never closed.
  const skipBracketGroup = (from) => {
    let i = from + 1;
    let depth = 1;
    let quote = null;
    let escaped = false;
    while (i < total && depth > 0) {
      const c = input[i];
      if (escaped) {
        // Character after a backslash is taken literally.
        escaped = false;
      } else if (c === "\\") {
        escaped = true;
      } else if (quote !== null) {
        if (c === quote) quote = null;
      } else if (c === '"' || c === "'" || c === "`") {
        quote = c;
      } else if (c === "[") {
        depth++;
      } else if (c === "]") {
        depth--;
      }
      i++;
    }
    return depth === 0 ? i : -1;
  };

  let end = startPos;
  switch (lead) {
    case "#":
    case ".":
    case "@":
    case "*":
      // The identifier scan never consumes ".", so for input such as
      // `#id.method(` the selector naturally ends after the id.
      end = skipIdentifier(startPos + 1);
      if (end - startPos <= 1) return null;
      break;

    case "[":
      end = skipBracketGroup(startPos);
      if (end === -1) return null;
      break;

    case "<": {
      let i = startPos + 1;
      // A tag name must start with a letter.
      if (i >= total || !isAsciiLetter(input[i])) return null;
      i = skipIdentifier(i);

      // Any number of .class / #id / [attr] parts.
      while (i < total) {
        const part = input[i];
        if (part === "." || part === "#") {
          i++;
          if (i >= total || !isAsciiIdentifierChar(input[i])) return null;
          i = skipIdentifier(i);
        } else if (part === "[") {
          i = skipBracketGroup(i);
          if (i === -1) return null;
        } else {
          break;
        }
      }

      while (i < total && isWhitespace(input[i])) i++;
      if (i < total && input[i] === "/") {
        i++;
        while (i < total && isWhitespace(input[i])) i++;
      }
      if (i >= total || input[i] !== ">") return null;
      end = i + 1;
      break;
    }

    default:
      break;
  }

  return input.slice(startPos, end) || null;
}
256
/**
 * Report whether the apostrophe at `pos` begins a possessive `'s` / `'S`
 * rather than a quoted string.
 *
 * It counts as possessive when the apostrophe is followed by `s` (either
 * case) and that `s` is either the last character of the input or is followed
 * by something that is not an ASCII identifier character.
 *
 * @param input - Text being tokenized.
 * @param pos - Index of the candidate apostrophe.
 * @returns True when a possessive marker starts at `pos`.
 */
function isPossessiveMarker(input, pos) {
  const hasApostrophe = pos < input.length && input[pos] === "'";
  if (!hasApostrophe || pos + 1 >= input.length) return false;

  if (input[pos + 1].toLowerCase() !== "s") return false;

  // "'s" at the very end of the input.
  if (pos + 2 >= input.length) return true;

  // Whitespace and "*" are not identifier characters either, so this single
  // check covers every terminator.
  return !isAsciiIdentifierChar(input[pos + 2]);
}
265
/**
 * Extract a quoted string literal that begins at `startPos`.
 *
 * Opening quotes `"`, `'` and backtick close with the same character; the
 * opening corner bracket U+300C closes with U+300D. A backslash escapes the
 * character after it. A single quote that is really a possessive marker is
 * not treated as a string.
 *
 * @param input - Text being tokenized.
 * @param startPos - Index of the candidate opening quote.
 * @returns The literal including both quotes; the rest of the input when the
 *   closing quote is missing; or null when no literal starts at `startPos`.
 */
function extractStringLiteral(input, startPos) {
  if (startPos >= input.length) return null;

  const opener = input[startPos];
  if (!isQuote(opener)) return null;
  if (opener === "'" && isPossessiveMarker(input, startPos)) return null;

  let closer;
  switch (opener) {
    case '"':
    case "'":
    case "`":
      closer = opener;
      break;
    case "\u300C":
      closer = "\u300D";
      break;
    default:
      // A quote character that cannot open a literal (e.g. a closing corner bracket).
      return null;
  }

  let escaped = false;
  for (let i = startPos + 1; i < input.length; i++) {
    const c = input[i];
    if (escaped) {
      escaped = false;
      continue;
    }
    if (c === "\\") {
      escaped = true;
      continue;
    }
    if (c === closer) {
      return input.slice(startPos, i + 1);
    }
  }

  // Unterminated: hand back everything from the opening quote onwards.
  return input.slice(startPos);
}
297
/**
 * Report whether a URL or path appears to start at `pos`.
 *
 * Accepted openings:
 *   - `/x`   where x is a letter, digit, `.`, `_` or `-`;
 *   - `//x`  where x is a letter;
 *   - `./`   or `../`;
 *   - `http://` or `https://` in any letter case.
 *
 * @param input - Text being tokenized.
 * @param pos - Index to inspect.
 * @returns True when one of the openings above is found at `pos`.
 */
function isUrlStart(input, pos) {
  if (pos >= input.length) return false;

  const first = input[pos];
  const second = input[pos + 1] || "";
  const third = input[pos + 2] || "";

  if (first === "/") {
    if (second === "/") {
      if (/[a-zA-Z]/.test(third)) return true;
    } else if (/[a-zA-Z0-9._-]/.test(second)) {
      return true;
    }
  } else if (first === ".") {
    const currentDir = second === "/";
    const parentDir = second === "." && third === "/";
    if (currentDir || parentDir) return true;
  }

  // "https://" is eight characters long, so eight is enough for either scheme.
  const scheme = input.slice(pos, pos + 8).toLowerCase();
  return scheme.startsWith("http://") || scheme.startsWith("https://");
}
317
/**
 * Extract a URL or path that begins at `startPos`.
 *
 * Characters are consumed while they belong to the allowed URL character
 * set. A `#` ends the URL; it is kept, together with the identifier-like
 * fragment after it, only when the URL so far ends in a letter, a digit, `/`
 * or `.`.
 *
 * @param input - Text being tokenized.
 * @param startPos - Index at which the URL should start.
 * @returns The URL text, or null when no URL starts there or the match is
 *   shorter than two characters.
 */
function extractUrl(input, startPos) {
  if (!isUrlStart(input, startPos)) return null;

  const urlCharPattern = /[a-zA-Z0-9/:._\-?&=%@+~!$'()*,;[\]]/;
  const fragmentCharPattern = /[a-zA-Z0-9_-]/;
  const total = input.length;

  // Main body: "#" is not part of the URL character set, so the scan stops on it.
  let end = startPos;
  while (end < total && input[end] !== "#" && urlCharPattern.test(input[end])) {
    end++;
  }

  // Optional fragment.
  const stoppedOnHash = end < total && input[end] === "#";
  if (stoppedOnHash && end > startPos && /[a-zA-Z0-9/.]/.test(input[end - 1])) {
    end++;
    while (end < total && fragmentCharPattern.test(input[end])) {
      end++;
    }
  }

  const url = input.slice(startPos, end);
  return url.length < 2 ? null : url;
}
344
/**
 * Extract a numeric literal that begins at `startPos`.
 *
 * Grammar: optional sign (`+` / `-`), one or more digits, an optional
 * fractional part (`.` followed by one or more digits), and an optional time
 * suffix (`ms`, `s`, `m` or `h`).
 *
 * A `.` is consumed only when a digit follows it, so a trailing dot as in
 * `5.` or `5.foo` is left in the input for the next token instead of being
 * swallowed into the number.
 *
 * @param input - Text being tokenized.
 * @param startPos - Index at which the number should start.
 * @returns The matched text (including any sign and suffix), or null when no
 *   number starts at `startPos`.
 */
function extractNumber(input, startPos) {
  if (startPos >= input.length) return null;

  let pos = startPos;
  if (input[pos] === "-" || input[pos] === "+") {
    pos++;
  }
  // A sign on its own, or any other non-digit, is not a number.
  if (pos >= input.length || !isDigit(input[pos])) {
    return null;
  }
  while (pos < input.length && isDigit(input[pos])) {
    pos++;
  }

  // Fractional part: requires at least one digit after the dot.
  const hasFraction =
    pos + 1 < input.length && input[pos] === "." && isDigit(input[pos + 1]);
  if (hasFraction) {
    pos++;
    while (pos < input.length && isDigit(input[pos])) {
      pos++;
    }
  }

  let number = input.slice(startPos, pos);

  // Time suffix: "ms" is checked first so it is not read as minutes.
  if (pos < input.length) {
    if (input.startsWith("ms", pos)) {
      number += "ms";
    } else if (input[pos] === "s" || input[pos] === "m" || input[pos] === "h") {
      number += input[pos];
    }
  }
  return number;
}
375
/**
 * Shared base for the language tokenizers.
 *
 * Holds the profile-derived keyword tables and the `try*` helpers that each
 * attempt to recognise one token kind at a given position and return either a
 * token or null.
 */
var _BaseTokenizer = class _BaseTokenizer {
  constructor() {
    /** Keywords derived from profile, sorted longest-first for greedy matching */
    this.profileKeywords = [];
    /** Map for O(1) keyword lookups by lowercase native word */
    this.profileKeywordMap = /* @__PURE__ */ new Map();
  }
  /**
   * Initialize keyword mappings from a language profile.
   * Builds a list of native→english mappings from:
   * - profile.keywords (primary + alternatives)
   * - profile.references (me, it, you, etc.)
   * - profile.roleMarkers (into, from, with, etc.)
   *
   * Results are sorted longest-first for greedy matching (important for non-space languages).
   * Extras take precedence over profile entries when there are duplicates.
   *
   * Entries whose native form is empty or missing are ignored for every
   * source (not only role markers): a zero-length keyword would match at every
   * position in tryProfileKeyword, and a missing one would break the length sort.
   *
   * @param profile - Language profile containing keyword translations
   * @param extras - Additional keyword entries to include (literals, positional, events)
   */
  initializeKeywordsFromProfile(profile, extras = []) {
    const keywordMap = /* @__PURE__ */ new Map();
    const register = (native, normalizedName) => {
      if (!native) return;
      keywordMap.set(native, { native, normalized: normalizedName });
    };
    if (profile.keywords) {
      for (const [normalized2, translation] of Object.entries(profile.keywords)) {
        const target = translation.normalized || normalized2;
        register(translation.primary, target);
        for (const alt of translation.alternatives || []) {
          register(alt, target);
        }
      }
    }
    if (profile.references) {
      for (const [normalized2, native] of Object.entries(profile.references)) {
        register(native, normalized2);
      }
    }
    if (profile.roleMarkers) {
      for (const [role, marker] of Object.entries(profile.roleMarkers)) {
        register(marker.primary, role);
        for (const alt of marker.alternatives || []) {
          register(alt, role);
        }
      }
    }
    for (const extra of extras) {
      // Extras are stored as-is so any additional fields they carry survive.
      if (extra.native) {
        keywordMap.set(extra.native, extra);
      }
    }
    this.profileKeywords = Array.from(keywordMap.values()).sort(
      (a, b) => b.native.length - a.native.length
    );
    this.profileKeywordMap = /* @__PURE__ */ new Map();
    for (const keyword of this.profileKeywords) {
      this.profileKeywordMap.set(keyword.native.toLowerCase(), keyword);
      // Also index the diacritic-free spelling, unless that key is already taken.
      const normalized2 = this.removeDiacritics(keyword.native);
      if (normalized2 !== keyword.native && !this.profileKeywordMap.has(normalized2.toLowerCase())) {
        this.profileKeywordMap.set(normalized2.toLowerCase(), keyword);
      }
    }
  }
  /**
   * Remove diacritical marks from a word for normalization.
   * Primarily for Arabic (shadda, fatha, kasra, damma, sukun, etc.)
   * but could be extended for other languages.
   *
   * @param word - Word to normalize
   * @returns Word without diacritics
   */
  removeDiacritics(word) {
    return word.replace(/[\u064B-\u0652\u0670]/g, "");
  }
  /**
   * Try to match a keyword from profile at the current position.
   * Uses longest-first greedy matching (important for non-space languages).
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Token if matched, null otherwise
   */
  tryProfileKeyword(input, pos) {
    // Slice once; the remaining text is the same for every keyword tested.
    const remaining = input.slice(pos);
    for (const entry of this.profileKeywords) {
      if (remaining.startsWith(entry.native)) {
        return createToken(
          entry.native,
          "keyword",
          createPosition(pos, pos + entry.native.length),
          entry.normalized
        );
      }
    }
    return null;
  }
  /**
   * Check if the remaining input starts with any known keyword.
   * Useful for non-space languages to detect word boundaries.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns true if a keyword starts at this position
   */
  isKeywordStart(input, pos) {
    const remaining = input.slice(pos);
    return this.profileKeywords.some((entry) => remaining.startsWith(entry.native));
  }
  /**
   * Look up a keyword by native word (case-insensitive).
   * O(1) lookup using the keyword map.
   *
   * @param native - Native word to look up
   * @returns KeywordEntry if found, undefined otherwise
   */
  lookupKeyword(native) {
    return this.profileKeywordMap.get(native.toLowerCase());
  }
  /**
   * Check if a word is a known keyword (case-insensitive).
   * O(1) lookup using the keyword map.
   *
   * @param native - Native word to check
   * @returns true if the word is a keyword
   */
  isKeyword(native) {
    return this.profileKeywordMap.has(native.toLowerCase());
  }
  /**
   * Set the morphological normalizer for this tokenizer.
   */
  setNormalizer(normalizer) {
    this.normalizer = normalizer;
  }
  /**
   * Try to normalize a word using the morphological normalizer.
   * Returns null if no normalizer is set, if the stem equals the input, or if
   * the reported confidence is below 0.7.
   *
   * Note: We don't check isNormalizable() here because the individual tokenizers
   * historically called normalize() directly without that check. The normalize()
   * method itself handles returning noChange() for words that can't be normalized.
   */
  tryNormalize(word) {
    if (!this.normalizer) return null;
    const result = this.normalizer.normalize(word);
    if (result.stem !== word && result.confidence >= 0.7) {
      return result;
    }
    return null;
  }
  /**
   * Try morphological normalization and keyword lookup.
   *
   * If the word can be normalized to a stem that matches a known keyword,
   * returns a keyword token with morphological metadata (stem, stemConfidence).
   *
   * This is the common pattern for handling conjugated verbs across languages:
   * 1. Normalize the word (e.g., "toggled" → "toggle")
   * 2. Look up the stem in the keyword map
   * 3. Create a token with both the original form and stem metadata
   *
   * @param word - The word to normalize and look up
   * @param startPos - Start position for the token
   * @param endPos - End position for the token
   * @returns Token if stem matches a keyword, null otherwise
   */
  tryMorphKeywordMatch(word, startPos, endPos) {
    const result = this.tryNormalize(word);
    if (!result) return null;
    const stemEntry = this.lookupKeyword(result.stem);
    if (!stemEntry) return null;
    const tokenOptions = {
      normalized: stemEntry.normalized,
      stem: result.stem,
      stemConfidence: result.confidence
    };
    return createToken(word, "keyword", createPosition(startPos, endPos), tokenOptions);
  }
  /**
   * Try to extract a CSS selector at the current position.
   */
  trySelector(input, pos) {
    const selector = extractCssSelector(input, pos);
    if (selector) {
      return createToken(selector, "selector", createPosition(pos, pos + selector.length));
    }
    return null;
  }
  /**
   * Try to extract an event modifier at the current position.
   * Event modifiers are .once, .debounce(N), .throttle(N), .queue(strategy)
   *
   * The returned token carries `metadata.modifierName` and `metadata.value`:
   * the raw argument for `.queue`, the argument parsed as a base-10 integer
   * for the other modifiers, or undefined when no argument was given.
   */
  tryEventModifier(input, pos) {
    if (input[pos] !== ".") {
      return null;
    }
    // The modifier must be followed by whitespace, end of input, or another ".".
    const match = input.slice(pos).match(/^\.(?:once|debounce|throttle|queue)(?:\(([^)]+)\))?(?:\s|$|\.)/);
    if (!match) {
      return null;
    }
    // Drop the trailing delimiter that the regex consumed.
    const fullMatch = match[0].replace(/(\s|\.)$/, "");
    const modifierName = fullMatch.slice(1).split("(")[0];
    const value = match[1];
    const token = createToken(
      fullMatch,
      "event-modifier",
      createPosition(pos, pos + fullMatch.length)
    );
    return {
      ...token,
      metadata: {
        modifierName,
        value: value ? modifierName === "queue" ? value : parseInt(value, 10) : void 0
      }
    };
  }
  /**
   * Try to extract a string literal at the current position.
   */
  tryString(input, pos) {
    const literal = extractStringLiteral(input, pos);
    if (literal) {
      return createToken(literal, "literal", createPosition(pos, pos + literal.length));
    }
    return null;
  }
  /**
   * Try to extract a number at the current position.
   */
  tryNumber(input, pos) {
    const number = extractNumber(input, pos);
    if (number) {
      return createToken(number, "literal", createPosition(pos, pos + number.length));
    }
    return null;
  }
  /**
   * Try to match a time unit from a list of patterns.
   *
   * Units are tried in array order and the first acceptable one wins. A unit
   * is rejected when `notFollowedBy` equals the next character, or when
   * `checkBoundary` is set and the next character is an ASCII identifier char.
   *
   * @param input - Input string
   * @param pos - Position after the number
   * @param timeUnits - Array of time unit mappings (native pattern → standard suffix)
   * @param skipWhitespace - Whether to skip whitespace before time unit (default: false)
   * @returns Object with matched suffix and new position, or null if no match
   */
  tryMatchTimeUnit(input, pos, timeUnits, skipWhitespace = false) {
    let unitPos = pos;
    if (skipWhitespace) {
      while (unitPos < input.length && isWhitespace(input[unitPos])) {
        unitPos++;
      }
    }
    const remaining = input.slice(unitPos);
    for (const unit of timeUnits) {
      const candidate = remaining.slice(0, unit.length);
      const matches = unit.caseInsensitive ? candidate.toLowerCase() === unit.pattern.toLowerCase() : candidate === unit.pattern;
      if (matches) {
        if (unit.notFollowedBy) {
          const nextChar = remaining[unit.length] || "";
          if (nextChar === unit.notFollowedBy) continue;
        }
        if (unit.checkBoundary) {
          const nextChar = remaining[unit.length] || "";
          if (isAsciiIdentifierChar(nextChar)) continue;
        }
        return { suffix: unit.suffix, endPos: unitPos + unit.length };
      }
    }
    return null;
  }
  /**
   * Parse a base number (sign, integer, decimal) without time units.
   * Returns the number string and end position.
   *
   * @param input - Input string
   * @param startPos - Start position
   * @param allowSign - Whether to allow +/- sign (default: true)
   * @returns Object with number string and end position, or null
   */
  parseBaseNumber(input, startPos, allowSign = true) {
    let pos = startPos;
    let number = "";
    if (allowSign && (input[pos] === "-" || input[pos] === "+")) {
      number += input[pos++];
    }
    if (pos >= input.length || !isDigit(input[pos])) {
      return null;
    }
    while (pos < input.length && isDigit(input[pos])) {
      number += input[pos++];
    }
    if (pos < input.length && input[pos] === ".") {
      number += input[pos++];
      while (pos < input.length && isDigit(input[pos])) {
        number += input[pos++];
      }
    }
    if (!number || number === "-" || number === "+") return null;
    return { number, endPos: pos };
  }
  /**
   * Try to extract a number with native language time units.
   *
   * This is a template method that handles the common pattern:
   * 1. Parse the base number (sign, integer, decimal)
   * 2. Try to match native language time units
   * 3. Fall back to standard time units (ms, s, m, h)
   *
   * @param input - Input string
   * @param pos - Start position
   * @param nativeTimeUnits - Language-specific time unit mappings
   * @param options - Configuration options (`allowSign`, `skipWhitespace`)
   * @returns Token if number found, null otherwise
   */
  tryNumberWithTimeUnits(input, pos, nativeTimeUnits, options = {}) {
    const { allowSign = true, skipWhitespace = false } = options;
    const baseResult = this.parseBaseNumber(input, pos, allowSign);
    if (!baseResult) return null;
    let { number, endPos } = baseResult;
    // Native units come first so they win over the standard suffixes.
    const allUnits = [...nativeTimeUnits, ..._BaseTokenizer.STANDARD_TIME_UNITS];
    const timeMatch = this.tryMatchTimeUnit(input, endPos, allUnits, skipWhitespace);
    if (timeMatch) {
      number += timeMatch.suffix;
      endPos = timeMatch.endPos;
    }
    return createToken(number, "literal", createPosition(pos, endPos));
  }
  /**
   * Try to extract a URL at the current position.
   * Handles /path, ./path, ../path, //domain.com, http://, https://
   */
  tryUrl(input, pos) {
    const url = extractUrl(input, pos);
    if (url) {
      return createToken(url, "url", createPosition(pos, pos + url.length));
    }
    return null;
  }
  /**
   * Try to extract a variable reference (:varname) at the current position.
   * In hyperscript, :x refers to a local variable named x.
   */
  tryVariableRef(input, pos) {
    if (input[pos] !== ":") return null;
    if (pos + 1 >= input.length) return null;
    if (!isAsciiIdentifierChar(input[pos + 1])) return null;
    let endPos = pos + 1;
    while (endPos < input.length && isAsciiIdentifierChar(input[endPos])) {
      endPos++;
    }
    const varRef = input.slice(pos, endPos);
    return createToken(varRef, "identifier", createPosition(pos, endPos));
  }
  /**
   * Try to extract an operator or punctuation token at the current position.
   * Handles two-character operators (==, !=, etc.) and single-character operators.
   * Two-character operators are tried first so "==" is not split into "=" "=".
   */
  tryOperator(input, pos) {
    const twoChar = input.slice(pos, pos + 2);
    if (["==", "!=", "<=", ">=", "&&", "||", "->"].includes(twoChar)) {
      return createToken(twoChar, "operator", createPosition(pos, pos + 2));
    }
    const oneChar = input[pos];
    if (["<", ">", "!", "+", "-", "*", "/", "="].includes(oneChar)) {
      return createToken(oneChar, "operator", createPosition(pos, pos + 1));
    }
    if (["(", ")", "{", "}", ",", ";", ":"].includes(oneChar)) {
      return createToken(oneChar, "punctuation", createPosition(pos, pos + 1));
    }
    return null;
  }
  /**
   * Try to match a multi-character particle from a list.
   *
   * Used by languages like Japanese, Korean, and Chinese that have
   * multi-character particles (e.g., Japanese から, まで, より).
   * Particles are tried in array order; the first match wins.
   *
   * @param input - Input string
   * @param pos - Current position
   * @param particles - Array of multi-character particles to match
   * @returns Token if matched, null otherwise
   */
  tryMultiCharParticle(input, pos, particles) {
    for (const particle of particles) {
      if (input.slice(pos, pos + particle.length) === particle) {
        return createToken(particle, "particle", createPosition(pos, pos + particle.length));
      }
    }
    return null;
  }
};
/**
 * Standard (language-independent) time unit suffixes: ms, s, m, h.
 * tryNumberWithTimeUnits appends these after the native units as a fallback.
 * "ms" is listed first, and "m" is additionally rejected when followed by
 * "s", so that "ms" is never read as minutes.
 */
_BaseTokenizer.STANDARD_TIME_UNITS = [
  { pattern: "ms", suffix: "ms", length: 2 },
  { pattern: "s", suffix: "s", length: 1, checkBoundary: true },
  { pattern: "m", suffix: "m", length: 1, checkBoundary: true, notFollowedBy: "s" },
  { pattern: "h", suffix: "h", length: 1, checkBoundary: true }
];
var BaseTokenizer = _BaseTokenizer;
783
+
784
+ // src/tokenizers/morphology/types.ts
785
/**
 * Build the result for a word that needed no normalization: the word is its
 * own stem, reported with full confidence.
 *
 * @param word - The unmodified word
 * @returns Normalization result `{ stem, confidence: 1 }`
 */
function noChange(word) {
  const unchanged = { stem: word, confidence: 1 };
  return unchanged;
}
788
/**
 * Build a normalization result.
 *
 * @param stem - The stem produced by normalization
 * @param confidence - Confidence score for the stem
 * @param metadata - Optional details; the key is omitted when this is falsy
 * @returns `{ stem, confidence }`, plus `metadata` when provided
 */
function normalized(stem, confidence, metadata) {
  const result = { stem, confidence };
  if (metadata) {
    result.metadata = metadata;
  }
  return result;
}
794
+
795
+ // src/tokenizers/morphology/korean-normalizer.ts
796
/**
 * Test whether the first UTF-16 code unit of `char` lies in the precomposed
 * Hangul syllable range U+AC00..U+D7A3.
 *
 * @param char - String whose first code unit is examined
 * @returns true for a Hangul syllable; false otherwise (including "")
 */
function isHangul(char) {
  const HANGUL_FIRST = 0xac00;
  const HANGUL_LAST = 0xd7a3;
  const codeUnit = char.charCodeAt(0);
  // Written as two positive comparisons so NaN (empty string) yields false.
  return HANGUL_FIRST <= codeUnit && codeUnit <= HANGUL_LAST;
}
800
/**
 * Report whether any character of `word` is a Hangul syllable.
 *
 * @param word - Word to inspect
 * @returns true if at least one character passes isHangul
 */
function containsKorean(word) {
  return [...word].some((ch) => isHangul(ch));
}
806
+ var KOREAN_SUFFIX_RULES = [
807
+ // Honorific conditional/temporal forms (-시- infix)
808
+ // These are critical for polite/formal Korean
809
+ { pattern: "\uD558\uC2DC\uB2C8\uAE4C", confidence: 0.85, conjugationType: "honorific-causal", minStemLength: 1 },
810
+ { pattern: "\uD558\uC2E4\uB54C", confidence: 0.88, conjugationType: "honorific-temporal", minStemLength: 1 },
811
+ { pattern: "\uD558\uC2E4 \uB54C", confidence: 0.88, conjugationType: "honorific-temporal", minStemLength: 1 },
812
+ {
813
+ pattern: "\uD558\uC2DC\uBA74",
814
+ confidence: 0.88,
815
+ conjugationType: "honorific-conditional",
816
+ minStemLength: 1
817
+ },
818
+ {
819
+ pattern: "\uC73C\uC2DC\uBA74",
820
+ confidence: 0.85,
821
+ conjugationType: "honorific-conditional",
822
+ minStemLength: 2
823
+ },
824
+ { pattern: "\uC2DC\uBA74", confidence: 0.82, conjugationType: "honorific-conditional", minStemLength: 2 },
825
+ // Sequential/temporal forms - "after doing", "before doing", "as soon as"
826
+ { pattern: "\uD558\uACE0\uB098\uC11C", confidence: 0.85, conjugationType: "sequential-after", minStemLength: 1 },
827
+ { pattern: "\uD558\uACE0 \uB098\uC11C", confidence: 0.85, conjugationType: "sequential-after", minStemLength: 1 },
828
+ { pattern: "\uD558\uACE0\uC11C", confidence: 0.85, conjugationType: "sequential-after", minStemLength: 1 },
829
+ { pattern: "\uACE0\uB098\uC11C", confidence: 0.82, conjugationType: "sequential-after", minStemLength: 2 },
830
+ { pattern: "\uACE0 \uB098\uC11C", confidence: 0.82, conjugationType: "sequential-after", minStemLength: 2 },
831
+ { pattern: "\uACE0\uC11C", confidence: 0.82, conjugationType: "sequential-after", minStemLength: 2 },
832
+ { pattern: "\uD558\uAE30\uC804\uC5D0", confidence: 0.85, conjugationType: "sequential-before", minStemLength: 1 },
833
+ {
834
+ pattern: "\uD558\uAE30 \uC804\uC5D0",
835
+ confidence: 0.85,
836
+ conjugationType: "sequential-before",
837
+ minStemLength: 1
838
+ },
839
+ { pattern: "\uAE30\uC804\uC5D0", confidence: 0.82, conjugationType: "sequential-before", minStemLength: 2 },
840
+ { pattern: "\uAE30 \uC804\uC5D0", confidence: 0.82, conjugationType: "sequential-before", minStemLength: 2 },
841
+ { pattern: "\uD558\uC790\uB9C8\uC790", confidence: 0.88, conjugationType: "immediate", minStemLength: 1 },
842
+ { pattern: "\uC790\uB9C8\uC790", confidence: 0.85, conjugationType: "immediate", minStemLength: 2 },
843
+ // Obligation forms - "must do", "should do"
844
+ { pattern: "\uD574\uC57C\uD574\uC694", confidence: 0.85, conjugationType: "obligation", minStemLength: 1 },
845
+ { pattern: "\uD574\uC57C\uD574", confidence: 0.85, conjugationType: "obligation", minStemLength: 1 },
846
+ { pattern: "\uD574\uC57C\uD558\uB2E4", confidence: 0.85, conjugationType: "obligation", minStemLength: 1 },
847
+ { pattern: "\uC5B4\uC57C\uD574\uC694", confidence: 0.82, conjugationType: "obligation", minStemLength: 2 },
848
+ { pattern: "\uC5B4\uC57C\uD574", confidence: 0.82, conjugationType: "obligation", minStemLength: 2 },
849
+ { pattern: "\uC544\uC57C\uD574\uC694", confidence: 0.82, conjugationType: "obligation", minStemLength: 2 },
850
+ { pattern: "\uC544\uC57C\uD574", confidence: 0.82, conjugationType: "obligation", minStemLength: 2 },
851
+ // Conditional forms - most natural for event handlers (longest first)
852
+ // These are critical for native Korean idioms like "클릭하면 증가"
853
+ { pattern: "\uD558\uB2C8\uAE4C", confidence: 0.85, conjugationType: "causal-nikka", minStemLength: 1 },
854
+ { pattern: "\uD560\uB54C", confidence: 0.88, conjugationType: "temporal-ttae", minStemLength: 1 },
855
+ { pattern: "\uD560 \uB54C", confidence: 0.88, conjugationType: "temporal-ttae", minStemLength: 1 },
856
+ { pattern: "\uC744\uB54C", confidence: 0.85, conjugationType: "temporal-ttae", minStemLength: 2 },
857
+ { pattern: "\uC744 \uB54C", confidence: 0.85, conjugationType: "temporal-ttae", minStemLength: 2 },
858
+ { pattern: "\uD558\uBA74", confidence: 0.88, conjugationType: "conditional-myeon", minStemLength: 1 },
859
+ { pattern: "\uC73C\uBA74", confidence: 0.85, conjugationType: "conditional-myeon", minStemLength: 2 },
860
+ { pattern: "\uB2C8\uAE4C", confidence: 0.82, conjugationType: "causal-nikka", minStemLength: 2 },
861
+ { pattern: "\uBA74", confidence: 0.8, conjugationType: "conditional-myeon", minStemLength: 2 },
862
+ // Formal polite forms (longest first)
863
+ { pattern: "\uD558\uC600\uC2B5\uB2C8\uB2E4", confidence: 0.85, conjugationType: "past", minStemLength: 1 },
864
+ { pattern: "\uD588\uC2B5\uB2C8\uB2E4", confidence: 0.85, conjugationType: "past", minStemLength: 1 },
865
+ { pattern: "\uD569\uB2C8\uB2E4", confidence: 0.85, conjugationType: "polite", minStemLength: 1 },
866
+ { pattern: "\uC2B5\uB2C8\uB2E4", confidence: 0.82, conjugationType: "polite", minStemLength: 2 },
867
+ { pattern: "\uB429\uB2C8\uB2E4", confidence: 0.82, conjugationType: "polite", minStemLength: 1 },
868
+ { pattern: "\u3142\uB2C8\uB2E4", confidence: 0.82, conjugationType: "polite", minStemLength: 2 },
869
+ // Honorific request forms
870
+ { pattern: "\uD558\uC138\uC694", confidence: 0.85, conjugationType: "honorific", minStemLength: 1 },
871
+ { pattern: "\uD558\uC2ED\uC2DC\uC624", confidence: 0.85, conjugationType: "honorific", minStemLength: 1 },
872
+ { pattern: "\uC138\uC694", confidence: 0.82, conjugationType: "honorific", minStemLength: 2 },
873
+ { pattern: "\uC2ED\uC2DC\uC624", confidence: 0.82, conjugationType: "honorific", minStemLength: 2 },
874
+ // Informal polite (요) forms
875
+ { pattern: "\uD558\uACE0\uC788\uC5B4\uC694", confidence: 0.82, conjugationType: "progressive", minStemLength: 1 },
876
+ { pattern: "\uD558\uACE0\uC788\uC5B4", confidence: 0.82, conjugationType: "progressive", minStemLength: 1 },
877
+ { pattern: "\uD588\uC5B4\uC694", confidence: 0.85, conjugationType: "past", minStemLength: 1 },
878
+ { pattern: "\uD574\uC694", confidence: 0.85, conjugationType: "polite", minStemLength: 1 },
879
+ { pattern: "\uC5B4\uC694", confidence: 0.82, conjugationType: "polite", minStemLength: 2 },
880
+ { pattern: "\uC544\uC694", confidence: 0.82, conjugationType: "polite", minStemLength: 2 },
881
+ // Informal (반말) forms
882
+ { pattern: "\uD588\uC5B4", confidence: 0.85, conjugationType: "past", minStemLength: 1 },
883
+ { pattern: "\uD574", confidence: 0.8, conjugationType: "present", minStemLength: 1 },
884
+ { pattern: "\uC5C8\uC5B4", confidence: 0.82, conjugationType: "past", minStemLength: 2 },
885
+ { pattern: "\uC558\uC5B4", confidence: 0.82, conjugationType: "past", minStemLength: 2 },
886
+ // Progressive forms
887
+ { pattern: "\uD558\uACE0\uC788\uB2E4", confidence: 0.82, conjugationType: "progressive", minStemLength: 1 },
888
+ { pattern: "\uACE0\uC788\uB2E4", confidence: 0.8, conjugationType: "progressive", minStemLength: 2 },
889
+ { pattern: "\uACE0\uC788\uC5B4", confidence: 0.8, conjugationType: "progressive", minStemLength: 2 },
890
+ // Dictionary/infinitive form (하다 verbs)
891
+ { pattern: "\uD558\uB2E4", confidence: 0.88, conjugationType: "dictionary", minStemLength: 1 },
892
+ // Negative forms
893
+ { pattern: "\uD558\uC9C0\uC54A\uB2E4", confidence: 0.82, conjugationType: "negative", minStemLength: 1 },
894
+ { pattern: "\uC548\uD558\uB2E4", confidence: 0.82, conjugationType: "negative", minStemLength: 1 },
895
+ { pattern: "\uC9C0\uC54A\uB2E4", confidence: 0.8, conjugationType: "negative", minStemLength: 2 },
896
+ // Imperative forms
897
+ { pattern: "\uD574\uB77C", confidence: 0.82, conjugationType: "imperative", minStemLength: 1 },
898
+ { pattern: "\uD558\uB77C", confidence: 0.82, conjugationType: "imperative", minStemLength: 1 },
899
+ // Generic verb endings (lower confidence)
900
+ { pattern: "\uB2E4", confidence: 0.75, conjugationType: "dictionary", minStemLength: 2 }
901
+ ];
902
/**
 * Endings of 하다 verbs (noun + 하다), tested in array order with
 * `word.endsWith(pattern)`; the first match wins.
 *
 * Ordering invariant: no pattern may appear after another pattern that is one
 * of its own suffixes, otherwise it can never match. For that reason the
 * negative forms (안해요, 안해) are listed before the plain polite/informal
 * forms (해요, 해) that they end with.
 *
 * NOTE(review): with 안- matched first, a noun that itself ends in 안 followed
 * by 해/해요 is read as a negated verb; confirm this is acceptable for the
 * keyword set in use.
 */
var HADA_PATTERNS = [
  // Honorific forms (-시- infix) - polite/formal Korean
  // 클릭하시면 → 클릭 (if you click - honorific)
  { pattern: "\uD558\uC2DC\uB2C8\uAE4C", confidence: 0.88, conjugationType: "honorific-causal" },
  { pattern: "\uD558\uC2E4\uB54C", confidence: 0.88, conjugationType: "honorific-temporal" },
  { pattern: "\uD558\uC2E4 \uB54C", confidence: 0.88, conjugationType: "honorific-temporal" },
  { pattern: "\uD558\uC2DC\uBA74", confidence: 0.88, conjugationType: "honorific-conditional" },
  { pattern: "\uD558\uC168\uC5B4\uC694", confidence: 0.85, conjugationType: "honorific-past" },
  { pattern: "\uD558\uC168\uC5B4", confidence: 0.85, conjugationType: "honorific-past" },
  { pattern: "\uD558\uC2ED\uB2C8\uB2E4", confidence: 0.85, conjugationType: "honorific-polite" },
  // Sequential/temporal forms - "after doing", "before doing", "as soon as"
  { pattern: "\uD558\uACE0\uB098\uC11C", confidence: 0.88, conjugationType: "sequential-after" },
  { pattern: "\uD558\uACE0 \uB098\uC11C", confidence: 0.88, conjugationType: "sequential-after" },
  { pattern: "\uD558\uACE0\uC11C", confidence: 0.88, conjugationType: "sequential-after" },
  { pattern: "\uD558\uAE30\uC804\uC5D0", confidence: 0.88, conjugationType: "sequential-before" },
  { pattern: "\uD558\uAE30 \uC804\uC5D0", confidence: 0.88, conjugationType: "sequential-before" },
  { pattern: "\uD558\uC790\uB9C8\uC790", confidence: 0.88, conjugationType: "immediate" },
  // Obligation forms - "must do", "should do"
  { pattern: "\uD574\uC57C\uD574\uC694", confidence: 0.88, conjugationType: "obligation" },
  { pattern: "\uD574\uC57C\uD574", confidence: 0.88, conjugationType: "obligation" },
  { pattern: "\uD574\uC57C\uD558\uB2E4", confidence: 0.88, conjugationType: "obligation" },
  // Conditional forms - most natural for event handlers (highest priority)
  // 클릭하면 → 클릭 (if clicked)
  { pattern: "\uD558\uB2C8\uAE4C", confidence: 0.88, conjugationType: "causal-nikka" },
  { pattern: "\uD560\uB54C", confidence: 0.88, conjugationType: "temporal-ttae" },
  { pattern: "\uD560 \uB54C", confidence: 0.88, conjugationType: "temporal-ttae" },
  { pattern: "\uD558\uBA74", confidence: 0.88, conjugationType: "conditional-myeon" },
  // Formal
  { pattern: "\uD558\uC600\uC2B5\uB2C8\uB2E4", confidence: 0.85, conjugationType: "past" },
  { pattern: "\uD588\uC2B5\uB2C8\uB2E4", confidence: 0.85, conjugationType: "past" },
  { pattern: "\uD569\uB2C8\uB2E4", confidence: 0.85, conjugationType: "polite" },
  { pattern: "\uD558\uC2ED\uC2DC\uC624", confidence: 0.85, conjugationType: "honorific" },
  { pattern: "\uD558\uC138\uC694", confidence: 0.85, conjugationType: "honorific" },
  // Negative - must precede 해요 / 해, which are suffixes of 안해요 / 안해
  { pattern: "\uD558\uC9C0\uC54A\uC544\uC694", confidence: 0.82, conjugationType: "negative" },
  { pattern: "\uD558\uC9C0\uC54A\uB2E4", confidence: 0.82, conjugationType: "negative" },
  { pattern: "\uC548\uD574\uC694", confidence: 0.82, conjugationType: "negative" },
  { pattern: "\uC548\uD574", confidence: 0.82, conjugationType: "negative" },
  // Informal polite
  { pattern: "\uD588\uC5B4\uC694", confidence: 0.85, conjugationType: "past" },
  { pattern: "\uD574\uC694", confidence: 0.85, conjugationType: "polite" },
  // Informal
  { pattern: "\uD588\uC5B4", confidence: 0.85, conjugationType: "past" },
  { pattern: "\uD574", confidence: 0.8, conjugationType: "present" },
  // Progressive
  { pattern: "\uD558\uACE0\uC788\uC5B4\uC694", confidence: 0.82, conjugationType: "progressive" },
  { pattern: "\uD558\uACE0\uC788\uC5B4", confidence: 0.82, conjugationType: "progressive" },
  { pattern: "\uD558\uACE0\uC788\uB2E4", confidence: 0.82, conjugationType: "progressive" },
  // Connective forms (해서 = because/so, 하고 = and)
  { pattern: "\uD574\uC11C", confidence: 0.82, conjugationType: "connective" },
  { pattern: "\uD558\uACE0", confidence: 0.8, conjugationType: "connective" },
  // Imperative
  { pattern: "\uD574\uB77C", confidence: 0.82, conjugationType: "imperative" },
  { pattern: "\uD558\uB77C", confidence: 0.82, conjugationType: "imperative" },
  // Dictionary form
  { pattern: "\uD558\uB2E4", confidence: 0.88, conjugationType: "dictionary" }
];
959
+ var KoreanMorphologicalNormalizer = class {
960
+ constructor() {
961
+ this.language = "ko";
962
+ }
963
+ /**
964
+ * Check if a word might be a Korean verb that can be normalized.
965
+ */
966
+ isNormalizable(word) {
967
+ if (!containsKorean(word)) return false;
968
+ if (word.length < 2) return false;
969
+ return true;
970
+ }
971
+ /**
972
+ * Normalize a Korean word to its stem form.
973
+ */
974
+ normalize(word) {
975
+ const compoundResult = this.normalizeCompound(word);
976
+ if (compoundResult) return compoundResult;
977
+ const hadaResult = this.tryHadaNormalization(word);
978
+ if (hadaResult) return hadaResult;
979
+ for (const rule of KOREAN_SUFFIX_RULES) {
980
+ if (word.endsWith(rule.pattern)) {
981
+ const stem = word.slice(0, -rule.pattern.length);
982
+ const minLength = rule.minStemLength ?? 2;
983
+ if (stem.length < minLength) continue;
984
+ const metadata = {
985
+ removedSuffixes: [rule.pattern]
986
+ };
987
+ if (rule.conjugationType) {
988
+ metadata.conjugationType = rule.conjugationType;
989
+ }
990
+ return normalized(stem, rule.confidence, metadata);
991
+ }
992
+ }
993
+ return noChange(word);
994
+ }
995
+ /**
996
+ * Try to normalize a 하다 verb.
997
+ * 하다 verbs are formed by noun + 하다, very common in Korean.
998
+ */
999
+ tryHadaNormalization(word) {
1000
+ for (const pattern of HADA_PATTERNS) {
1001
+ if (word.endsWith(pattern.pattern)) {
1002
+ const stem = word.slice(0, -pattern.pattern.length);
1003
+ if (stem.length < 1) continue;
1004
+ return normalized(stem, pattern.confidence, {
1005
+ removedSuffixes: [pattern.pattern],
1006
+ conjugationType: pattern.conjugationType,
1007
+ originalForm: "hada-verb"
1008
+ });
1009
+ }
1010
+ }
1011
+ return null;
1012
+ }
1013
+ /**
1014
+ * Normalize compound conjugations (multi-layer suffixes).
1015
+ * Korean has complex compound forms that combine multiple grammatical elements.
1016
+ */
1017
+ normalizeCompound(word) {
1018
+ const compoundPatterns = [
1019
+ // Sequential past forms (after doing, was)
1020
+ {
1021
+ pattern: "\uD558\uACE0\uB098\uC11C\uC600\uC5B4",
1022
+ suffixes: ["\uD558\uACE0\uB098\uC11C", "\uC600\uC5B4"],
1023
+ confidence: 0.78,
1024
+ conjugationType: "sequential-after",
1025
+ minStemLength: 2
1026
+ },
1027
+ {
1028
+ pattern: "\uD558\uACE0\uB098\uC11C\uC600\uB2E4",
1029
+ suffixes: ["\uD558\uACE0\uB098\uC11C", "\uC600\uB2E4"],
1030
+ confidence: 0.78,
1031
+ conjugationType: "sequential-after",
1032
+ minStemLength: 2
1033
+ },
1034
+ {
1035
+ pattern: "\uD558\uACE0\uB098\uC11C",
1036
+ suffixes: ["\uD558\uACE0", "\uB098\uC11C"],
1037
+ confidence: 0.85,
1038
+ conjugationType: "sequential-after",
1039
+ minStemLength: 2
1040
+ },
1041
+ // Modal necessity past forms (had to do)
1042
+ {
1043
+ pattern: "\uD574\uC57C\uD588\uC5B4",
1044
+ suffixes: ["\uD574\uC57C", "\uD588\uC5B4"],
1045
+ confidence: 0.8,
1046
+ conjugationType: "obligation",
1047
+ minStemLength: 2
1048
+ },
1049
+ {
1050
+ pattern: "\uD574\uC57C\uD588\uB2E4",
1051
+ suffixes: ["\uD574\uC57C", "\uD588\uB2E4"],
1052
+ confidence: 0.8,
1053
+ conjugationType: "obligation",
1054
+ minStemLength: 2
1055
+ },
1056
+ {
1057
+ pattern: "\uD574\uC57C\uD588\uC2B5\uB2C8\uB2E4",
1058
+ suffixes: ["\uD574\uC57C", "\uD588\uC2B5\uB2C8\uB2E4"],
1059
+ confidence: 0.8,
1060
+ conjugationType: "obligation",
1061
+ minStemLength: 2
1062
+ },
1063
+ // Honorific simultaneous forms (while doing, honorific)
1064
+ {
1065
+ pattern: "\uD558\uC2DC\uBA74\uC11C",
1066
+ suffixes: ["\uD558\uC2DC", "\uBA74\uC11C"],
1067
+ confidence: 0.82,
1068
+ conjugationType: "connective",
1069
+ minStemLength: 2
1070
+ },
1071
+ {
1072
+ pattern: "\uD558\uC2DC\uBA70",
1073
+ suffixes: ["\uD558\uC2DC", "\uBA70"],
1074
+ confidence: 0.82,
1075
+ conjugationType: "connective",
1076
+ minStemLength: 2
1077
+ },
1078
+ // Progressive forms with copula
1079
+ {
1080
+ pattern: "\uD558\uACE0\uC788\uC5C8\uC5B4",
1081
+ suffixes: ["\uD558\uACE0", "\uC788\uC5C8\uC5B4"],
1082
+ confidence: 0.8,
1083
+ conjugationType: "progressive",
1084
+ minStemLength: 2
1085
+ },
1086
+ {
1087
+ pattern: "\uD558\uACE0\uC788\uC5C8\uB2E4",
1088
+ suffixes: ["\uD558\uACE0", "\uC788\uC5C8\uB2E4"],
1089
+ confidence: 0.8,
1090
+ conjugationType: "progressive",
1091
+ minStemLength: 2
1092
+ }
1093
+ ];
1094
+ for (const {
1095
+ pattern,
1096
+ suffixes,
1097
+ confidence,
1098
+ conjugationType,
1099
+ minStemLength
1100
+ } of compoundPatterns) {
1101
+ if (word.endsWith(pattern)) {
1102
+ const stem = word.slice(0, -pattern.length);
1103
+ if (stem.length < minStemLength) continue;
1104
+ return normalized(stem, confidence, {
1105
+ removedSuffixes: suffixes,
1106
+ conjugationType
1107
+ });
1108
+ }
1109
+ }
1110
+ return null;
1111
+ }
1112
+ };
1113
// Module-level instance of the Korean morphological normalizer.
+ var koreanMorphologicalNormalizer = new KoreanMorphologicalNormalizer();
1114
+
1115
+ // src/generators/profiles/korean.ts
1116
/**
 * Language profile for Korean ("ko"): SOV word order, particle-based
 * role marking, space-separated words with attached particles.
 *
 * The object is assembled from compact tables; the resulting shape and
 * key order are the same as a hand-written literal.
 */
var koreanProfile = (() => {
  // Every role marker in this profile follows the word it marks.
  const marker = (primary, ...alternatives) => ({ primary, alternatives, position: "after" });

  // [keyword, primary form, ...alternative forms]; `normalized` always equals the keyword.
  // NOTE(review): some surface forms occur under more than one keyword
  // (e.g. toggle/transition, add/append, for/while, get/fetch, then/and, on/when).
  // How such collisions are resolved is decided outside this block - confirm.
  const keywordRows = [
    // Class/Attribute operations
    ["toggle", "\uD1A0\uAE00", "\uC804\uD658"],
    ["add", "\uCD94\uAC00"],
    ["remove", "\uC81C\uAC70", "\uC0AD\uC81C"],
    // Content operations
    ["put", "\uB123\uB2E4", "\uB123\uAE30", "\uB193\uAE30"],
    ["append", "\uCD94\uAC00"],
    ["take", "\uAC00\uC838\uC624\uB2E4"],
    ["make", "\uB9CC\uB4E4\uB2E4"],
    ["clone", "\uBCF5\uC0AC"],
    ["swap", "\uAD50\uD658", "\uBC14\uAFB8\uB2E4"],
    ["morph", "\uBCC0\uD615", "\uBCC0\uD658"],
    // Variable operations
    ["set", "\uC124\uC815"],
    ["get", "\uC5BB\uB2E4", "\uAC00\uC838\uC624\uAE30"],
    ["increment", "\uC99D\uAC00"],
    ["decrement", "\uAC10\uC18C"],
    ["log", "\uB85C\uADF8"],
    // Visibility
    ["show", "\uBCF4\uC774\uB2E4", "\uD45C\uC2DC", "\uBCF4\uC774\uAE30"],
    ["hide", "\uC228\uAE30\uB2E4", "\uC228\uAE30\uAE30"],
    ["transition", "\uC804\uD658"],
    // Events
    ["on", "\uC5D0", "\uC2DC", "\uB54C", "\uD560 \uB54C"],
    ["trigger", "\uD2B8\uB9AC\uAC70"],
    ["send", "\uBCF4\uB0B4\uB2E4"],
    // DOM focus
    ["focus", "\uD3EC\uCEE4\uC2A4"],
    ["blur", "\uBE14\uB7EC"],
    // Navigation
    ["go", "\uC774\uB3D9"],
    // Async
    ["wait", "\uB300\uAE30"],
    ["fetch", "\uAC00\uC838\uC624\uAE30"],
    ["settle", "\uC548\uC815"],
    // Control flow
    ["if", "\uB9CC\uC57D"],
    ["when", "\uB54C"],
    ["where", "\uC5B4\uB514"],
    ["else", "\uC544\uB2C8\uBA74"],
    ["repeat", "\uBC18\uBCF5"],
    ["for", "\uB3D9\uC548"],
    ["while", "\uB3D9\uC548"],
    ["continue", "\uACC4\uC18D"],
    ["halt", "\uC815\uC9C0"],
    ["throw", "\uB358\uC9C0\uB2E4"],
    ["call", "\uD638\uCD9C"],
    ["return", "\uBC18\uD658"],
    ["then", "\uADF8\uB2E4\uC74C", "\uADF8\uB9AC\uACE0", "\uADF8\uB7F0\uD6C4"],
    ["and", "\uADF8\uB9AC\uACE0", "\uB610\uD55C", "\uBC0F"],
    ["end", "\uB05D", "\uC885\uB8CC", "\uB9C8\uCE68"],
    // Advanced
    ["js", "JS\uC2E4\uD589", "js"],
    ["async", "\uBE44\uB3D9\uAE30"],
    ["tell", "\uB9D0\uD558\uB2E4"],
    ["default", "\uAE30\uBCF8\uAC12"],
    ["init", "\uCD08\uAE30\uD654"],
    ["behavior", "\uB3D9\uC791"],
    ["install", "\uC124\uCE58"],
    ["measure", "\uCE21\uC815"],
    // Modifiers
    ["into", "\uC73C\uB85C"],
    ["before", "\uC804\uC5D0"],
    ["after", "\uD6C4\uC5D0"],
    // Event modifiers (for repeat until event)
    ["until", "\uAE4C\uC9C0"],
    ["event", "\uC774\uBCA4\uD2B8"],
    ["from", "\uC5D0\uC11C"]
  ];

  const keywords = {};
  for (const [name, primary, ...alternatives] of keywordRows) {
    // Entries without alternative forms carry no `alternatives` key at all.
    keywords[name] = alternatives.length > 0
      ? { primary, alternatives, normalized: name }
      : { primary, normalized: name };
  }

  return {
    code: "ko",
    name: "Korean",
    nativeName: "\uD55C\uAD6D\uC5B4",
    direction: "ltr",
    wordOrder: "SOV",
    markingStrategy: "particle",
    // Korean uses spaces between words, but particles attach
    usesSpaces: true,
    verb: {
      position: "end",
      suffixes: ["\uB2E4", "\uC694", "\uB2C8\uB2E4", "\uC138\uC694"],
      subjectDrop: true
    },
    references: {
      me: "\uB098", // "I/me" (informal)
      it: "\uADF8\uAC83", // "it"
      you: "\uB108", // "you" (informal)
      result: "\uACB0\uACFC",
      event: "\uC774\uBCA4\uD2B8",
      target: "\uB300\uC0C1",
      body: "\uBCF8\uBB38"
    },
    possessive: {
      marker: "\uC758", // possessive particle
      markerPosition: "between",
      specialForms: {
        me: "\uB0B4", // contracted "my"
        it: "\uADF8\uAC83\uC758", // "its"
        you: "\uB124" // contracted "your"
      },
      keywords: {
        "\uB0B4": "me", // nae (my)
        "\uB124": "you", // ne (your)
        "\uADF8\uC758": "it" // geu-ui (its/his)
      }
    },
    roleMarkers: {
      patient: marker("\uC744", "\uB97C"),
      destination: marker("\uC5D0", "\uC73C\uB85C", "\uB85C", "\uC5D0\uC11C"),
      source: marker("\uC5D0\uC11C", "\uBD80\uD130"),
      style: marker("\uB85C", "\uC73C\uB85C"),
      // Event as object marker
      event: marker("\uC744", "\uB97C")
    },
    keywords,
    tokenization: {
      particles: [
        "\uC744", "\uB97C", "\uC774", "\uAC00", "\uC740", "\uB294", "\uC5D0",
        "\uC5D0\uC11C", "\uC73C\uB85C", "\uB85C", "\uC640", "\uACFC", "\uB3C4"
      ],
      boundaryStrategy: "space"
    }
  };
})();
1246
+
1247
+ // src/tokenizers/korean.ts
1248
// Precomposed Hangul syllables, U+AC00 through U+D7A3.
var isHangul2 = createUnicodeRangeClassifier([[0xAC00, 0xD7A3]]);
// Individual jamo letters.
var isJamo = createUnicodeRangeClassifier([
  [0x1100, 0x11FF], // Hangul Jamo
  [0x3130, 0x318F] // Hangul Compatibility Jamo
]);
// Classifier built by combining the syllable and jamo classifiers.
var isKorean = combineClassifiers(isHangul2, isJamo);
1256
// Every particle the Korean tokenizer knows about, in lookup-table order.
var PARTICLES = /* @__PURE__ */ new Set([
  // Subject markers: i (after consonant), ga (after vowel)
  "\uC774", "\uAC00",
  // Object markers: eul (after consonant), reul (after vowel)
  "\uC744", "\uB97C",
  // Topic markers: eun (after consonant), neun (after vowel)
  "\uC740", "\uB294",
  // Location/time markers: e (at, to), eseo (at an action location, from)
  "\uC5D0", "\uC5D0\uC11C",
  // Direction/means markers: ro (after vowel or rieul), euro (after consonant)
  "\uB85C", "\uC73C\uB85C",
  // And/with: wa (after vowel), gwa (after consonant)
  "\uC640", "\uACFC",
  // ui (possessive), do (also), man (only)
  "\uC758", "\uB3C4", "\uB9CC",
  // buteo (from), kkaji (until), cheoreom (like), boda (than)
  "\uBD80\uD130", "\uAE4C\uC9C0", "\uCC98\uB7FC", "\uBCF4\uB2E4"
]);
// The one-character particles, in the same relative order as PARTICLES.
var SINGLE_CHAR_PARTICLES = /* @__PURE__ */ new Set(
  [...PARTICLES].filter((particle) => particle.length === 1)
);
// The longer particles, kept as an array in the same relative order as PARTICLES.
var MULTI_CHAR_PARTICLES = [...PARTICLES].filter((particle) => particle.length > 1);
1317
/**
 * Role information per particle: the semantic role it suggests, a
 * confidence score, a human-readable description and, for particles
 * that come in a consonant/vowel pair, which member of the pair it is
 * (the form is chosen by whether the preceding syllable ends in a
 * consonant or a vowel).
 */
var PARTICLE_ROLES = /* @__PURE__ */ (() => {
  // Unpaired particles get no `variant` key at all.
  const info = (role, confidence, description, variant) =>
    variant === undefined
      ? { role, confidence, description }
      : { role, confidence, variant, description };

  return new Map([
    // Subject markers (consonant/vowel pair)
    ["\uC774", info("agent", 0.85, "subject marker (after consonant)", "consonant")],
    ["\uAC00", info("agent", 0.85, "subject marker (after vowel)", "vowel")],
    // Object markers (consonant/vowel pair)
    ["\uC744", info("patient", 0.95, "object marker (after consonant)", "consonant")],
    ["\uB97C", info("patient", 0.95, "object marker (after vowel)", "vowel")],
    // Topic markers (consonant/vowel pair)
    ["\uC740", info("agent", 0.75, "topic marker (after consonant)", "consonant")],
    ["\uB294", info("agent", 0.75, "topic marker (after vowel)", "vowel")],
    // Location/time markers
    ["\uC5D0", info("destination", 0.85, "at/to marker")],
    ["\uC5D0\uC11C", info("source", 0.8, "at/from marker (action location)")],
    // Direction/means markers (consonant/vowel pair)
    ["\uB85C", info("destination", 0.85, "to/by means (after vowel or \u3139)", "vowel")],
    ["\uC73C\uB85C", info("destination", 0.85, "to/by means (after consonant)", "consonant")],
    // And/with markers (consonant/vowel pair)
    ["\uC640", info("style", 0.7, "and/with (after vowel)", "vowel")],
    ["\uACFC", info("style", 0.7, "and/with (after consonant)", "consonant")],
    // Other markers
    ["\uC758", info("patient", 0.6, "possessive marker")],
    ["\uB3C4", info("patient", 0.65, "also/too marker")],
    ["\uB9CC", info("patient", 0.65, "only marker")],
    ["\uBD80\uD130", info("source", 0.9, "from/since marker")],
    ["\uAE4C\uC9C0", info("destination", 0.75, "until/to marker")],
    ["\uCC98\uB7FC", info("manner", 0.8, "like/as marker")],
    ["\uBCF4\uB2E4", info("source", 0.75, "than marker")]
  ]);
})();
1420
/**
 * Extra native-to-normalized keyword entries passed to
 * `initializeKeywordsFromProfile` alongside the Korean profile.
 * Each element has the shape `{ native, normalized }`.
 */
var KOREAN_EXTRAS = (() => {
  const rows = [
    // Values/Literals
    ["\uCC38", "true"],
    ["\uAC70\uC9D3", "false"],
    ["\uB110", "null"],
    ["\uBBF8\uC815\uC758", "undefined"],
    // Positional
    ["\uCCAB\uBC88\uC9F8", "first"],
    ["\uB9C8\uC9C0\uB9C9", "last"],
    ["\uB2E4\uC74C", "next"],
    ["\uC774\uC804", "previous"],
    ["\uAC00\uC7A5\uAC00\uAE4C\uC6B4", "closest"],
    ["\uBD80\uBAA8", "parent"],
    // Events
    ["\uD074\uB9AD", "click"],
    ["\uB354\uBE14\uD074\uB9AD", "dblclick"],
    ["\uBCC0\uACBD", "change"],
    ["\uC81C\uCD9C", "submit"],
    ["\uC785\uB825", "input"],
    ["\uB85C\uB4DC", "load"],
    ["\uC2A4\uD06C\uB864", "scroll"],
    ["\uD0A4\uB2E4\uC6B4", "keydown"],
    ["\uD0A4\uC5C5", "keyup"],
    ["\uB9C8\uC6B0\uC2A4\uC624\uBC84", "mouseover"],
    ["\uB9C8\uC6B0\uC2A4\uC544\uC6C3", "mouseout"],
    // References (additional forms)
    ["\uB0B4", "my"],
    ["\uADF8\uAC83\uC758", "its"],
    // Conditional event forms (native idioms)
    ["\uD558\uBA74", "on"],
    ["\uC73C\uBA74", "on"],
    ["\uBA74", "on"],
    ["\uD560\uB54C", "on"],
    ["\uD560 \uB54C", "on"],
    ["\uC744\uB54C", "on"],
    ["\uC744 \uB54C", "on"],
    ["\uD558\uB2C8\uAE4C", "on"],
    ["\uB2C8\uAE4C", "on"],
    // Control flow helpers
    ["\uADF8\uB7EC\uBA74", "then"],
    ["\uADF8\uB807\uC9C0\uC54A\uC73C\uBA74", "otherwise"],
    ["\uC911\uB2E8", "break"],
    // Logical
    ["\uADF8\uB9AC\uACE0", "and"],
    ["\uB610\uB294", "or"],
    ["\uC544\uB2C8", "not"],
    ["\uC774\uB2E4", "is"],
    // Command override: the profile lists this same form under both add and append
    ["\uCD94\uAC00", "add"]
  ];

  // Attached particle forms (object particle written directly before the
  // command, without a space): first every reul form, then every eul form.
  const attachableCommands = [
    ["\uD1A0\uAE00", "toggle"],
    ["\uC804\uD658", "toggle"],
    ["\uCD94\uAC00", "add"],
    ["\uC81C\uAC70", "remove"],
    ["\uC0AD\uC81C", "remove"],
    ["\uC99D\uAC00", "increment"],
    ["\uAC10\uC18C", "decrement"],
    ["\uD45C\uC2DC", "show"],
    ["\uC228\uAE30\uB2E4", "hide"],
    ["\uC124\uC815", "set"]
  ];
  for (const objectParticle of ["\uB97C", "\uC744"]) {
    for (const [command, normalized] of attachableCommands) {
      rows.push([objectParticle + command, normalized]);
    }
  }

  // Time units
  rows.push(
    ["\uCD08", "s"],
    ["\uBC00\uB9AC\uCD08", "ms"],
    ["\uBD84", "m"],
    ["\uC2DC\uAC04", "h"]
  );

  return rows.map(([native, normalized]) => ({ native, normalized }));
})();
1499
// Korean time-unit suffixes and the unit each one maps to. The
// three-character "millisecond" form is listed ahead of the one-character
// "second" form that it ends with. `length` is the pattern's character count.
var KOREAN_TIME_UNITS = [
  ["\uBC00\uB9AC\uCD08", "ms"],
  ["\uC2DC\uAC04", "h"],
  ["\uCD08", "s"],
  ["\uBD84", "m"]
].map(([pattern, suffix]) => ({ pattern, suffix, length: pattern.length }));
1505
/**
 * Tokenizer for Korean input. Extends BaseTokenizer, registers keywords
 * from the Korean profile plus KOREAN_EXTRAS, and keeps its own
 * KoreanMorphologicalNormalizer instance on `this.normalizer`.
 */
var KoreanTokenizer = class extends BaseTokenizer {
  constructor() {
    super();
    this.language = "ko";
    this.direction = "ltr";
    this.initializeKeywordsFromProfile(koreanProfile, KOREAN_EXTRAS);
    this.normalizer = new KoreanMorphologicalNormalizer();
  }

  /**
   * Split `input` into tokens.
   *
   * At each position the recognisers are tried in a fixed order:
   * whitespace (skipped), event modifier / selector, quoted string, URL,
   * number (with Korean time units), variable reference, Korean word,
   * multi-character particle, single-character particle, ASCII word.
   * A character nothing recognises is skipped.
   *
   * NOTE(review): the two particle branches appear unreachable for Hangul
   * input - every listed particle is a Hangul syllable, so the Korean-word
   * branch runs first, and extractKoreanWord does not return null when the
   * first character is Korean. Confirm whether standalone particles are
   * meant to surface as "particle" tokens.
   *
   * @param {string} input - Source text.
   * @returns A TokenStreamImpl over the collected tokens, tagged "ko".
   */
  tokenize(input) {
    const tokens = [];
    let cursor = 0;

    // Record a token and resume scanning right after it.
    const accept = (token) => {
      tokens.push(token);
      cursor = token.position.end;
    };

    // Attach role metadata to a particle token when the particle is known.
    const withParticleRole = (token, surface) => {
      const roleInfo = PARTICLE_ROLES.get(surface);
      if (!roleInfo) {
        return token;
      }
      return {
        ...token,
        metadata: {
          particleRole: roleInfo.role,
          particleConfidence: roleInfo.confidence,
          particleVariant: roleInfo.variant
        }
      };
    };

    while (cursor < input.length) {
      const ch = input[cursor];

      if (isWhitespace(ch)) {
        cursor += 1;
        continue;
      }

      if (isSelectorStart(ch)) {
        // Event modifiers take precedence over plain selectors.
        const selectorLike = this.tryEventModifier(input, cursor) || this.trySelector(input, cursor);
        if (selectorLike) {
          accept(selectorLike);
          continue;
        }
      }

      if (isQuote(ch)) {
        const quoted = this.tryString(input, cursor);
        if (quoted) {
          accept(quoted);
          continue;
        }
      }

      if (isUrlStart(input, cursor)) {
        const url = this.tryUrl(input, cursor);
        if (url) {
          accept(url);
          continue;
        }
      }

      if (isDigit(ch)) {
        const number = this.extractKoreanNumber(input, cursor);
        if (number) {
          accept(number);
          continue;
        }
      }

      const variable = this.tryVariableRef(input, cursor);
      if (variable) {
        accept(variable);
        continue;
      }

      if (isKorean(ch)) {
        const koreanWord = this.extractKoreanWord(input, cursor);
        if (koreanWord) {
          accept(koreanWord);
          continue;
        }
      }

      const longParticle = this.tryMultiCharParticle(input, cursor, MULTI_CHAR_PARTICLES);
      if (longParticle) {
        accept(withParticleRole(longParticle, longParticle.value));
        continue;
      }

      if (SINGLE_CHAR_PARTICLES.has(ch)) {
        const shortParticle = createToken(ch, "particle", createPosition(cursor, cursor + 1));
        tokens.push(withParticleRole(shortParticle, ch));
        cursor += 1;
        continue;
      }

      if (isAsciiIdentifierChar(ch)) {
        const asciiWord = this.extractAsciiWord(input, cursor);
        if (asciiWord) {
          accept(asciiWord);
          continue;
        }
      }

      // Nothing recognised this character; drop it.
      cursor += 1;
    }

    return new TokenStreamImpl(tokens, "ko");
  }

  /**
   * Classify a raw token string. Checks run in priority order:
   * particle, keyword, selector, literal; anything else is an identifier.
   *
   * @param {string} token
   * @returns {"particle"|"keyword"|"selector"|"literal"|"identifier"}
   */
  classifyToken(token) {
    if (PARTICLES.has(token)) {
      return "particle";
    }
    if (this.isKeyword(token)) {
      return "keyword";
    }
    if (["#", ".", "["].some((prefix) => token.startsWith(prefix))) {
      return "selector";
    }
    const looksQuoted = token.startsWith('"') || token.startsWith("'");
    if (looksQuoted || /^\d/.test(token)) {
      return "literal";
    }
    return "identifier";
  }

  /**
   * Extract a Korean word starting at `startPos`.
   *
   * Phase 1 tries the longest all-Korean prefix (six characters down to
   * two) that is a known keyword, directly or via tryMorphKeywordMatch.
   * Phase 2 scans forward character by character and stops in front of a
   * particle that closes the word, then classifies the collected word as
   * keyword (direct or morphological match) or identifier.
   *
   * @param {string} input
   * @param {number} startPos
   * @returns A token, or null when no Korean character was collected.
   */
  extractKoreanWord(input, startPos) {
    const MAX_KEYWORD_LENGTH = 6;
    const longest = Math.min(MAX_KEYWORD_LENGTH, input.length - startPos);
    for (let size = longest; size >= 2; size--) {
      const stop = startPos + size;
      const candidate = input.slice(startPos, stop);
      const allKorean = candidate.split("").every((unit) => isKorean(unit));
      if (!allKorean) {
        continue;
      }
      const directHit = this.lookupKeyword(candidate);
      if (directHit) {
        return createToken(candidate, "keyword", createPosition(startPos, stop), directHit.normalized);
      }
      const morphHit = this.tryMorphKeywordMatch(candidate, startPos, stop);
      if (morphHit) {
        return morphHit;
      }
    }

    // True when the character at `index` cannot continue a Korean word
    // (end of input, whitespace, or a non-Korean character).
    const wordStopsAt = (index) => {
      const following = index < input.length ? input[index] : "";
      return following === "" || isWhitespace(following) || !isKorean(following);
    };

    let end = startPos;
    while (end < input.length) {
      const unit = input[end];
      // Particles only close a word that already has at least one character.
      if (end > startPos) {
        const closesOnSingle =
          SINGLE_CHAR_PARTICLES.has(unit) &&
          (wordStopsAt(end + 1) || SINGLE_CHAR_PARTICLES.has(input[end + 1]));
        if (closesOnSingle) {
          break;
        }
        const closesOnMulti = MULTI_CHAR_PARTICLES.some(
          (particle) => input.startsWith(particle, end) && wordStopsAt(end + particle.length)
        );
        if (closesOnMulti) {
          break;
        }
      }
      if (!isKorean(unit)) {
        break;
      }
      end += 1;
    }

    if (end === startPos) {
      return null;
    }
    const word = input.slice(startPos, end);
    const keywordHit = this.lookupKeyword(word);
    if (keywordHit) {
      return createToken(word, "keyword", createPosition(startPos, end), keywordHit.normalized);
    }
    const morphologicalHit = this.tryMorphKeywordMatch(word, startPos, end);
    if (morphologicalHit) {
      return morphologicalHit;
    }
    return createToken(word, "identifier", createPosition(startPos, end));
  }

  /**
   * Extract a run of ASCII identifier characters (mixed Korean/English content).
   *
   * @param {string} input
   * @param {number} startPos
   * @returns An identifier token, or null when no character matched.
   */
  extractAsciiWord(input, startPos) {
    let end = startPos;
    while (end < input.length && isAsciiIdentifierChar(input[end])) {
      end += 1;
    }
    if (end === startPos) {
      return null;
    }
    return createToken(input.slice(startPos, end), "identifier", createPosition(startPos, end));
  }

  /**
   * Extract a number, including a Korean time-unit suffix.
   * Delegates to tryNumberWithTimeUnits with signs and whitespace
   * skipping both disabled.
   *
   * @param {string} input
   * @param {number} startPos
   */
  extractKoreanNumber(input, startPos) {
    const options = { allowSign: false, skipWhitespace: false };
    return this.tryNumberWithTimeUnits(input, startPos, KOREAN_TIME_UNITS, options);
  }
};
1721
// Module-level tokenizer instance; it is registered below and appears in this module's export list.
+ var koreanTokenizer = new KoreanTokenizer();
1721
+
1722
+ // src/languages/ko.ts
1723
// Register the tokenizer together with the Korean profile under the language code "ko".
+ registerLanguage("ko", koreanTokenizer, koreanProfile);
1725
+ export {
1726
+ koreanProfile,
1727
+ koreanTokenizer
1728
+ };
1729
+ //# sourceMappingURL=ko.js.map