tokenize-is 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,25 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Jökull Sólberg
4
+
5
+ Based on Tokenizer by Miðeind ehf.
6
+ Copyright (c) 2020 Miðeind ehf.
7
+ Original author: Vilhjálmur Þorsteinsson
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,114 @@
1
+ # tokenize-is
2
+
3
+ TypeScript tokenizer for Icelandic text. A port of [Miðeind's Tokenizer](https://github.com/mideind/Tokenizer).
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install tokenize-is
9
+ # or
10
+ pnpm add tokenize-is
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ```typescript
16
+ import { tokenize, splitIntoSentences } from "tokenize-is";
17
+
18
+ // Tokenize text
19
+ const tokens = tokenize("Kl. 14:30 komu 100 gestir.");
20
+ for (const token of tokens) {
21
+ if (token.kind === "word") {
22
+ console.log(token.text);
23
+ } else if (token.kind === "number") {
24
+ console.log(token.value); // parsed number
25
+ }
26
+ }
27
+
28
+ // Split into sentences
29
+ const sentences = splitIntoSentences("Fyrst. Síðan.");
30
+ // → ["Fyrst.", "Síðan."]
31
+ ```
32
+
33
+ ## Token Types
34
+
35
+ All tokens have a `kind` discriminator for TypeScript narrowing:
36
+
37
+ | Kind | Description | Parsed Fields |
38
+ | -------------- | ----------------------------------- | -------------------------- |
39
+ | `word` | Words | `text` |
40
+ | `number` | Numbers (Icelandic/English formats) | `value` |
41
+ | `ordinal` | Ordinal numbers (1., XVII.) | `value` |
42
+ | `time` | Time (14:30, kl. tvö) | `hour`, `minute`, `second` |
43
+ | `date` | ISO dates | `year`, `month`, `day` |
44
+ | `dateabs` | Absolute dates (17. júní 1944) | `year`, `month`, `day` |
45
+ | `daterel` | Relative dates (3. janúar) | `year`, `month`, `day` |
46
+ | `year` | Four-digit years | `value` |
47
+ | `amount` | Currency amounts (100 kr.) | `value`, `currency` |
48
+ | `currency` | Currency codes/symbols | `iso` |
49
+ | `measurement` | Values with units (5km, 220V) | `value`, `unit` |
50
+ | `percent` | Percentages | `value` |
51
+ | `url` | URLs | `text` |
52
+ | `domain` | Domain names | `text` |
53
+ | `email` | Email addresses | `text` |
54
+ | `hashtag` | Hashtags (#iceland) | `text` |
55
+ | `username` | @mentions | `username` |
56
+ | `numwletter` | Number+letter (14b, 33C) | `value`, `letter` |
57
+ | `telno` | Phone numbers | `cc`, `number` |
58
+ | `molecule` | Chemical formulas (H2O) | `text` |
59
+ | `ssn` | Icelandic kennitala | `value` |
60
+ | `serialnumber` | Serial numbers | `text` |
61
+ | `timestamp` | Date+time combined | `year`..`second` |
62
+ | `punctuation` | Punctuation | `normalized`, `position` |
63
+
64
+ ## Options
65
+
66
+ ```typescript
67
+ tokenize(text, {
68
+ replaceCompositeGlyphs: true, // Normalize Unicode (a + ́ → á)
69
+ includeSentenceMarkers: false, // Add s_begin/s_end tokens
70
+ });
71
+ ```
72
+
73
+ ## Port Fidelity
74
+
75
+ This is a TypeScript port of [Miðeind's Tokenizer](https://github.com/mideind/Tokenizer) (MIT licensed).
76
+
77
+ ### Supported
78
+
79
+ - All 30 token types from the original
80
+ - Sentence boundary detection with abbreviation awareness
81
+ - Unicode normalization (composite glyphs)
82
+ - Icelandic number formats (1.234,56)
83
+ - Spelled-out time expressions (hálftvö → 1:30)
84
+ - ~100 Icelandic abbreviations
85
+ - 70+ SI units, 18+ currencies
86
+ - Kennitala (SSN) validation with checksum
87
+
88
+ ### Not Yet Implemented
89
+
90
+ - `detokenize()` - reconstruct text from tokens
91
+ - `correct_spaces()` - fix spacing between tokens
92
+ - `paragraphs()` / `mark_paragraphs()` - paragraph handling
93
+ - HTML entity unescaping (`&aacute;` → á)
94
+ - Full abbreviation list (300+ in original vs ~100 here)
95
+
96
+ ### Design Differences
97
+
98
+ - ESM-only (no CommonJS)
99
+ - Returns arrays instead of generators
100
+ - Discriminated unions instead of numeric token codes
101
+ - Zero runtime dependencies
102
+
103
+ ## Development
104
+
105
+ ```bash
106
+ pnpm install
107
+ pnpm test # Run tests
108
+ pnpm build # Build with tsdown
109
+ pnpm check # Lint + format + typecheck
110
+ ```
111
+
112
+ ## License
113
+
114
+ MIT - same as the original Tokenizer.
@@ -0,0 +1,253 @@
1
+ //#region src/types.d.ts
2
+ /**
3
+ * Punctuation position types for spacing rules
+ *
+ * - "left": opening marks and prefixes such as ( [ „ ‚ « # $ € £ ¥ ₽ <
+ * - "right": closing marks such as . , : ; ) ] ! % ‰ ? » … > °
+ * - "none": ^ / ± ' ´ ~ \ and the hyphen / dash characters
+ * - "center": any other single character, and every mark whose normalized
+ *   form is longer than one character
4
+ */
5
+ type PunctuationType = "left" | "center" | "right" | "none";
6
+ /**
7
+ * Discriminated union for all token types.
8
+ * Use `token.kind` to narrow the type.
+ *
+ * The three sentence-marker kinds (`s_begin`, `s_end`, `s_split`) carry
+ * `text: null`; every other kind carries the matched source text in `text`.
9
+ */
10
+ type Token = WordToken | NumberToken | OrdinalToken | TimeToken | DateToken | DateAbsToken | DateRelToken | YearToken | AmountToken | CurrencyToken | MeasurementToken | PercentToken | PunctuationToken | UrlToken | DomainToken | EmailToken | HashtagToken | UsernameToken | NumberWithLetterToken | TelnoToken | MoleculeToken | SsnToken | SerialNumberToken | TimestampToken | TimestampAbsToken | TimestampRelToken | CompanyToken | PersonToken | EntityToken | UnknownToken | SentenceBeginToken | SentenceEndToken | SentenceSplitToken;
11
+ /** A word. Recognized abbreviations keep their trailing period in `text` (e.g. "kl."). */ interface WordToken {
12
+ kind: "word";
13
+ text: string;
14
+ }
15
+ /** A number written with Icelandic (1.234,56) or English (1,234.56) separators, or a plain integer. */ interface NumberToken {
16
+ kind: "number";
17
+ /** The digits exactly as written, including any sign and separators. */ text: string;
18
+ /** The parsed numeric value. */ value: number;
19
+ }
20
+ /** An ordinal. The bundled lexer emits this kind for `#` followed only by digits (e.g. "#3"). */ interface OrdinalToken {
21
+ kind: "ordinal";
22
+ text: string;
23
+ /** The integer value; for the "#3" form, the digits after the `#`. */ value: number;
24
+ }
25
+ /** A clock time, from digits (14:30, 14:30:05) or from "kl." / "klukkan" followed by a time or a spelled-out hour. */ interface TimeToken {
26
+ kind: "time";
27
+ /** Source text; includes the "kl." prefix and any fractional-seconds suffix when present. */ text: string;
28
+ /** 0–23 for digit forms. */ hour: number;
29
+ /** 0–59. */ minute: number;
30
+ /** 0–59; 0 when the source has no seconds. Fractional seconds are not stored. */ second: number;
31
+ }
32
+ /** A fully numeric calendar date (e.g. 2024-01-31 or 31.1.2024). Only emitted for valid dates with a year in 1776–2100. */ interface DateToken {
33
+ kind: "date";
34
+ text: string;
35
+ /** Four-digit year; two-digit years are expanded (51–99 → 19xx, 00–50 → 20xx). */ year: number;
36
+ /** 1–12. */ month: number;
37
+ /** 1–31, checked against the month length. */ day: number;
38
+ }
39
+ /** An absolute date formed when a year-less date is followed by a number in 1776–2100 or by a year token. */ interface DateAbsToken {
40
+ kind: "dateabs";
41
+ /** The date text and the year text joined by a single space. */ text: string;
42
+ year: number;
43
+ month: number;
44
+ day: number;
45
+ }
46
+ /** A partial date: day + month without a year (e.g. "3 janúar", "03.01") or month + year without a day (e.g. "06.2024"). */ interface DateRelToken {
47
+ kind: "daterel";
48
+ text: string;
49
+ /** 0 when the source gives no year. */ year: number;
50
+ /** 1–12. */ month: number;
51
+ /** 0 for the month + year form. */ day: number;
52
+ }
53
+ /** A year. The bundled tokenizer emits this kind when a number is followed by "e.Kr" or "f.Kr" (with or without a trailing period). */ interface YearToken {
54
+ kind: "year";
55
+ text: string;
56
+ /** The year; negative when followed by "f.Kr" / "f.Kr.". */ value: number;
57
+ }
58
+ /** A monetary amount: a currency symbol followed by a number, or a number followed by an ISO code or an Icelandic króna abbreviation. */ interface AmountToken {
59
+ kind: "amount";
60
+ text: string;
61
+ /** The amount; for króna abbreviations it is multiplied by the abbreviation's scale (e.g. "þ.kr." → ×1000, "m.kr." → ×1e6). */ value: number;
62
+ /** Three-letter currency code (e.g. "USD"); "ISK" for króna abbreviations. */ currency: string;
63
+ }
64
+ /** A currency code or symbol on its own. NOTE(review): no code path in the bundled tokenizer appears to emit this kind — confirm. */ interface CurrencyToken {
65
+ kind: "currency";
66
+ text: string;
67
+ /** Presumably the ISO 4217 code — TODO confirm against the producer. */ iso: string;
68
+ }
69
+ /** A number immediately followed by a known unit symbol (e.g. "5km", "220V"). */ interface MeasurementToken {
70
+ kind: "measurement";
71
+ /** The number and the unit symbol as written. */ text: string;
72
+ /** The number as written; the bundled lexer does not scale it by the unit's conversion factor. */ value: number;
73
+ /** The base unit the written symbol maps to in the units table (e.g. "m" for "km", "J" for "kWh"). */ unit: string;
74
+ }
75
+ /** A number followed directly by "%" or "‰", or by one of the words prósent / prósenta / prósenti / hundraðshluti. */ interface PercentToken {
76
+ kind: "percent";
77
+ text: string;
78
+ /** The number as written (also for "‰"; no conversion is applied). */ value: number;
79
+ }
80
+ /** A punctuation mark. */ interface PunctuationToken {
81
+ kind: "punctuation";
82
+ /** The mark as it appears in the input (e.g. "..."). */ text: string;
83
+ /** Canonical form (e.g. "..." → "…", dashes → "-", quotes → „ “ ‚ ’); equals `text` when no mapping applies. */ normalized: string;
84
+ /** Derived from `normalized`; always "center" when `normalized` is longer than one character. */ position: PunctuationType;
85
+ }
86
+ /** A URL: input starting with http://, https://, ftp://, file://, mailto: or www. (case-insensitive). */ interface UrlToken {
87
+ kind: "url";
88
+ /** The URL with trailing right-hand punctuation (such as "." or ")") split off into separate tokens. */ text: string;
89
+ }
90
+ /** A bare domain name (no scheme), containing at least one dot. */ interface DomainToken {
91
+ kind: "domain";
92
+ text: string;
93
+ }
94
+ /** An e-mail address of the form local@host.tld. */ interface EmailToken {
95
+ kind: "email";
96
+ text: string;
97
+ }
98
+ /** A hashtag. A `#` followed only by digits is emitted as an `ordinal` token instead. */ interface HashtagToken {
99
+ kind: "hashtag";
100
+ /** The hashtag including the leading "#". */ text: string;
101
+ }
102
+ /** An @mention made of ASCII letters, digits and underscores. */ interface UsernameToken {
103
+ kind: "username";
104
+ /** The mention including the leading "@". */ text: string;
105
+ /** The mention without the leading "@". */ username: string;
106
+ }
107
+ /** Digits followed by a single ASCII letter (e.g. "14b"). Not emitted when the letter is a known unit symbol; that yields a measurement. */ interface NumberWithLetterToken {
108
+ kind: "numwletter";
109
+ text: string;
110
+ /** The integer part. */ value: number;
111
+ /** The trailing letter, case preserved. */ letter: string;
112
+ }
113
+ /** A phone number: either "+<country code>" followed by seven digits, or a bare run of exactly seven digits. */ interface TelnoToken {
114
+ kind: "telno";
115
+ text: string;
116
+ /** Country-code digits without the "+"; the empty string for a bare seven-digit number. */ cc: string;
117
+ /** The seven local digits with any hyphen or space removed. */ number: string;
118
+ }
119
+ /** A chemical formula: at least two valid element symbols, each optionally followed by a count (e.g. "H2O"). */ interface MoleculeToken {
120
+ kind: "molecule";
121
+ text: string;
122
+ }
123
+ /** An Icelandic kennitala (6 digits, optional hyphen, 4 digits) that passes the date and checksum validation. */ interface SsnToken {
124
+ kind: "ssn";
125
+ /** The kennitala as written, hyphen included if present. */ text: string;
126
+ /** The ten digits without the hyphen. */ value: string;
127
+ }
128
+ /** Two or more digit groups joined by hyphens (e.g. "123-456-7") that did not match an earlier date, time or kennitala pattern. */ interface SerialNumberToken {
129
+ kind: "serialnumber";
130
+ text: string;
131
+ }
132
+ /** A date and a time together: "YYYY-MM-DDTHH:MM[:SS]" in one piece, or a date token directly followed by a time token. */ interface TimestampToken {
133
+ kind: "timestamp";
134
+ text: string;
135
+ year: number;
136
+ month: number;
137
+ day: number;
138
+ hour: number;
139
+ minute: number;
140
+ /** 0 when the source has no seconds. */ second: number;
141
+ }
142
+ /** Absolute timestamp. NOTE(review): no code path in the bundled tokenizer appears to emit this kind — confirm. */ interface TimestampAbsToken {
143
+ kind: "timestampabs";
144
+ text: string;
145
+ year: number;
146
+ month: number;
147
+ day: number;
148
+ hour: number;
149
+ minute: number;
150
+ second: number;
151
+ }
152
+ /** Relative timestamp. NOTE(review): no code path in the bundled tokenizer appears to emit this kind — confirm. */ interface TimestampRelToken {
153
+ kind: "timestamprel";
154
+ text: string;
155
+ year: number;
156
+ month: number;
157
+ day: number;
158
+ hour: number;
159
+ minute: number;
160
+ second: number;
161
+ }
162
+ /** A company name supplied through inline "[company:…]" markup in the input. */ interface CompanyToken {
163
+ kind: "company";
164
+ /** The content captured between the tag and the closing bracket (tag and brackets excluded). */ text: string;
165
+ }
166
+ /** A person name supplied through inline "[person:…]" markup in the input. */ interface PersonToken {
167
+ kind: "person";
168
+ /** The content captured between the tag and the closing bracket (tag and brackets excluded). */ text: string;
169
+ }
170
+ /** A named entity supplied through inline "[entity:…]" markup in the input. */ interface EntityToken {
171
+ kind: "entity";
172
+ /** The content captured between the tag and the closing bracket (tag and brackets excluded). */ text: string;
173
+ }
174
+ /** Fallback for input the lexer could not classify; emitted one character at a time. */ interface UnknownToken {
175
+ kind: "unknown";
176
+ /** The single unclassified character. */ text: string;
177
+ }
178
+ /** Marks the start of a sentence. Only present when `includeSentenceMarkers` is true. */ interface SentenceBeginToken {
179
+ kind: "s_begin";
180
+ text: null;
181
+ }
182
+ /** Marks the end of a sentence. Only present when `includeSentenceMarkers` is true. */ interface SentenceEndToken {
183
+ kind: "s_end";
184
+ text: null;
185
+ }
186
+ /** Internal paragraph-break marker (blank line in the input). `tokenize` consumes or filters it, so it does not appear in its result. */ interface SentenceSplitToken {
187
+ kind: "s_split";
188
+ text: null;
189
+ }
190
+ /**
191
+ * Options for the tokenize function
192
+ */
193
+ interface TokenizeOptions {
194
+ /**
195
+ * Whether to replace composite Unicode glyphs (e.g., a + combining accent → á)
+ * The same pass also removes soft hyphens (U+00AD), zero-width spaces
+ * (U+200B) and U+FEFF from the text before lexing.
196
+ * @default true
197
+ */
198
+ replaceCompositeGlyphs?: boolean;
199
+ /**
200
+ * Whether to include sentence boundary markers in output
+ * When true, every sentence is wrapped in an `s_begin` / `s_end` token pair.
201
+ * @default false
202
+ */
203
+ includeSentenceMarkers?: boolean;
204
+ }
205
+ //#endregion
206
+ //#region src/tokenize.d.ts
207
+ /**
208
+ * Tokenize Icelandic text into an array of tokens.
+ *
+ * The text is split on whitespace (blank lines separate paragraphs), lexed
+ * into raw tokens, and then adjacent tokens are merged into abbreviations,
+ * amounts, percentages, dates, years, clock times and timestamps.
209
+ *
210
+ * @param text - The text to tokenize
211
+ * @param options - Tokenization options
212
+ * @returns Array of tokens; the internal `s_split` marker is never included
213
+ *
214
+ * @example
215
+ * ```ts
216
+ * const tokens = tokenize("Þetta er próf.");
217
+ * // → [word("Þetta"), word("er"), word("próf"), punctuation(".")]
218
+ *
219
+ * // With sentence markers:
220
+ * const marked = tokenize("Þetta er próf.", { includeSentenceMarkers: true });
221
+ * // → [s_begin, word("Þetta"), word("er"), word("próf"), punctuation("."), s_end]
222
+ * ```
223
+ */
224
+ declare function tokenize(text: string, options?: TokenizeOptions): Token[];
225
+ //#endregion
226
+ //#region src/split-sentences.d.ts
227
+ /**
228
+ * Split text into sentences
229
+ *
230
+ * A higher-level function that returns sentence strings rather than tokens.
231
+ */
232
+ /**
233
+ * Split Icelandic text into an array of sentence strings.
+ *
+ * Each sentence is rebuilt from its tokens rather than sliced from the
+ * input: punctuation is written in its normalized form (e.g. "..." → "…")
+ * and spacing between tokens is re-derived, so a returned sentence is not
+ * guaranteed to be an exact substring of `text`. Tokenization uses the
+ * default options.
234
+ *
235
+ * @param text - The text to split
236
+ * @returns Array of sentence strings
237
+ *
238
+ * @example
239
+ * ```ts
240
+ * const sentences = splitIntoSentences("Þetta er fyrsta setning. Þetta er önnur.");
241
+ * // → ["Þetta er fyrsta setning.", "Þetta er önnur."]
242
+ * ```
243
+ */
244
+ declare function splitIntoSentences(text: string): string[];
245
+ //#endregion
246
+ //#region src/pipeline/lexer.d.ts
247
+ /**
248
+ * Replace composite Unicode glyphs with single characters
+ *
+ * Covers a/e/i/o/u/y followed by a combining acute accent (U+0301) and
+ * a/e/o/u followed by a combining diaeresis (U+0308), in both cases, and
+ * removes soft hyphens (U+00AD), zero-width spaces (U+200B) and U+FEFF.
+ * All other characters are returned unchanged.
+ *
+ * @param text - The text to normalize
+ * @returns The text with the replacements applied
249
+ */
250
+ declare function normalizeUnicode(text: string): string;
251
+ //#endregion
252
+ export { type AmountToken, type CompanyToken, type CurrencyToken, type DateAbsToken, type DateRelToken, type DateToken, type DomainToken, type EmailToken, type EntityToken, type HashtagToken, type MeasurementToken, type MoleculeToken, type NumberToken, type NumberWithLetterToken, type OrdinalToken, type PercentToken, type PersonToken, type PunctuationToken, type PunctuationType, type SentenceBeginToken, type SentenceEndToken, type SentenceSplitToken, type SerialNumberToken, type SsnToken, type TelnoToken, type TimeToken, type TimestampAbsToken, type TimestampRelToken, type TimestampToken, type Token, type TokenizeOptions, type UnknownToken, type UrlToken, type UsernameToken, type WordToken, type YearToken, normalizeUnicode, splitIntoSentences, tokenize };
253
+ //# sourceMappingURL=index.d.mts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.mts","names":[],"sources":["../src/types.ts","../src/tokenize.ts","../src/split-sentences.ts","../src/pipeline/lexer.ts"],"mappings":";;AAGA;;KAAY,eAAA;;;AAMZ;;KAAY,KAAA,GACR,SAAA,GACA,WAAA,GACA,YAAA,GACA,SAAA,GACA,SAAA,GACA,YAAA,GACA,YAAA,GACA,SAAA,GACA,WAAA,GACA,aAAA,GACA,gBAAA,GACA,YAAA,GACA,gBAAA,GACA,QAAA,GACA,WAAA,GACA,UAAA,GACA,YAAA,GACA,aAAA,GACA,qBAAA,GACA,UAAA,GACA,aAAA,GACA,QAAA,GACA,iBAAA,GACA,cAAA,GACA,iBAAA,GACA,iBAAA,GACA,YAAA,GACA,WAAA,GACA,WAAA,GACA,YAAA,GACA,kBAAA,GACA,gBAAA,GACA,kBAAA;AAAA,UAEa,SAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,WAAA;EACf,IAAA;EACA,IAAA;EACA,KAAA;AAAA;AAAA,UAGe,YAAA;EACf,IAAA;EACA,IAAA;EACA,KAAA;AAAA;AAAA,UAGe,SAAA;EACf,IAAA;EACA,IAAA;EACA,IAAA;EACA,MAAA;EACA,MAAA;AAAA;AAAA,UAGe,SAAA;EACf,IAAA;EACA,IAAA;EACA,IAAA;EACA,KAAA;EACA,GAAA;AAAA;AAAA,UAGe,YAAA;EACf,IAAA;EACA,IAAA;EACA,IAAA;EACA,KAAA;EACA,GAAA;AAAA;AAAA,UAGe,YAAA;EACf,IAAA;EACA,IAAA;EACA,IAAA;EACA,KAAA;EACA,GAAA;AAAA;AAAA,UAGe,SAAA;EACf,IAAA;EACA,IAAA;EACA,KAAA;AAAA;AAAA,UAGe,WAAA;EACf,IAAA;EACA,IAAA;EACA,KAAA;EACA,QAAA;AAAA;AAAA,UAGe,aAAA;EACf,IAAA;EACA,IAAA;EACA,GAAA;AAAA;AAAA,UAGe,gBAAA;EACf,IAAA;EACA,IAAA;EACA,KAAA;EACA,IAAA;AAAA;AAAA,UAGe,YAAA;EACf,IAAA;EACA,IAAA;EACA,KAAA;AAAA;AAAA,UAGe,gBAAA;EACf,IAAA;EACA,IAAA;EACA,UAAA;EACA,QAAA,EAAU,eAAA;AAAA;AAAA,UAGK,QAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,WAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,UAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,YAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,aAAA;EACf,IAAA;EACA,IAAA;EACA,QAAA;AAAA;AAAA,UAGe,qBAAA;EACf,IAAA;EACA,IAAA;EACA,KAAA;EACA,MAAA;AAAA;AAAA,UAGe,UAAA;EACf,IAAA;EACA,IAAA;EACA,EAAA;EACA,MAAA;AAAA;AAAA,UAGe,aAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,QAAA;EACf,IAAA;EACA,IAAA;EACA,KAAA;AAAA;AAAA,UAGe,iBAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,cAAA;EACf,IAAA;EACA,IAAA;EACA,IAAA;EACA,KAAA;EACA,GAAA;EACA,IAAA;EACA,MAAA;EACA,MAAA;AAAA;AAAA,UAGe,iBAAA;EACf,IAAA;EACA,IAAA;EACA,IAAA;EACA,KAAA;EACA,GAAA;EACA,IAAA;EACA,MAAA;EACA,MAAA;AAAA;AAAA,UAGe,iBAAA;EACf,IAAA;EACA,IAAA
;EACA,IAAA;EACA,KAAA;EACA,GAAA;EACA,IAAA;EACA,MAAA;EACA,MAAA;AAAA;AAAA,UAGe,YAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,WAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,WAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,YAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,kBAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,gBAAA;EACf,IAAA;EACA,IAAA;AAAA;AAAA,UAGe,kBAAA;EACf,IAAA;EACA,IAAA;AAAA;;;;UAMe,eAAA;EAzIV;AAGP;;;EA2IE,sBAAA;EA1IA;;;;EAgJA,sBAAA;AAAA;;;AArQF;;;;;;;;;;;;;;;;;AAAA,iBCoBgB,QAAA,CAAS,IAAA,UAAc,OAAA,GAAS,eAAA,GAAuB,KAAA;;;;AD1BvE;;;;;AAMA;;;;;;;;;;;iBEYgB,kBAAA,CAAmB,IAAA;;;;;;iBCsCnB,gBAAA,CAAiB,IAAA"}
package/dist/index.mjs ADDED
@@ -0,0 +1,2 @@
1
+ const e={á:`á`,é:`é`,í:`í`,ó:`ó`,ú:`ú`,ý:`ý`,Á:`Á`,É:`É`,Í:`Í`,Ó:`Ó`,Ú:`Ú`,Ý:`Ý`,ä:`ä`,ë:`ë`,ö:`ö`,ü:`ü`,Ä:`Ä`,Ë:`Ë`,Ö:`Ö`,Ü:`Ü`,"­":``,"​":``,"":``},t=`-–—`,n=`([„‚«#$€£¥₽<`;``+t,n+``;const r=new Set(`([„‚«#$€£¥₽<"*•&+=@©|.,:;)]!%‰?»“’‛‘…>°^/±'´~\\-–—`),i=new Set(n),a=new Set(`.,:;)]!%‰?»“’‛‘…>°`),o=new Set(`^/±'´~\\-–—`),s=`'‚‛’´`,c=`"“„”«»`,l=new Set([`.`,`?`,`!`,`…`]),u=new Set([`)`,`]`,`“`,`»`,`”`,`’`,`"`,`[…]`]),d=new Set([`?`,`!`,`…`]),f=new Set(`0123456789`),p=new Set([`+`,`-`]),m={janúar:1,janúars:1,febrúar:2,febrúars:2,mars:3,apríl:4,apríls:4,maí:5,maís:5,júní:6,júnís:6,júlí:7,júlís:7,ágúst:8,ágústs:8,september:9,septembers:9,október:10,októbers:10,nóvember:11,nóvembers:11,desember:12,desembers:12,"jan.":1,"feb.":2,"mar.":3,"apr.":4,"jún.":6,"júl.":7,"ág.":8,"ágú.":8,"sep.":9,"sept.":9,"okt.":10,"nóv.":11,"des.":12,jan:1,feb:2,mar:3,apr:4,jún:6,júl:7,ág:8,ágú:8,sep:9,sept:9,okt:10,nóv:11,des:12},h=new Set([`Ágúst`]),g=[0,31,29,31,30,31,30,31,31,30,31,30,31],_=new Set([`kl`,`kl.`,`klukkan`]),ee={eitt:[1,0,0],tvö:[2,0,0],þrjú:[3,0,0],fjögur:[4,0,0],fimm:[5,0,0],sex:[6,0,0],sjö:[7,0,0],átta:[8,0,0],níu:[9,0,0],tíu:[10,0,0],ellefu:[11,0,0],tólf:[12,0,0],hálfeitt:[12,30,0],hálftvö:[1,30,0],hálfþrjú:[2,30,0],hálffjögur:[3,30,0],hálffimm:[4,30,0],hálfsex:[5,30,0],hálfsjö:[6,30,0],hálfátta:[7,30,0],hálfníu:[8,30,0],hálftíu:[9,30,0],hálfellefu:[10,30,0],hálftólf:[11,30,0]},v=new Set([`e.Kr`,`e.Kr.`]),y=new Set([`f.Kr`,`f.Kr.`]);new Set([...v,...y]);const b={$:`USD`,"€":`EUR`,"£":`GBP`,"¥":`JPY`,"₽":`RUB`},x=new 
Set([`ISK`,`DKK`,`NOK`,`SEK`,`GBP`,`USD`,`EUR`,`CAD`,`AUD`,`CHF`,`JPY`,`PLN`,`RUB`,`CZK`,`INR`,`CNY`,`RMB`,`HKD`,`NZD`,`SGD`,`MXN`,`ZAR`]),S={"kr.":1,kr:1,krónur:1,"þ.kr.":1e3,"þ.kr":1e3,"þús.kr.":1e3,"þús.kr":1e3,"m.kr.":1e6,"m.kr":1e6,"mkr.":1e6,mkr:1e6,"millj.kr.":1e6,"millj.kr":1e6,"ma.kr.":1e9,"ma.kr":1e9,"mlja.kr.":1e9,"mlja.kr":1e9},C={m:[`m`,1],mm:[`m`,.001],μm:[`m`,1e-6],cm:[`m`,.01],sm:[`m`,.01],km:[`m`,1e3],ft:[`m`,.3048],mi:[`m`,1609.34],"m²":[`m²`,1],fm:[`m²`,1],"km²":[`m²`,1e6],"cm²":[`m²`,.01],ha:[`m²`,1e4],"m³":[`m³`,1],"cm³":[`m³`,1e-6],"km³":[`m³`,1e9],l:[`m³`,.001],ltr:[`m³`,.001],dl:[`m³`,1e-4],cl:[`m³`,1e-5],ml:[`m³`,1e-6],gal:[`m³`,.00378541],bbl:[`m³`,.158987294928],K:[`K`,1],"°K":[`K`,1],g:[`kg`,.001],gr:[`kg`,.001],kg:[`kg`,1],t:[`kg`,1e3],mg:[`kg`,1e-6],μg:[`kg`,1e-9],tn:[`kg`,1e3],lb:[`kg`,.453592],s:[`s`,1],ms:[`s`,.001],μs:[`s`,1e-6],klst:[`s`,3600],mín:[`s`,60],N:[`N`,1],kN:[`N`,1e3],Nm:[`J`,1],J:[`J`,1],kJ:[`J`,1e3],MJ:[`J`,1e6],GJ:[`J`,1e9],TJ:[`J`,0xe8d4a51000],kWh:[`J`,36e5],MWh:[`J`,36e8],kWst:[`J`,36e5],MWst:[`J`,36e8],kcal:[`J`,4184],cal:[`J`,4.184],W:[`W`,1],mW:[`W`,.001],kW:[`W`,1e3],MW:[`W`,1e6],GW:[`W`,1e9],TW:[`W`,0xe8d4a51000],V:[`V`,1],mV:[`V`,.001],kV:[`V`,1e3],A:[`A`,1],mA:[`A`,.001],Hz:[`Hz`,1],kHz:[`Hz`,1e3],MHz:[`Hz`,1e6],GHz:[`Hz`,1e9],Pa:[`Pa`,1],hPa:[`Pa`,100],"°":[`°`,1],"%":[`%`,1],"‰":[`‰`,.1]},w=new Set(Object.keys(C));function T(){let e=Object.keys(C).sort((e,t)=>t.length-e.length).map(e=>{let t=e.replace(/[.*+?^${}()|[\]\\]/g,`\\$&`);return e[e.length-1].match(/[a-zA-Z]/)?`${t}(?!\\w)`:t});return RegExp(`^(${e.join(`|`)})`,`u`)}const E=T();function te(){let e=Object.keys(b).sort((e,t)=>t.length-e.length).map(e=>e.replace(/[.*+?^${}()|[\]\\]/g,`\\$&`));return RegExp(`^(${e.join(`|`)})`,`u`)}te();function ne(){let e=[...Object.keys(C),...Object.keys(b)].sort((e,t)=>t.length-e.length).map(e=>{let t=e.replace(/[.*+?^${}()|[\]\\]/g,`\\$&`);return e[e.length-1].match(/[a-zA-Z]/)?`${t}(?!\\w)`:t});return 
RegExp(`(${e.join(`|`)})$`)}ne();const re=/^(\d{1,2}):(\d{2}):(\d{2}),(\d{2})(?!\d)/,ie=/^(\d{1,2}):(\d{2}):(\d{2})(?!\d)/,ae=/^(\d{1,2}):(\d{2})(?!\d)/,oe=/^(\d{4})[-/](\d{2})[-/](\d{2})(?!\d)/,se=/^(\d{1,2})[./-](\d{1,2})[./-](\d{2,4})(?!\d)/,ce=/^(\d{2})\.(\d{2})(?!\d)/,D=/^(\d{2})[.-](\d{4})(?!\d)/,O=/^(\d+)([a-zA-Z])(?!\w)/u,k=/^[^@\s]+@[^@\s]+(\.[^@\s.,/:;"()%#!?]+)+/,A=/^(https?:\/\/|ftp:\/\/|file:\/\/|mailto:|www\.)/i,j=/^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z]{2,})+/,M=/^#\w+/u,N=/^@[0-9a-z_]+/i,P=/^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$/;function F(e){return P.test(e)}const I=/^\+(\d{1,3})[-\s]?(\d{3})[-\s]?(\d{4})(?!\d)/,L=/^[A-Z][a-z]?\d*(?:[A-Z][a-z]?\d*)+$/,R=/^(\d{6})-?(\d{4})(?!\d)/,z=/^(\d+)-(\d+)(?:-\d+)*(?!\d)/,B=/^(\d{4})[-/](\d{2})[-/](\d{2})[T\s](\d{1,2}):(\d{2}):(\d{2})(?!\d)/,V=/^(\d{4})[-/](\d{2})[-/](\d{2})[T\s](\d{1,2}):(\d{2})(?!\d)/,H=/^\[company[:\s]([^\]]+)\]/i,U=/^\[person[:\s]([^\]]+)\]/i,W=/^\[entity[:\s]([^\]]+)\]/i,G=new Set(`H.He.Li.Be.B.C.N.O.F.Ne.Na.Mg.Al.Si.P.S.Cl.Ar.K.Ca.Sc.Ti.V.Cr.Mn.Fe.Co.Ni.Cu.Zn.Ga.Ge.As.Se.Br.Kr.Rb.Sr.Y.Zr.Nb.Mo.Tc.Ru.Rh.Pd.Ag.Cd.In.Sn.Sb.Te.I.Xe.Cs.Ba.La.Ce.Pr.Nd.Pm.Sm.Eu.Gd.Tb.Dy.Ho.Er.Tm.Yb.Lu.Hf.Ta.W.Re.Os.Ir.Pt.Au.Hg.Tl.Pb.Bi.Po.At.Rn.Fr.Ra.Ac.Th.Pa.U.Np.Pu.Am.Cm.Bk.Cf.Es.Fm.Md.No.Lr.Rf.Db.Sg.Bh.Hs.Mt.Ds.Rg.Cn.Nh.Fl.Mc.Lv.Ts.Og`.split(`.`));function le(e){if(e.length!==10||!/^\d{10}$/.test(e))return!1;let t=parseInt(e[0],10),n=parseInt(e[1],10),r=parseInt(e[2],10),i=parseInt(e[3],10),a=parseInt(e[4],10),o=parseInt(e[5],10),s=parseInt(e[6],10),c=parseInt(e[7],10),l=parseInt(e[8],10),u=parseInt(e[9],10);if(u!==9&&u!==0)return!1;let d=t*10+n,f=r*10+i,p=d>40?d-40:d;if(p<1||p>31||f<1||f>12)return!1;let m=(3*t+2*n+7*r+6*i+5*a+4*o+3*s+2*c)%11,h=m===0?0:11-m;return h===10?!1:l===h}function K(e){if(!L.test(e))return!1;let t=/([A-Z][a-z]?)(\d*)/g,n,r=0;for(;(n=t.exec(e))!==null;){let e=n[1];if(!G.has(e))return!1;r++}return r>=2}function q(t){let n=t;for(let[t,r]of 
Object.entries(e))n=n.replaceAll(t,r);return n}function J(e,t,n){return!(e<1776||e>2100||t<1||t>12||n<1||n>g[t]||t===2&&n===29&&!(e%4==0&&e%100!=0||e%400==0))}function ue(e){return i.has(e)?`left`:a.has(e)?`right`:o.has(e)?`none`:`center`}function Y(e,t){let n=t??e;return{kind:`punctuation`,text:e,normalized:n,position:n.length===1?ue(n):`center`}}function X(e){let t=B.exec(e);if(t){let e=parseInt(t[1],10),n=parseInt(t[2],10),r=parseInt(t[3],10),i=parseInt(t[4],10),a=parseInt(t[5],10),o=parseInt(t[6],10);if(J(e,n,r)&&i>=0&&i<24&&a>=0&&a<60&&o>=0&&o<60)return[{kind:`timestamp`,text:t[0],year:e,month:n,day:r,hour:i,minute:a,second:o},t[0].length]}if(t=V.exec(e),t){let e=parseInt(t[1],10),n=parseInt(t[2],10),r=parseInt(t[3],10),i=parseInt(t[4],10),a=parseInt(t[5],10);if(J(e,n,r)&&i>=0&&i<24&&a>=0&&a<60)return[{kind:`timestamp`,text:t[0],year:e,month:n,day:r,hour:i,minute:a,second:0},t[0].length]}if(t=re.exec(e),t){let e=parseInt(t[1],10),n=parseInt(t[2],10),r=parseInt(t[3],10);if(e>=0&&e<24&&n>=0&&n<60&&r>=0&&r<60)return[{kind:`time`,text:t[0],hour:e,minute:n,second:r},t[0].length]}if(t=ie.exec(e),t){let e=parseInt(t[1],10),n=parseInt(t[2],10),r=parseInt(t[3],10);if(e>=0&&e<24&&n>=0&&n<60&&r>=0&&r<60)return[{kind:`time`,text:t[0],hour:e,minute:n,second:r},t[0].length]}if(t=ae.exec(e),t){let e=parseInt(t[1],10),n=parseInt(t[2],10);if(e>=0&&e<24&&n>=0&&n<60)return[{kind:`time`,text:t[0],hour:e,minute:n,second:0},t[0].length]}if(t=oe.exec(e),t){let e=parseInt(t[1],10),n=parseInt(t[2],10),r=parseInt(t[3],10);if(J(e,n,r))return[{kind:`date`,text:t[0],year:e,month:n,day:r},t[0].length]}if(t=R.exec(e),t){let e=t[1]+t[2];if(le(e))return[{kind:`ssn`,text:t[0],value:e},t[0].length]}if(t=z.exec(e),t)return[{kind:`serialnumber`,text:t[0]},t[0].length];let n=e.match(/^(\d{7})(?!\d)/);if(n){let e=n[1];return[{kind:`telno`,text:n[0],cc:``,number:e},n[0].length]}if(t=se.exec(e),t){let 
e=parseInt(t[1],10),n=parseInt(t[2],10),r=parseInt(t[3],10);if(r<=99&&(r+=r>50?1900:2e3),n>12&&e<=12&&([e,n]=[n,e]),J(r,n,e))return[{kind:`date`,text:t[0],year:r,month:n,day:e},t[0].length]}if(t=ce.exec(e),t){let e=parseInt(t[1],10),n=parseInt(t[2],10);if(n>=1&&n<=12&&e>=1&&e<=g[n])return[{kind:`daterel`,text:t[0],year:0,month:n,day:e},t[0].length]}if(t=D.exec(e),t){let e=parseInt(t[1],10),n=parseInt(t[2],10);if(n>=1776&&n<=2100&&e>=1&&e<=12)return[{kind:`daterel`,text:t[0],year:n,month:e,day:0},t[0].length]}if(t=O.exec(e),t){let e=t[2];if(!w.has(e)){let n=parseInt(t[1],10);return[{kind:`numwletter`,text:t[0],value:n,letter:e},t[0].length]}}let r=e.match(/^([-+]?\d+(?:\.\d{3})*(?:,\d+)?)/);if(r){let t=r[1],n=e.slice(t.length),i=E.exec(n);if(i){let e=i[1],n=t+e,r=parseFloat(t.replace(/\./g,``).replace(`,`,`.`));if(e in b)return[{kind:`amount`,text:n,value:r,currency:b[e]},n.length];let[a]=C[e];return e===`%`||e===`‰`?[{kind:`percent`,text:n,value:r},n.length]:[{kind:`measurement`,text:n,value:r,unit:a},n.length]}}let i=e.match(/^([-+]?\d+(?:\.\d{3})*(?:,\d+)?)(?!\d)/);if(i&&i[1].includes(`,`)){let e=parseFloat(i[1].replace(/\./g,``).replace(`,`,`.`));return[{kind:`number`,text:i[1],value:e},i[1].length]}let a=e.match(/^([-+]?\d+(?:,\d{3})*(?:\.\d+)?)(?!\d)/);if(a&&(a[1].includes(`,`)||a[1].includes(`.`))){let e=parseFloat(a[1].replace(/,/g,``));return[{kind:`number`,text:a[1],value:e},a[1].length]}let o=e.match(/^([-+]?\d+)(?!\d)/);if(o){let e=parseInt(o[1],10);return[{kind:`number`,text:o[1],value:e},o[1].length]}return[{kind:`unknown`,text:e[0]},1]}function*de(e){if(!e){yield{kind:`s_split`,text:null};return}if(/^[\p{L}]+$/u.test(e)||w.has(e)){if(K(e)){yield{kind:`molecule`,text:e};return}yield{kind:`word`,text:e};return}if(e.startsWith(`+`)&&e.length>=10){let t=I.exec(e);if(t){let n=t[1],r=t[2]+t[3];if(yield{kind:`telno`,text:t[0],cc:n,number:r},e=e.slice(t[0].length),!e)return}}if(e.length>1&&p.has(e[0])&&f.has(e[1])){let[t,n]=X(e);if(yield 
t,e=e.slice(n),!e)return}if(e.length>1&&`-–`.includes(e[0])&&/\p{L}/u.test(e[1])){let t=2;for(;t<e.length&&/\p{L}/u.test(e[t]);)t++;let n=e.slice(0,t);(n.slice(1).toLowerCase()===n.slice(1)||t>2&&n.slice(1).toUpperCase()===n.slice(1))&&(yield{kind:`word`,text:n},e=e.slice(t))}if(e.length>=3){if(c.includes(e[0])&&c.includes(e[e.length-1])){let t=e.slice(1,-1);if(/^[\p{L}]+$/u.test(t)){yield Y(e[0],`„`),yield{kind:`word`,text:t},yield Y(e[e.length-1],`“`);return}}if(s.includes(e[0])&&s.includes(e[e.length-1])){let t=e.slice(1,-1);if(/^[\p{L}]+$/u.test(t)){yield Y(e[0],`‚`),yield{kind:`word`,text:t},yield Y(e[e.length-1],`’`);return}}}for(e.length>1&&(c.includes(e[0])?(yield Y(e[0],`„`),e=e.slice(1)):s.includes(e[0])&&(yield Y(e[0],`‚`),e=e.slice(1)));e;){for(;e&&r.has(e[0]);){if(e.startsWith(`[`)){let t=H.exec(e);if(t){yield{kind:`company`,text:t[1]},e=e.slice(t[0].length);continue}let n=U.exec(e);if(n){yield{kind:`person`,text:n[1]},e=e.slice(n[0].length);continue}let r=W.exec(e);if(r){yield{kind:`entity`,text:r[1]},e=e.slice(r[0].length);continue}}if(e.startsWith(`[...]`)){yield Y(`[...]`,`[…]`),e=e.slice(5);continue}if(e.startsWith(`[…]`)){yield Y(`[…]`),e=e.slice(3);continue}if(e.startsWith(`...`)){let t=`...`,n=e.slice(3);for(;n.startsWith(`.`);)t+=`.`,n=n.slice(1);yield Y(t,`…`),e=n;continue}if(e.startsWith(`…`)){yield Y(`…`),e=e.slice(1);continue}if(e===`,,`){yield Y(`,,`,`,`),e=``;continue}if(e.startsWith(`,,`)){yield Y(`,,`,`„`),e=e.slice(2);continue}if(e===`[[`||e===`]]`){yield Y(e),e=``;continue}if(t.includes(e[0])){yield Y(e[0],`-`),e=e.slice(1);continue}if(c.includes(e[0])){yield Y(e[0],`“`),e=e.slice(1);continue}if(s.includes(e[0])){yield Y(e[0],`’`),e=e.slice(1);continue}if(e.startsWith(`#`)&&e.length>1){let t=M.exec(e);if(t){/^#\d+$/.test(t[0])?yield{kind:`ordinal`,text:t[0],value:parseInt(t[0].slice(1),10)}:yield{kind:`hashtag`,text:t[0]},e=e.slice(t[0].length);continue}}if(e.startsWith(`@`)&&e.length>1){let 
t=N.exec(e);if(t){yield{kind:`username`,text:t[0],username:t[0].slice(1)},e=e.slice(t[0].length);continue}}if(e.startsWith(`+`)&&e.length>1&&f.has(e[1])){let t=I.exec(e);if(t){let n=t[1],r=t[2]+t[3];yield{kind:`telno`,text:t[0],cc:n,number:r},e=e.slice(t[0].length);continue}}yield Y(e[0]),e=e.slice(1)}if(!e)break;if(e.includes(`@`)){let t=k.exec(e);if(t){yield{kind:`email`,text:t[0]},e=e.slice(t[0].length);continue}}if(A.test(e)){let t=e,n=``;for(;t&&a.has(t[t.length-1]);)n=t[t.length-1]+n,t=t.slice(0,-1);yield{kind:`url`,text:t},e=n;continue}if(e.length>=4&&/^[a-zA-Z0-9]/.test(e)&&e.includes(`.`)){let t=j.exec(e);if(t){let n=t[0],i=e.slice(n.length);for(;n&&r.has(n[n.length-1]);)i=n[n.length-1]+i,n=n.slice(0,-1);if(n.includes(`.`)){yield{kind:`domain`,text:n},e=i;continue}}}if(f.has(e[0])||p.has(e[0])&&e.length>1&&f.has(e[1])){let[t,n]=X(e);if(yield t,e=e.slice(n),e){let t=E.exec(e);t&&(yield{kind:`word`,text:t[1]},e=e.slice(t[1].length))}continue}if(/^\p{L}/u.test(e)){let t=1,n=new Set([`.`,`'`,`'`,`´`,`'`,`-`,`–`]),r=new Set([`'`,`²`,`³`]);for(;t<e.length;)if(/\p{L}/u.test(e[t]))t++;else if(f.has(e[t]))t++;else if(n.has(e[t])&&t+1<e.length&&/\p{L}/u.test(e[t+1]))t++;else break;t<e.length&&r.has(e[t])&&t++;let i=e.slice(0,t);if(K(i)){yield{kind:`molecule`,text:i},e=e.slice(t);continue}yield{kind:`word`,text:i},e=e.slice(t);continue}yield{kind:`unknown`,text:e[0]},e=e.slice(1)}}function*fe(e,t){let n=(t?q(e):e).split(/\n\s*\n/),r=!0;for(let e of n){r||(yield``),r=!1;for(let t of e.split(/\s+/))t&&(yield t)}}function pe(e,t=!0){let n=[];for(let r of fe(e,t))for(let e of de(r))n.push(e);return n}const me={hr:`herra`,"hr.":`herra`,frú:`frú`,"frú.":`frú`,sr:`séra`,"sr.":`séra`,dr:`doktor`,"dr.":`doktor`,prof:`prófessor`,"prof.":`prófessor`,hf:`hlutafélag`,"hf.":`hlutafélag`,ehf:`einkahlutafélag`,"ehf.":`einkahlutafélag`,ohf:`opinbert hlutafélag`,"ohf.":`opinbert 
hlutafélag`,sf:`sameignarfélag`,"sf.":`sameignarfélag`,slf:`samlagsfélag`,"slf.":`samlagsfélag`,ses:`sjálfseignarstofnun`,"ses.":`sjálfseignarstofnun`,ofl:`og fleiri`,"o.fl.":`og fleiri`,osfrv:`og svo framvegis`,"o.s.frv.":`og svo framvegis`,oþh:`og þess háttar`,"o.þ.h.":`og þess háttar`,þe:`það er`,"þ.e.":`það er`,þea:`það er að segja`,"þ.e.a.s.":`það er að segja`,sbr:`samanber`,"sbr.":`samanber`,skv:`samkvæmt`,"skv.":`samkvæmt`,mtt:`með tilliti til`,"m.t.t.":`með tilliti til`,ath:`athugasemd`,"ath.":`athugasemd`,gr:`grein`,"gr.":`grein`,mgr:`málsgrein`,"mgr.":`málsgrein`,tölul:`töluliður`,"tölul.":`töluliður`,nr:`númer`,"nr.":`númer`,sl:`síðastliðinn`,"sl.":`síðastliðinn`,nk:`næstkomandi`,"n.k.":`næstkomandi`,kl:`klukkan`,"kl.":`klukkan`,ca:`circa`,"ca.":`circa`,bs:`Bachelor of Science`,"B.S.":`Bachelor of Science`,ms:`Master of Science`,"M.S.":`Master of Science`,ba:`Bachelor of Arts`,"B.A.":`Bachelor of Arts`,"M.A.":`Master of Arts`,phd:`Doctor of Philosophy`,"Ph.D.":`Doctor of Philosophy`,mba:`Master of Business Administration`,MBA:`Master of Business Administration`,Rvk:`Reykjavík`,"Rvk.":`Reykjavík`,Akr:`Akranes`,"Akr.":`Akranes`,Ak:`Akureyri`,"Ak.":`Akureyri`,n:`norður`,"n.":`norður`,s:`suður`,"s.":`suður`,a:`austur`,"a.":`austur`,v:`vestur`,"v.":`vestur`,na:`norðaustur`,"n.a.":`norðaustur`,nv:`norðvestur`,"n.v.":`norðvestur`,sa:`suðaustur`,"s.a.":`suðaustur`,sv:`suðvestur`,"s.v.":`suðvestur`,þús:`þúsund`,"þús.":`þúsund`,millj:`milljón`,"millj.":`milljón`,mljó:`milljón`,"mljó.":`milljón`,ma:`milljarður`,"ma.":`milljarður`,mrð:`milljarður`,"mrð.":`milljarður`},he=new Set([`o.fl`,`o.s.frv`,`o.þ.h`,`þ.e`,`þ.e.a.s`,`m.t.t`,`n.k`]);function ge(e){let t=[],n=0;for(;n<e.length;){let r=e[n],i=e[n+1];if(r.kind===`word`&&i?.kind===`punctuation`&&i.text===`.`){let e=r.text+`.`;if(e in me||e in S){t.push({kind:`word`,text:e}),n+=2;continue}}if(r.kind===`punctuation`&&r.text in b&&i?.kind===`number`){let 
e=b[r.text];t.push({kind:`amount`,text:r.text+i.text,value:i.value,currency:e}),n+=2;continue}if(r.kind===`number`&&i?.kind===`word`){let e=i.text;if(x.has(e)){t.push({kind:`amount`,text:r.text+` `+i.text,value:r.value,currency:e}),n+=2;continue}if(e in S){let a=S[e];t.push({kind:`amount`,text:r.text+` `+i.text,value:r.value*a,currency:`ISK`}),n+=2;continue}}if(r.kind===`number`&&i?.kind===`word`&&[`prósent`,`prósenta`,`prósenti`,`hundraðshluti`].includes(i.text.toLowerCase())){t.push({kind:`percent`,text:r.text+` `+i.text,value:r.value}),n+=2;continue}if((r.kind===`date`||r.kind===`dateabs`)&&i?.kind===`time`){t.push({kind:`timestamp`,text:r.text+` `+i.text,year:r.year,month:r.month,day:r.day,hour:i.hour,minute:i.minute,second:i.second}),n+=2;continue}t.push(r),n++}return t}function _e(e){if(e.kind===`s_end`||e.kind===`s_split`)return!0;if(e.kind===`word`&&e.text.length>0){let t=e.text[0];if(t===t.toUpperCase()&&t!==t.toLowerCase())return!(e.text.toLowerCase()in m||F(e.text)||x.has(e.text))}return!1}function ve(e){if(e.length===0)return[];let t=[],n=!1,r=0,i=()=>({kind:`s_begin`,text:null}),a=()=>({kind:`s_end`,text:null});for(;r<e.length;){let o=e[r],s=e[r+1];if(o.kind===`s_split`){n&&=(t.push(a()),!1),r++;continue}if(n||=(t.push(i()),!0),o.kind===`punctuation`&&l.has(o.normalized)){if(o.normalized===`…`&&s&&!_e(s)){t.push(o),r++;continue}let i=o.text,c=r+1;for(;c<e.length;){let t=e[c];if(t.kind!==`punctuation`||!d.has(t.normalized))break;i+=t.text,c++}for(c>r+1?(t.push({...o,text:i}),r=c):(t.push(o),r++);r<e.length;){let n=e[r];if(n.kind!==`punctuation`||!u.has(n.normalized))break;t.push(n),r++}t.push(a()),n=!1;continue}t.push(o),r++}return n&&t.push(a()),t}const ye=new Set([`-`,`–`]);function be(e,t=!1){return e.kind!==`word`||!t&&h.has(e.text)?null:m[e.text.toLowerCase()]??null}function xe(e){return e.kind===`punctuation`&&ye.has(e.text)}function Se(e,t){let n=[],r=t;for(;r<e.length;){let 
t=e[r],i=e[r+1];if(t?.kind!==`word`||!i||!xe(i))break;n.push(t),n.push(i),r+=2;let a=e[r];a?.kind===`punctuation`&&a.text===`,`&&(n.push(a),r++)}if(n.length===0)return null;let i=e[r];if(!i||i.kind!==`word`||i.text.toLowerCase()!==`og`&&i.text.toLowerCase()!==`eða`)return null;let a=e[r+1];if(!a||a.kind!==`word`)return null;let o=[...n,i,a].map(e=>e.text).join(` `);return o=o.replace(/ -/g,`-`).replace(/ ,/g,`,`),[{kind:`word`,text:o},r+2]}function Z(e){let t=[],n=0;for(;n<e.length;){let r=e[n],i=e[n+1],a=Se(e,n);if(a){t.push(a[0]),n=a[1];continue}if(r.kind===`word`&&i?.kind===`punctuation`&&i.text===`.`){let e=r.text.replace(/\.$/,``);if(he.has(e)){t.push({kind:`word`,text:r.text+`.`}),n+=2;continue}}if((r.kind===`year`||r.kind===`number`)&&i?.kind===`word`){let a=(r.kind,r.value),o=null;if(y.has(i.text)?o=-a:v.has(i.text)&&(o=a),o!==null){let a=r.text+` `+i.text;n+=2,e[n]?.kind===`punctuation`&&e[n].text===`.`&&(a+=`.`,n++),t.push({kind:`year`,text:a,value:o});continue}}if((r.kind===`ordinal`||r.kind===`number`)&&i?.kind===`word`){let e=be(i,!0);if(e!==null){let a=(r.kind,r.value);t.push({kind:`daterel`,text:r.text+` `+i.text,year:0,month:e,day:a}),n+=2;continue}}if((r.kind===`date`||r.kind===`daterel`)&&r.year===0&&i?.kind===`number`){let e=i.value;if(e>=1776&&e<=2100){t.push({kind:`dateabs`,text:r.text+` `+i.text,year:e,month:r.month,day:r.day}),n+=2;continue}}if((r.kind===`date`||r.kind===`daterel`)&&r.year===0&&i?.kind===`year`){t.push({kind:`dateabs`,text:r.text+` `+i.text,year:i.value,month:r.month,day:r.day}),n+=2;continue}if(r.kind===`word`&&_.has(r.text.toLowerCase())&&i?.kind===`time`){t.push({...i,text:r.text+` `+i.text}),n+=2;continue}if(r.kind===`word`&&_.has(r.text.toLowerCase())&&i?.kind===`word`){let e=ee[i.text.toLowerCase()];if(e){t.push({kind:`time`,text:r.text+` `+i.text,hour:e[0],minute:e[1],second:e[2]}),n+=2;continue}}t.push(r),n++}return t}function Q(e,t={}){let{replaceCompositeGlyphs:n=!0,includeSentenceMarkers:r=!1}=t,i=pe(e,n);return 
i=ge(i),i=Z(i),i=r?ve(i):i.filter(e=>e.kind!==`s_split`),i}function Ce(e){let t=Q(e,{includeSentenceMarkers:!0}),n=[],r=[];for(let e of t)e.kind===`s_begin`?r=[]:e.kind===`s_end`?(r.length>0&&n.push($(r)),r=[]):e.text!==null&&r.push(we(e));return r.length>0&&n.push($(r)),n}function we(e){return e.kind===`punctuation`?e.normalized:e.text??``}function $(e){if(e.length===0)return``;let t=e[0];for(let n=1;n<e.length;n++){let r=e[n-1],i=e[n];Te(r,i)?t+=` `+i:t+=i}return t}function Te(e,t){if(!e||!t)return!1;let n=e[e.length-1],r=t[0];return!(new Set([`(`,`[`,`„`,`‚`,`«`,`<`]).has(n)||new Set([`.`,`,`,`;`,`:`,`!`,`?`,`)`,`]`,`“`,`’`,`»`,`>`,`…`]).has(r)||n===`-`||r===`-`)}export{q as normalizeUnicode,Ce as splitIntoSentences,Q as tokenize};
2
+ //# sourceMappingURL=index.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.mjs","names":["COMPOSITE_HYPHENS","COMPOSITE_HYPHENS"],"sources":["../src/data/constants.ts","../src/data/units.ts","../src/data/patterns.ts","../src/pipeline/lexer.ts","../src/data/abbreviations.ts","../src/pipeline/particles.ts","../src/pipeline/sentences.ts","../src/pipeline/phrases.ts","../src/tokenize.ts","../src/split-sentences.ts"],"sourcesContent":["/**\n * Unicode replacements for composite glyphs\n */\nexport const UNICODE_REPLACEMENTS: Record<string, string> = {\n // Vowel + combining acute accent (U+0301)\n \"a\\u0301\": \"\\u00E1\", // á\n \"e\\u0301\": \"\\u00E9\", // é\n \"i\\u0301\": \"\\u00ED\", // í\n \"o\\u0301\": \"\\u00F3\", // ó\n \"u\\u0301\": \"\\u00FA\", // ú\n \"y\\u0301\": \"\\u00FD\", // ý\n \"A\\u0301\": \"\\u00C1\", // Á\n \"E\\u0301\": \"\\u00C9\", // É\n \"I\\u0301\": \"\\u00CD\", // Í\n \"O\\u0301\": \"\\u00D3\", // Ó\n \"U\\u0301\": \"\\u00DA\", // Ú\n \"Y\\u0301\": \"\\u00DD\", // Ý\n // Vowel + combining diaeresis (U+0308)\n \"a\\u0308\": \"\\u00E4\", // ä\n \"e\\u0308\": \"\\u00EB\", // ë\n \"o\\u0308\": \"\\u00F6\", // ö\n \"u\\u0308\": \"\\u00FC\", // ü\n \"A\\u0308\": \"\\u00C4\", // Ä\n \"E\\u0308\": \"\\u00CB\", // Ë\n \"O\\u0308\": \"\\u00D6\", // Ö\n \"U\\u0308\": \"\\u00DC\", // Ü\n // Remove unwanted characters\n \"\\u00AD\": \"\", // Soft hyphen\n \"\\u200B\": \"\", // Zero-width space\n \"\\uFEFF\": \"\", // Zero-width nbsp (BOM)\n};\n\n/**\n * Hyphen characters\n */\nexport const HYPHEN = \"-\";\nexport const EN_DASH = \"\\u2013\"; // –\nexport const EM_DASH = \"\\u2014\"; // —\nexport const HYPHENS = HYPHEN + EN_DASH + EM_DASH;\nexport const COMPOSITE_HYPHENS = HYPHEN + EN_DASH;\n\n/**\n * Punctuation character sets\n * Using Unicode escapes to avoid parsing issues\n */\n// Left: ( [ „ ‚ « # $ € £ ¥ ₽ <\nexport const LEFT_PUNCTUATION = \"([\\u201E\\u201A\\u00AB#$\\u20AC\\u00A3\\u00A5\\u20BD<\";\n// Right: . , : ; ) ] ! % ‰ ? 
» \" ' ‛ ' … > °\nexport const RIGHT_PUNCTUATION = \".,:;)]!%\\u2030?\\u00BB\\u201C\\u2019\\u201B\\u2018\\u2026>\\u00B0\";\n// Center: \" * • & + = @ © |\nexport const CENTER_PUNCTUATION = '\"*\\u2022&+=@\\u00A9|';\n// None: ^ / ± ' ´ ~ \\ -\nexport const NONE_PUNCTUATION = \"^/\\u00B1'\\u00B4~\\\\\" + HYPHENS;\nexport const PUNCTUATION =\n LEFT_PUNCTUATION + CENTER_PUNCTUATION + RIGHT_PUNCTUATION + NONE_PUNCTUATION;\n\nexport const PUNCTUATION_SET = new Set(PUNCTUATION);\nexport const LEFT_PUNCTUATION_SET = new Set(LEFT_PUNCTUATION);\nexport const RIGHT_PUNCTUATION_SET = new Set(RIGHT_PUNCTUATION);\nexport const NONE_PUNCTUATION_SET = new Set(NONE_PUNCTUATION);\n\n/**\n * Quote characters\n */\n// Single quotes: ' ‚ ‛ ' ´\nexport const SINGLE_QUOTES = \"'\\u201A\\u201B\\u2019\\u00B4\";\n// Double quotes: \" \" „ \" « »\nexport const DOUBLE_QUOTES = '\"\\u201C\\u201E\\u201D\\u00AB\\u00BB';\n\n/**\n * Normalized quote characters (for output)\n */\nexport const OPEN_DOUBLE_QUOTE = \"\\u201E\"; // „\nexport const CLOSE_DOUBLE_QUOTE = \"\\u201C\"; // \"\nexport const OPEN_SINGLE_QUOTE = \"\\u201A\"; // ‚\nexport const CLOSE_SINGLE_QUOTE = \"\\u2019\"; // '\n\n/**\n * Sentence-ending punctuation\n */\nexport const END_OF_SENTENCE = new Set([\".\", \"?\", \"!\", \"\\u2026\"]); // … = ellipsis\nexport const SENTENCE_FINISHERS = new Set([\n \")\",\n \"]\",\n \"\\u201C\", // \"\n \"\\u00BB\", // »\n \"\\u201D\", // \"\n \"\\u2019\", // '\n '\"',\n \"[\\u2026]\", // […]\n]);\n\n/**\n * Punctuation that may occur inside words\n */\nexport const PUNCT_INSIDE_WORD = new Set([\n \".\",\n \"'\",\n \"\\u2019\", // '\n \"\\u00B4\", // ´\n \"\\u2018\", // '\n HYPHEN,\n EN_DASH,\n]);\nexport const PUNCT_ENDING_WORD = new Set([\"'\", \"\\u00B2\", \"\\u00B3\"]); // ² ³\nexport const PUNCT_COMBINATIONS = new Set([\"?\", \"!\", \"\\u2026\"]); // …\n\n/**\n * Digit-related sets\n */\nexport const DIGITS = new Set(\"0123456789\");\nexport const SIGN_PREFIX = new Set([\"+\", 
\"-\"]);\n\n/**\n * Icelandic month names to month numbers\n */\nexport const MONTHS: Record<string, number> = {\n \"jan\\u00FAar\": 1, // janúar\n \"jan\\u00FAars\": 1, // janúars\n \"febr\\u00FAar\": 2, // febrúar\n \"febr\\u00FAars\": 2, // febrúars\n mars: 3,\n \"apr\\u00EDl\": 4, // apríl\n \"apr\\u00EDls\": 4, // apríls\n \"ma\\u00ED\": 5, // maí\n \"ma\\u00EDs\": 5, // maís\n \"j\\u00FAn\\u00ED\": 6, // júní\n \"j\\u00FAn\\u00EDs\": 6, // júnís\n \"j\\u00FAl\\u00ED\": 7, // júlí\n \"j\\u00FAl\\u00EDs\": 7, // júlís\n \"\\u00E1g\\u00FAst\": 8, // ágúst\n \"\\u00E1g\\u00FAsts\": 8, // ágústs\n september: 9,\n septembers: 9,\n \"okt\\u00F3ber\": 10, // október\n \"okt\\u00F3bers\": 10, // októbers\n \"n\\u00F3vember\": 11, // nóvember\n \"n\\u00F3vembers\": 11, // nóvembers\n desember: 12,\n desembers: 12,\n // Abbreviated forms\n \"jan.\": 1,\n \"feb.\": 2,\n \"mar.\": 3,\n \"apr.\": 4,\n \"j\\u00FAn.\": 6, // jún.\n \"j\\u00FAl.\": 7, // júl.\n \"\\u00E1g.\": 8, // ág.\n \"\\u00E1g\\u00FA.\": 8, // ágú.\n \"sep.\": 9,\n \"sept.\": 9,\n \"okt.\": 10,\n \"n\\u00F3v.\": 11, // nóv.\n \"des.\": 12,\n jan: 1,\n feb: 2,\n mar: 3,\n apr: 4,\n \"j\\u00FAn\": 6, // jún\n \"j\\u00FAl\": 7, // júl\n \"\\u00E1g\": 8, // ág\n \"\\u00E1g\\u00FA\": 8, // ágú\n sep: 9,\n sept: 9,\n okt: 10,\n \"n\\u00F3v\": 11, // nóv\n des: 12,\n};\n\n/**\n * Month name blacklist (Ágúst is also a masculine name)\n */\nexport const MONTH_BLACKLIST = new Set([\"\\u00C1g\\u00FAst\"]); // Ágúst\n\n/**\n * Max days in each month (index 0 unused, 1=January)\n */\nexport const DAYS_IN_MONTH = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];\n\n/**\n * Clock abbreviations\n */\nexport const CLOCK_ABBREVS = new Set([\"kl\", \"kl.\", \"klukkan\"]);\n\n/**\n * Time expressions spelled out in Icelandic\n */\nexport const CLOCK_NUMBERS: Record<string, [number, number, number]> = {\n eitt: [1, 0, 0],\n \"tv\\u00F6\": [2, 0, 0], // tvö\n \"\\u00FErj\\u00FA\": [3, 0, 0], // þrjú\n \"fj\\u00F6gur\": [4, 
0, 0], // fjögur\n fimm: [5, 0, 0],\n sex: [6, 0, 0],\n \"sj\\u00F6\": [7, 0, 0], // sjö\n \"\\u00E1tta\": [8, 0, 0], // átta\n \"n\\u00EDu\": [9, 0, 0], // níu\n \"t\\u00EDu\": [10, 0, 0], // tíu\n ellefu: [11, 0, 0],\n \"t\\u00F3lf\": [12, 0, 0], // tólf\n \"h\\u00E1lfeitt\": [12, 30, 0], // hálfeitt\n \"h\\u00E1lftv\\u00F6\": [1, 30, 0], // hálftvö\n \"h\\u00E1lf\\u00FErj\\u00FA\": [2, 30, 0], // hálfþrjú\n \"h\\u00E1lffj\\u00F6gur\": [3, 30, 0], // hálffjögur\n \"h\\u00E1lffimm\": [4, 30, 0], // hálffimm\n \"h\\u00E1lfsex\": [5, 30, 0], // hálfsex\n \"h\\u00E1lfsj\\u00F6\": [6, 30, 0], // hálfsjö\n \"h\\u00E1lf\\u00E1tta\": [7, 30, 0], // hálfátta\n \"h\\u00E1lfn\\u00EDu\": [8, 30, 0], // hálfníu\n \"h\\u00E1lft\\u00EDu\": [9, 30, 0], // hálftíu\n \"h\\u00E1lfellefu\": [10, 30, 0], // hálfellefu\n \"h\\u00E1lft\\u00F3lf\": [11, 30, 0], // hálftólf\n};\n\n/**\n * Before/After Common Era markers\n */\nexport const CE = new Set([\"e.Kr\", \"e.Kr.\"]);\nexport const BCE = new Set([\"f.Kr\", \"f.Kr.\"]);\nexport const CE_BCE = new Set([...CE, ...BCE]);\n\n/**\n * URL prefixes\n */\nexport const URL_PREFIXES = [\"http://\", \"https://\", \"ftp://\", \"file://\", \"mailto:\", \"www.\"];\n","/**\n * Currency symbols to ISO codes\n */\nexport const CURRENCY_SYMBOLS: Record<string, string> = {\n $: \"USD\",\n \"€\": \"EUR\",\n \"£\": \"GBP\",\n \"¥\": \"JPY\",\n \"₽\": \"RUB\",\n};\n\n/**\n * ISO 4217 currency codes\n */\nexport const CURRENCY_ABBREV = new Set([\n \"ISK\",\n \"DKK\",\n \"NOK\",\n \"SEK\",\n \"GBP\",\n \"USD\",\n \"EUR\",\n \"CAD\",\n \"AUD\",\n \"CHF\",\n \"JPY\",\n \"PLN\",\n \"RUB\",\n \"CZK\",\n \"INR\",\n \"CNY\",\n \"RMB\",\n \"HKD\",\n \"NZD\",\n \"SGD\",\n \"MXN\",\n \"ZAR\",\n]);\n\n/**\n * ISK amount abbreviations (króna-specific)\n */\nexport const AMOUNT_ABBREV: Record<string, number> = {\n \"kr.\": 1,\n kr: 1,\n krónur: 1,\n \"þ.kr.\": 1e3,\n \"þ.kr\": 1e3,\n \"þús.kr.\": 1e3,\n \"þús.kr\": 1e3,\n \"m.kr.\": 1e6,\n \"m.kr\": 1e6,\n \"mkr.\": 
1e6,\n mkr: 1e6,\n \"millj.kr.\": 1e6,\n \"millj.kr\": 1e6,\n \"ma.kr.\": 1e9,\n \"ma.kr\": 1e9,\n \"mlja.kr.\": 1e9,\n \"mlja.kr\": 1e9,\n};\n\nexport const ISK_AMOUNT_PRECEDING = new Set([\"kr.\", \"kr\", \"krónur\"]);\n\n/**\n * SI units: unit → [base unit, conversion factor]\n * Conversion factor is number or null (for temperature that needs functions)\n */\nexport const SI_UNITS: Record<string, [string, number]> = {\n // Distance\n m: [\"m\", 1.0],\n mm: [\"m\", 1.0e-3],\n μm: [\"m\", 1.0e-6],\n cm: [\"m\", 1.0e-2],\n sm: [\"m\", 1.0e-2],\n km: [\"m\", 1.0e3],\n ft: [\"m\", 0.3048],\n mi: [\"m\", 1609.34],\n // Area\n \"m²\": [\"m²\", 1.0],\n fm: [\"m²\", 1.0],\n \"km²\": [\"m²\", 1.0e6],\n \"cm²\": [\"m²\", 1.0e-2],\n ha: [\"m²\", 1.0e4],\n // Volume\n \"m³\": [\"m³\", 1.0],\n \"cm³\": [\"m³\", 1.0e-6],\n \"km³\": [\"m³\", 1.0e9],\n l: [\"m³\", 1.0e-3],\n ltr: [\"m³\", 1.0e-3],\n dl: [\"m³\", 1.0e-4],\n cl: [\"m³\", 1.0e-5],\n ml: [\"m³\", 1.0e-6],\n gal: [\"m³\", 3.78541e-3],\n bbl: [\"m³\", 158.987294928e-3],\n // Temperature\n K: [\"K\", 1.0],\n \"°K\": [\"K\", 1.0],\n // Mass\n g: [\"kg\", 1.0e-3],\n gr: [\"kg\", 1.0e-3],\n kg: [\"kg\", 1.0],\n t: [\"kg\", 1.0e3],\n mg: [\"kg\", 1.0e-6],\n μg: [\"kg\", 1.0e-9],\n tn: [\"kg\", 1.0e3],\n lb: [\"kg\", 0.453592],\n // Duration\n s: [\"s\", 1.0],\n ms: [\"s\", 1.0e-3],\n μs: [\"s\", 1.0e-6],\n klst: [\"s\", 3600.0],\n mín: [\"s\", 60.0],\n // Force\n N: [\"N\", 1.0],\n kN: [\"N\", 1.0e3],\n // Energy\n Nm: [\"J\", 1.0],\n J: [\"J\", 1.0],\n kJ: [\"J\", 1.0e3],\n MJ: [\"J\", 1.0e6],\n GJ: [\"J\", 1.0e9],\n TJ: [\"J\", 1.0e12],\n kWh: [\"J\", 3.6e6],\n MWh: [\"J\", 3.6e9],\n kWst: [\"J\", 3.6e6],\n MWst: [\"J\", 3.6e9],\n kcal: [\"J\", 4184],\n cal: [\"J\", 4.184],\n // Power\n W: [\"W\", 1.0],\n mW: [\"W\", 1.0e-3],\n kW: [\"W\", 1.0e3],\n MW: [\"W\", 1.0e6],\n GW: [\"W\", 1.0e9],\n TW: [\"W\", 1.0e12],\n // Electric potential\n V: [\"V\", 1.0],\n mV: [\"V\", 1.0e-3],\n kV: [\"V\", 1.0e3],\n // Electric 
current\n A: [\"A\", 1.0],\n mA: [\"A\", 1.0e-3],\n // Frequency\n Hz: [\"Hz\", 1.0],\n kHz: [\"Hz\", 1.0e3],\n MHz: [\"Hz\", 1.0e6],\n GHz: [\"Hz\", 1.0e9],\n // Pressure\n Pa: [\"Pa\", 1.0],\n hPa: [\"Pa\", 1.0e2],\n // Angle\n \"°\": [\"°\", 1.0],\n // Percentage\n \"%\": [\"%\", 1.0],\n \"‰\": [\"‰\", 0.1],\n};\n\nexport const SI_UNITS_SET = new Set(Object.keys(SI_UNITS));\n\n/**\n * Build regex for SI units (sorted by length descending)\n */\nfunction buildUnitsRegex(): RegExp {\n const units = Object.keys(SI_UNITS).sort((a, b) => b.length - a.length);\n const patterns = units.map((unit) => {\n const escaped = unit.replace(/[.*+?^${}()|[\\]\\\\]/g, \"\\\\$&\");\n // If unit ends with letter, require word boundary\n return unit[unit.length - 1].match(/[a-zA-Z]/) ? `${escaped}(?!\\\\w)` : escaped;\n });\n return new RegExp(`^(${patterns.join(\"|\")})`, \"u\");\n}\n\nexport const SI_UNITS_REGEX = buildUnitsRegex();\n\n/**\n * Build regex for currency symbols\n */\nfunction buildCurrencyRegex(): RegExp {\n const symbols = Object.keys(CURRENCY_SYMBOLS).sort((a, b) => b.length - a.length);\n const patterns = symbols.map((s) => s.replace(/[.*+?^${}()|[\\]\\\\]/g, \"\\\\$&\"));\n return new RegExp(`^(${patterns.join(\"|\")})`, \"u\");\n}\n\nexport const CURRENCY_REGEX = buildCurrencyRegex();\n\n/**\n * Combined unit regex (SI + currency)\n */\nfunction buildCombinedUnitRegex(): RegExp {\n const allUnits = [...Object.keys(SI_UNITS), ...Object.keys(CURRENCY_SYMBOLS)].sort(\n (a, b) => b.length - a.length,\n );\n const patterns = allUnits.map((unit) => {\n const escaped = unit.replace(/[.*+?^${}()|[\\]\\\\]/g, \"\\\\$&\");\n return unit[unit.length - 1].match(/[a-zA-Z]/) ? 
`${escaped}(?!\\\\w)` : escaped;\n });\n return new RegExp(`(${patterns.join(\"|\")})$`);\n}\n\nexport const UNIT_REGEX = buildCombinedUnitRegex();\n","/**\n * Regular expression patterns for tokenization\n */\n\n// Time patterns\nexport const TIME_HMS_MS = /^(\\d{1,2}):(\\d{2}):(\\d{2}),(\\d{2})(?!\\d)/;\nexport const TIME_HMS = /^(\\d{1,2}):(\\d{2}):(\\d{2})(?!\\d)/;\nexport const TIME_HM = /^(\\d{1,2}):(\\d{2})(?!\\d)/;\n\n// Date patterns\nexport const DATE_ISO = /^(\\d{4})[-/](\\d{2})[-/](\\d{2})(?!\\d)/;\nexport const DATE_DMY = /^(\\d{1,2})[./-](\\d{1,2})[./-](\\d{2,4})(?!\\d)/;\nexport const DATE_DM = /^(\\d{2})\\.(\\d{2})(?!\\d)/;\nexport const DATE_MY = /^(\\d{2})[.-](\\d{4})(?!\\d)/;\n\n// Number patterns\n// Icelandic style: 1.234,56 (dot as thousands, comma as decimal)\nexport const NUMBER_ICELANDIC = /^[-+]?\\d+(\\.\\d{3})*(,\\d+)?(?!\\d)/;\n// English style: 1,234.56 (comma as thousands, dot as decimal)\nexport const NUMBER_ENGLISH = /^[-+]?\\d+(,\\d{3})*(\\.\\d+)?(?!\\d)/;\n// Simple integer\nexport const NUMBER_INTEGER = /^[-+]?\\d+(?!\\d)/;\n\n// Number followed by letter (e.g., 14b, 33C)\nexport const NUMBER_WITH_LETTER = /^(\\d+)([a-zA-Z])(?!\\w)/u;\n\n// Email pattern\nexport const EMAIL = /^[^@\\s]+@[^@\\s]+(\\.[^@\\s.,/:;\"()%#!?]+)+/;\n\n// URL detection\nexport const URL_PREFIX = /^(https?:\\/\\/|ftp:\\/\\/|file:\\/\\/|mailto:|www\\.)/i;\n\n// Domain pattern (simplified)\nexport const DOMAIN = /^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\\.[a-zA-Z]{2,})+/;\n\n// Hashtag\nexport const HASHTAG = /^#\\w+/u;\n\n// Username (@handle)\nexport const USERNAME = /^@[0-9a-z_]+/i;\n\n// Roman numerals\nexport const ROMAN_NUMERAL = /^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$/;\n\n// Unicode vulgar fractions\nexport const VULGAR_FRACTIONS = /^[\\u00BC-\\u00BE\\u2150-\\u215E]/;\n\n// Ordinal suffixes (Icelandic kludgy ordinals like \"1sti\", \"3ji\")\nexport const KLUDGY_ORDINAL = 
/^(1st[iau]|3j[iau]|4ð[iau]|5t[iau]|2svar|3svar|2ja|3ja|4ra)(?!\\w)/;\n\n/**\n * Map of kludgy ordinals to their correct forms\n */\nexport const ORDINAL_CORRECTIONS: Record<string, string> = {\n \"1sti\": \"fyrsti\",\n \"1sta\": \"fyrsta\",\n \"1stu\": \"fyrstu\",\n \"3ji\": \"þriðji\",\n \"3ju\": \"þriðju\",\n \"4ði\": \"fjórði\",\n \"4ða\": \"fjórða\",\n \"4ðu\": \"fjórðu\",\n \"5ti\": \"fimmti\",\n \"5ta\": \"fimmta\",\n \"5tu\": \"fimmtu\",\n \"2svar\": \"tvisvar\",\n \"3svar\": \"þrisvar\",\n \"2ja\": \"tveggja\",\n \"3ja\": \"þriggja\",\n \"4ra\": \"fjögurra\",\n};\n\n/**\n * Ordinals that can be converted to numbers\n */\nexport const ORDINAL_NUMBERS: Record<string, number> = {\n \"1sti\": 1,\n \"1sta\": 1,\n \"1stu\": 1,\n \"3ji\": 3,\n \"3ja\": 3,\n \"3ju\": 3,\n \"4ði\": 4,\n \"4ða\": 4,\n \"4ðu\": 4,\n \"5ti\": 5,\n \"5ta\": 5,\n \"5tu\": 5,\n};\n\n/**\n * Convert Roman numeral to integer\n */\nexport function romanToInt(s: string): number {\n const values: [number, string][] = [\n [1000, \"M\"],\n [900, \"CM\"],\n [500, \"D\"],\n [400, \"CD\"],\n [100, \"C\"],\n [90, \"XC\"],\n [50, \"L\"],\n [40, \"XL\"],\n [10, \"X\"],\n [9, \"IX\"],\n [5, \"V\"],\n [4, \"IV\"],\n [1, \"I\"],\n ];\n\n let i = 0;\n let result = 0;\n for (const [value, numeral] of values) {\n while (s.substring(i, i + numeral.length) === numeral) {\n result += value;\n i += numeral.length;\n }\n }\n return result;\n}\n\n/**\n * Check if string is a valid Roman numeral\n */\nexport function isRomanNumeral(s: string): boolean {\n return ROMAN_NUMERAL.test(s);\n}\n\n// Telephone number patterns\n// Icelandic: 7 digits, optionally with country code +354\n// Format: +XXX XXXXXXX or +XXXXXXXXXXX (10 digits total for Iceland)\nexport const TELNO_WITH_CC = /^\\+(\\d{1,3})[-\\s]?(\\d{3})[-\\s]?(\\d{4})(?!\\d)/;\n\n// Chemical molecule pattern\n// Matches element symbols (uppercase + optional lowercase) followed by optional digit count\n// Examples: H2O, CO2, NaCl, H2SO4, C6H12O6\n// Must have 
at least 2 element symbols (single element like \"O\" or \"O2\" is not a molecule)\nexport const MOLECULE = /^[A-Z][a-z]?\\d*(?:[A-Z][a-z]?\\d*)+$/;\n\n// Icelandic SSN (kennitala) pattern: DDMMYY-XXXX\n// Last digit is century indicator (9=1900s, 0=2000s)\nexport const SSN = /^(\\d{6})-?(\\d{4})(?!\\d)/;\n\n// Serial number pattern: groups of digits separated by hyphens\n// Examples: 394-8362, 123-456-789\nexport const SERIAL_NUMBER = /^(\\d+)-(\\d+)(?:-\\d+)*(?!\\d)/;\n\n// Timestamp patterns (date + time combined)\n// ISO timestamp: YYYY-MM-DD HH:MM:SS or YYYY-MM-DDTHH:MM:SS\nexport const TIMESTAMP_ISO = /^(\\d{4})[-/](\\d{2})[-/](\\d{2})[T\\s](\\d{1,2}):(\\d{2}):(\\d{2})(?!\\d)/;\nexport const TIMESTAMP_ISO_HM = /^(\\d{4})[-/](\\d{2})[-/](\\d{2})[T\\s](\\d{1,2}):(\\d{2})(?!\\d)/;\n\n// Company/Person/Entity markers (for annotated text with brackets)\n// Format: [company:Name] or [company Name] (without spaces in single-token form)\nexport const COMPANY_MARKER = /^\\[company[:\\s]([^\\]]+)\\]/i;\nexport const PERSON_MARKER = /^\\[person[:\\s]([^\\]]+)\\]/i;\nexport const ENTITY_MARKER = /^\\[entity[:\\s]([^\\]]+)\\]/i;\n\n/**\n * Valid chemical element symbols\n */\nexport const ELEMENT_SYMBOLS = new Set([\n \"H\",\n \"He\",\n \"Li\",\n \"Be\",\n \"B\",\n \"C\",\n \"N\",\n \"O\",\n \"F\",\n \"Ne\",\n \"Na\",\n \"Mg\",\n \"Al\",\n \"Si\",\n \"P\",\n \"S\",\n \"Cl\",\n \"Ar\",\n \"K\",\n \"Ca\",\n \"Sc\",\n \"Ti\",\n \"V\",\n \"Cr\",\n \"Mn\",\n \"Fe\",\n \"Co\",\n \"Ni\",\n \"Cu\",\n \"Zn\",\n \"Ga\",\n \"Ge\",\n \"As\",\n \"Se\",\n \"Br\",\n \"Kr\",\n \"Rb\",\n \"Sr\",\n \"Y\",\n \"Zr\",\n \"Nb\",\n \"Mo\",\n \"Tc\",\n \"Ru\",\n \"Rh\",\n \"Pd\",\n \"Ag\",\n \"Cd\",\n \"In\",\n \"Sn\",\n \"Sb\",\n \"Te\",\n \"I\",\n \"Xe\",\n \"Cs\",\n \"Ba\",\n \"La\",\n \"Ce\",\n \"Pr\",\n \"Nd\",\n \"Pm\",\n \"Sm\",\n \"Eu\",\n \"Gd\",\n \"Tb\",\n \"Dy\",\n \"Ho\",\n \"Er\",\n \"Tm\",\n \"Yb\",\n \"Lu\",\n \"Hf\",\n \"Ta\",\n \"W\",\n \"Re\",\n \"Os\",\n \"Ir\",\n \"Pt\",\n 
\"Au\",\n \"Hg\",\n \"Tl\",\n \"Pb\",\n \"Bi\",\n \"Po\",\n \"At\",\n \"Rn\",\n \"Fr\",\n \"Ra\",\n \"Ac\",\n \"Th\",\n \"Pa\",\n \"U\",\n \"Np\",\n \"Pu\",\n \"Am\",\n \"Cm\",\n \"Bk\",\n \"Cf\",\n \"Es\",\n \"Fm\",\n \"Md\",\n \"No\",\n \"Lr\",\n \"Rf\",\n \"Db\",\n \"Sg\",\n \"Bh\",\n \"Hs\",\n \"Mt\",\n \"Ds\",\n \"Rg\",\n \"Cn\",\n \"Nh\",\n \"Fl\",\n \"Mc\",\n \"Lv\",\n \"Ts\",\n \"Og\",\n]);\n\n/**\n * Validate Icelandic kennitala checksum\n * Returns true if valid, false otherwise\n */\nexport function validateKennitala(digits: string): boolean {\n if (digits.length !== 10) return false;\n\n // All characters must be digits\n if (!/^\\d{10}$/.test(digits)) return false;\n\n // Extract components\n const d1 = parseInt(digits[0], 10);\n const d2 = parseInt(digits[1], 10);\n const d3 = parseInt(digits[2], 10);\n const d4 = parseInt(digits[3], 10);\n const d5 = parseInt(digits[4], 10);\n const d6 = parseInt(digits[5], 10);\n const d7 = parseInt(digits[6], 10);\n const d8 = parseInt(digits[7], 10);\n const checkDigit = parseInt(digits[8], 10);\n const century = parseInt(digits[9], 10);\n\n // Century must be 9 (1900s) or 0 (2000s)\n if (century !== 9 && century !== 0) return false;\n\n // Validate date components (DD MM YY)\n const day = d1 * 10 + d2;\n const month = d3 * 10 + d4;\n\n // Day must be 1-31 (or 41-71 for companies: day + 40)\n const isCompany = day > 40;\n const actualDay = isCompany ? day - 40 : day;\n if (actualDay < 1 || actualDay > 31) return false;\n if (month < 1 || month > 12) return false;\n\n // Calculate checksum: 11 - ((3×d1 + 2×d2 + 7×d3 + 6×d4 + 5×d5 + 4×d6 + 3×d7 + 2×d8) mod 11)\n const sum = 3 * d1 + 2 * d2 + 7 * d3 + 6 * d4 + 5 * d5 + 4 * d6 + 3 * d7 + 2 * d8;\n const remainder = sum % 11;\n const expected = remainder === 0 ? 
0 : 11 - remainder;\n\n // If expected is 10, the kennitala is invalid\n if (expected === 10) return false;\n\n return checkDigit === expected;\n}\n\n/**\n * Check if a string is a valid chemical formula\n * Validates that all symbols are real element symbols\n * Requires at least 2 elements (otherwise it's just an element symbol, not a molecule)\n */\nexport function isValidMolecule(s: string): boolean {\n // Must match the molecule pattern\n if (!MOLECULE.test(s)) return false;\n\n // Extract and validate all element symbols\n const elementRegex = /([A-Z][a-z]?)(\\d*)/g;\n let match;\n let elementCount = 0;\n\n while ((match = elementRegex.exec(s)) !== null) {\n const element = match[1];\n if (!ELEMENT_SYMBOLS.has(element)) {\n return false;\n }\n elementCount++;\n }\n\n // Must have at least 2 element symbols to be a molecule\n return elementCount >= 2;\n}\n","/**\n * Lexer: Split text into initial tokens\n *\n * This is the first stage of the pipeline. It:\n * 1. Splits text on whitespace\n * 2. Extracts punctuation\n * 3. 
Classifies tokens (words, numbers, dates, times, etc.)\n */\n\nimport type { Token, PunctuationType } from \"../types.js\";\nimport {\n PUNCTUATION_SET,\n LEFT_PUNCTUATION_SET,\n RIGHT_PUNCTUATION_SET,\n NONE_PUNCTUATION_SET,\n SINGLE_QUOTES,\n DOUBLE_QUOTES,\n HYPHENS,\n HYPHEN,\n COMPOSITE_HYPHENS,\n DIGITS,\n SIGN_PREFIX,\n DAYS_IN_MONTH,\n UNICODE_REPLACEMENTS,\n OPEN_DOUBLE_QUOTE,\n CLOSE_DOUBLE_QUOTE,\n OPEN_SINGLE_QUOTE,\n CLOSE_SINGLE_QUOTE,\n} from \"../data/constants.js\";\nimport { SI_UNITS, SI_UNITS_SET, SI_UNITS_REGEX, CURRENCY_SYMBOLS } from \"../data/units.js\";\nimport {\n TIME_HMS_MS,\n TIME_HMS,\n TIME_HM,\n DATE_ISO,\n DATE_DMY,\n DATE_DM,\n DATE_MY,\n NUMBER_WITH_LETTER,\n EMAIL,\n URL_PREFIX,\n DOMAIN,\n HASHTAG,\n USERNAME,\n TELNO_WITH_CC,\n SSN,\n SERIAL_NUMBER,\n TIMESTAMP_ISO,\n TIMESTAMP_ISO_HM,\n COMPANY_MARKER,\n PERSON_MARKER,\n ENTITY_MARKER,\n validateKennitala,\n isValidMolecule,\n} from \"../data/patterns.js\";\n\n/**\n * Replace composite Unicode glyphs with single characters\n */\nexport function normalizeUnicode(text: string): string {\n let result = text;\n for (const [from, to] of Object.entries(UNICODE_REPLACEMENTS)) {\n result = result.replaceAll(from, to);\n }\n return result;\n}\n\n/**\n * Check if a date is valid\n */\nfunction isValidDate(y: number, m: number, d: number): boolean {\n if (y < 1776 || y > 2100) return false;\n if (m < 1 || m > 12) return false;\n if (d < 1 || d > DAYS_IN_MONTH[m]) return false;\n // Check Feb 29 in non-leap years\n if (m === 2 && d === 29) {\n const isLeap = (y % 4 === 0 && y % 100 !== 0) || y % 400 === 0;\n if (!isLeap) return false;\n }\n return true;\n}\n\n/**\n * Get punctuation position type\n */\nfunction getPunctType(char: string): PunctuationType {\n if (LEFT_PUNCTUATION_SET.has(char)) return \"left\";\n if (RIGHT_PUNCTUATION_SET.has(char)) return \"right\";\n if (NONE_PUNCTUATION_SET.has(char)) return \"none\";\n return \"center\";\n}\n\n/**\n * Create a punctuation token\n 
*/\nfunction punct(text: string, normalized?: string): Token {\n const norm = normalized ?? text;\n const position = norm.length === 1 ? getPunctType(norm) : \"center\";\n return { kind: \"punctuation\", text, normalized: norm, position };\n}\n\n/**\n * Parse a token starting with digits\n */\nfunction parseDigits(w: string): [Token, number] {\n // Timestamp with full time: YYYY-MM-DD HH:MM:SS or YYYY-MM-DDTHH:MM:SS\n let match = TIMESTAMP_ISO.exec(w);\n if (match) {\n const y = parseInt(match[1], 10);\n const mo = parseInt(match[2], 10);\n const d = parseInt(match[3], 10);\n const h = parseInt(match[4], 10);\n const mi = parseInt(match[5], 10);\n const s = parseInt(match[6], 10);\n if (isValidDate(y, mo, d) && h >= 0 && h < 24 && mi >= 0 && mi < 60 && s >= 0 && s < 60) {\n return [\n {\n kind: \"timestamp\",\n text: match[0],\n year: y,\n month: mo,\n day: d,\n hour: h,\n minute: mi,\n second: s,\n },\n match[0].length,\n ];\n }\n }\n\n // Timestamp with HH:MM only: YYYY-MM-DD HH:MM or YYYY-MM-DDTHH:MM\n match = TIMESTAMP_ISO_HM.exec(w);\n if (match) {\n const y = parseInt(match[1], 10);\n const mo = parseInt(match[2], 10);\n const d = parseInt(match[3], 10);\n const h = parseInt(match[4], 10);\n const mi = parseInt(match[5], 10);\n if (isValidDate(y, mo, d) && h >= 0 && h < 24 && mi >= 0 && mi < 60) {\n return [\n {\n kind: \"timestamp\",\n text: match[0],\n year: y,\n month: mo,\n day: d,\n hour: h,\n minute: mi,\n second: 0,\n },\n match[0].length,\n ];\n }\n }\n\n // Time with milliseconds: H:M:S,MS\n match = TIME_HMS_MS.exec(w);\n if (match) {\n const h = parseInt(match[1], 10);\n const m = parseInt(match[2], 10);\n const s = parseInt(match[3], 10);\n if (h >= 0 && h < 24 && m >= 0 && m < 60 && s >= 0 && s < 60) {\n return [{ kind: \"time\", text: match[0], hour: h, minute: m, second: s }, match[0].length];\n }\n }\n\n // Time H:M:S\n match = TIME_HMS.exec(w);\n if (match) {\n const h = parseInt(match[1], 10);\n const m = parseInt(match[2], 10);\n const s = 
parseInt(match[3], 10);\n if (h >= 0 && h < 24 && m >= 0 && m < 60 && s >= 0 && s < 60) {\n return [{ kind: \"time\", text: match[0], hour: h, minute: m, second: s }, match[0].length];\n }\n }\n\n // Time H:M\n match = TIME_HM.exec(w);\n if (match) {\n const h = parseInt(match[1], 10);\n const m = parseInt(match[2], 10);\n if (h >= 0 && h < 24 && m >= 0 && m < 60) {\n return [{ kind: \"time\", text: match[0], hour: h, minute: m, second: 0 }, match[0].length];\n }\n }\n\n // ISO date: YYYY-MM-DD or YYYY/MM/DD\n match = DATE_ISO.exec(w);\n if (match) {\n const y = parseInt(match[1], 10);\n const m = parseInt(match[2], 10);\n const d = parseInt(match[3], 10);\n if (isValidDate(y, m, d)) {\n return [{ kind: \"date\", text: match[0], year: y, month: m, day: d }, match[0].length];\n }\n }\n\n // Icelandic SSN (kennitala): DDMMYY-XXXX\n match = SSN.exec(w);\n if (match) {\n const digits = match[1] + match[2];\n if (validateKennitala(digits)) {\n return [{ kind: \"ssn\", text: match[0], value: digits }, match[0].length];\n }\n }\n\n // Serial number: XXX-XXXX or similar patterns with hyphens\n // This includes invalid SSN-like patterns (6-4 digits that failed checksum validation)\n match = SERIAL_NUMBER.exec(w);\n if (match) {\n return [{ kind: \"serialnumber\", text: match[0] }, match[0].length];\n }\n\n // Icelandic phone number (7 digits without hyphen): XXXXXXX\n // Only match continuous 7 digits - hyphenated patterns go to serial number\n const telnoMatch = w.match(/^(\\d{7})(?!\\d)/);\n if (telnoMatch) {\n const number = telnoMatch[1];\n return [{ kind: \"telno\", text: telnoMatch[0], cc: \"\", number }, telnoMatch[0].length];\n }\n\n // Date with day, month, year: D.M.Y or D/M/Y or D-M-Y\n match = DATE_DMY.exec(w);\n if (match) {\n let d = parseInt(match[1], 10);\n let m = parseInt(match[2], 10);\n let y = parseInt(match[3], 10);\n // Handle 2-digit years\n if (y <= 99) {\n y += y > 50 ? 
1900 : 2000;\n }\n // Swap if American format (month > 12 but day <= 12)\n if (m > 12 && d <= 12) {\n [d, m] = [m, d];\n }\n if (isValidDate(y, m, d)) {\n return [{ kind: \"date\", text: match[0], year: y, month: m, day: d }, match[0].length];\n }\n }\n\n // Relative date: DD.MM (day and month only)\n match = DATE_DM.exec(w);\n if (match) {\n const d = parseInt(match[1], 10);\n const m = parseInt(match[2], 10);\n if (m >= 1 && m <= 12 && d >= 1 && d <= DAYS_IN_MONTH[m]) {\n return [{ kind: \"daterel\", text: match[0], year: 0, month: m, day: d }, match[0].length];\n }\n }\n\n // Relative date: MM.YYYY or MM-YYYY\n match = DATE_MY.exec(w);\n if (match) {\n const m = parseInt(match[1], 10);\n const y = parseInt(match[2], 10);\n if (y >= 1776 && y <= 2100 && m >= 1 && m <= 12) {\n return [{ kind: \"daterel\", text: match[0], year: y, month: m, day: 0 }, match[0].length];\n }\n }\n\n // Number with trailing letter: 14b, 33C\n match = NUMBER_WITH_LETTER.exec(w);\n if (match) {\n const letter = match[2];\n // Don't match if the letter is an SI unit\n if (!SI_UNITS_SET.has(letter)) {\n const n = parseInt(match[1], 10);\n return [{ kind: \"numwletter\", text: match[0], value: n, letter }, match[0].length];\n }\n }\n\n // Number with unit (Icelandic style: 1.234,56km)\n const icelandicMatch = w.match(/^([-+]?\\d+(?:\\.\\d{3})*(?:,\\d+)?)/);\n if (icelandicMatch) {\n const numPart = icelandicMatch[1];\n const rest = w.slice(numPart.length);\n const unitMatch = SI_UNITS_REGEX.exec(rest);\n if (unitMatch) {\n const unit = unitMatch[1];\n const fullText = numPart + unit;\n const value = parseFloat(numPart.replace(/\\./g, \"\").replace(\",\", \".\"));\n if (unit in CURRENCY_SYMBOLS) {\n const iso = CURRENCY_SYMBOLS[unit];\n return [{ kind: \"amount\", text: fullText, value, currency: iso }, fullText.length];\n }\n const [baseUnit] = SI_UNITS[unit];\n if (unit === \"%\" || unit === \"‰\") {\n return [{ kind: \"percent\", text: fullText, value }, fullText.length];\n }\n return [{ 
kind: \"measurement\", text: fullText, value, unit: baseUnit }, fullText.length];\n }\n }\n\n // Plain number (try Icelandic style first, then English)\n // Icelandic: 1.234,56\n const iceNum = w.match(/^([-+]?\\d+(?:\\.\\d{3})*(?:,\\d+)?)(?!\\d)/);\n if (iceNum && iceNum[1].includes(\",\")) {\n const value = parseFloat(iceNum[1].replace(/\\./g, \"\").replace(\",\", \".\"));\n return [{ kind: \"number\", text: iceNum[1], value }, iceNum[1].length];\n }\n\n // English: 1,234.56\n const engNum = w.match(/^([-+]?\\d+(?:,\\d{3})*(?:\\.\\d+)?)(?!\\d)/);\n if (engNum && (engNum[1].includes(\",\") || engNum[1].includes(\".\"))) {\n const value = parseFloat(engNum[1].replace(/,/g, \"\"));\n return [{ kind: \"number\", text: engNum[1], value }, engNum[1].length];\n }\n\n // Simple integer\n const intMatch = w.match(/^([-+]?\\d+)(?!\\d)/);\n if (intMatch) {\n const value = parseInt(intMatch[1], 10);\n return [{ kind: \"number\", text: intMatch[1], value }, intMatch[1].length];\n }\n\n // Fallback: unknown\n return [{ kind: \"unknown\", text: w[0] }, 1];\n}\n\n/**\n * Parse a single whitespace-separated token\n */\nfunction* parseRawToken(w: string): Generator<Token> {\n // Empty string signals sentence split\n if (!w) {\n yield { kind: \"s_split\", text: null };\n return;\n }\n\n // Pure alphabetic word (most common case)\n if (/^[\\p{L}]+$/u.test(w) || SI_UNITS_SET.has(w)) {\n // Check if it's a chemical molecule (e.g., NaCl, CaCO3)\n if (isValidMolecule(w)) {\n yield { kind: \"molecule\", text: w };\n return;\n }\n yield { kind: \"word\", text: w };\n return;\n }\n\n // Phone number with country code: +XXX XXXXXXX (before signed number handling)\n if (w.startsWith(\"+\") && w.length >= 10) {\n const telMatch = TELNO_WITH_CC.exec(w);\n if (telMatch) {\n const cc = telMatch[1];\n const number = telMatch[2] + telMatch[3];\n yield { kind: \"telno\", text: telMatch[0], cc, number };\n w = w.slice(telMatch[0].length);\n if (!w) return;\n }\n }\n\n // Handle signed numbers at 
start\n if (w.length > 1 && SIGN_PREFIX.has(w[0]) && DIGITS.has(w[1])) {\n const [token, eaten] = parseDigits(w);\n yield token;\n w = w.slice(eaten);\n if (!w) return;\n }\n\n // Handle composite hyphen prefix: -menn in \"þingkonur og -menn\"\n if (w.length > 1 && COMPOSITE_HYPHENS.includes(w[0]) && /\\p{L}/u.test(w[1])) {\n let i = 2;\n while (i < w.length && /\\p{L}/u.test(w[i])) i++;\n const word = w.slice(0, i);\n if (\n word.slice(1).toLowerCase() === word.slice(1) ||\n (i > 2 && word.slice(1).toUpperCase() === word.slice(1))\n ) {\n yield { kind: \"word\", text: word };\n w = w.slice(i);\n }\n }\n\n // Shortcut for quoted single words: \"word\" or 'word'\n if (w.length >= 3) {\n if (DOUBLE_QUOTES.includes(w[0]) && DOUBLE_QUOTES.includes(w[w.length - 1])) {\n const inner = w.slice(1, -1);\n if (/^[\\p{L}]+$/u.test(inner)) {\n yield punct(w[0], OPEN_DOUBLE_QUOTE);\n yield { kind: \"word\", text: inner };\n yield punct(w[w.length - 1], CLOSE_DOUBLE_QUOTE);\n return;\n }\n }\n if (SINGLE_QUOTES.includes(w[0]) && SINGLE_QUOTES.includes(w[w.length - 1])) {\n const inner = w.slice(1, -1);\n if (/^[\\p{L}]+$/u.test(inner)) {\n yield punct(w[0], OPEN_SINGLE_QUOTE);\n yield { kind: \"word\", text: inner };\n yield punct(w[w.length - 1], CLOSE_SINGLE_QUOTE);\n return;\n }\n }\n }\n\n // Leading quote → opening quote\n if (w.length > 1) {\n if (DOUBLE_QUOTES.includes(w[0])) {\n yield punct(w[0], OPEN_DOUBLE_QUOTE);\n w = w.slice(1);\n } else if (SINGLE_QUOTES.includes(w[0])) {\n yield punct(w[0], OPEN_SINGLE_QUOTE);\n w = w.slice(1);\n }\n }\n\n // Process remaining characters\n while (w) {\n // Handle leading punctuation\n while (w && PUNCTUATION_SET.has(w[0])) {\n // Company/Person/Entity markers - check before other punctuation\n if (w.startsWith(\"[\")) {\n const companyMatch = COMPANY_MARKER.exec(w);\n if (companyMatch) {\n yield { kind: \"company\", text: companyMatch[1] };\n w = w.slice(companyMatch[0].length);\n continue;\n }\n const personMatch = 
PERSON_MARKER.exec(w);\n if (personMatch) {\n yield { kind: \"person\", text: personMatch[1] };\n w = w.slice(personMatch[0].length);\n continue;\n }\n const entityMatch = ENTITY_MARKER.exec(w);\n if (entityMatch) {\n yield { kind: \"entity\", text: entityMatch[1] };\n w = w.slice(entityMatch[0].length);\n continue;\n }\n }\n // Ellipsis variations\n if (w.startsWith(\"[...]\")) {\n yield punct(\"[...]\", \"[…]\");\n w = w.slice(5);\n continue;\n }\n if (w.startsWith(\"[…]\")) {\n yield punct(\"[…]\");\n w = w.slice(3);\n continue;\n }\n if (w.startsWith(\"...\")) {\n let dots = \"...\";\n let rest = w.slice(3);\n while (rest.startsWith(\".\")) {\n dots += \".\";\n rest = rest.slice(1);\n }\n yield punct(dots, \"…\");\n w = rest;\n continue;\n }\n if (w.startsWith(\"…\")) {\n yield punct(\"…\");\n w = w.slice(1);\n continue;\n }\n // Double comma → single comma or opening quote\n if (w === \",,\") {\n yield punct(\",,\", \",\");\n w = \"\";\n continue;\n }\n if (w.startsWith(\",,\")) {\n yield punct(\",,\", OPEN_DOUBLE_QUOTE);\n w = w.slice(2);\n continue;\n }\n // Paragraph markers\n if (w === \"[[\" || w === \"]]\") {\n // Skip paragraph markers for now (just punctuation)\n yield punct(w);\n w = \"\";\n continue;\n }\n // Hyphens\n if (HYPHENS.includes(w[0])) {\n yield punct(w[0], HYPHEN);\n w = w.slice(1);\n continue;\n }\n // Closing quotes\n if (DOUBLE_QUOTES.includes(w[0])) {\n yield punct(w[0], CLOSE_DOUBLE_QUOTE);\n w = w.slice(1);\n continue;\n }\n if (SINGLE_QUOTES.includes(w[0])) {\n yield punct(w[0], CLOSE_SINGLE_QUOTE);\n w = w.slice(1);\n continue;\n }\n // Hashtag check\n if (w.startsWith(\"#\") && w.length > 1) {\n const hashMatch = HASHTAG.exec(w);\n if (hashMatch) {\n // Check if it's a number sign: #12\n if (/^#\\d+$/.test(hashMatch[0])) {\n yield {\n kind: \"ordinal\",\n text: hashMatch[0],\n value: parseInt(hashMatch[0].slice(1), 10),\n };\n } else {\n yield { kind: \"hashtag\", text: hashMatch[0] };\n }\n w = w.slice(hashMatch[0].length);\n 
continue;\n }\n }\n // Username check\n if (w.startsWith(\"@\") && w.length > 1) {\n const userMatch = USERNAME.exec(w);\n if (userMatch) {\n yield { kind: \"username\", text: userMatch[0], username: userMatch[0].slice(1) };\n w = w.slice(userMatch[0].length);\n continue;\n }\n }\n // Phone number with country code: +XXX XXXXXXX\n if (w.startsWith(\"+\") && w.length > 1 && DIGITS.has(w[1])) {\n const telMatch = TELNO_WITH_CC.exec(w);\n if (telMatch) {\n const cc = telMatch[1];\n const number = telMatch[2] + telMatch[3];\n yield { kind: \"telno\", text: telMatch[0], cc, number };\n w = w.slice(telMatch[0].length);\n continue;\n }\n }\n // Default: single punctuation character\n yield punct(w[0]);\n w = w.slice(1);\n }\n\n if (!w) break;\n\n // Email check\n if (w.includes(\"@\")) {\n const emailMatch = EMAIL.exec(w);\n if (emailMatch) {\n yield { kind: \"email\", text: emailMatch[0] };\n w = w.slice(emailMatch[0].length);\n continue;\n }\n }\n\n // URL check\n if (URL_PREFIX.test(w)) {\n // Cut trailing punctuation\n let url = w;\n let trailing = \"\";\n while (url && RIGHT_PUNCTUATION_SET.has(url[url.length - 1])) {\n trailing = url[url.length - 1] + trailing;\n url = url.slice(0, -1);\n }\n yield { kind: \"url\", text: url };\n w = trailing;\n continue;\n }\n\n // Domain check\n if (w.length >= 4 && /^[a-zA-Z0-9]/.test(w) && w.includes(\".\")) {\n const domainMatch = DOMAIN.exec(w);\n if (domainMatch) {\n let domain = domainMatch[0];\n let trailing = w.slice(domain.length);\n // Cut trailing punctuation from domain\n while (domain && PUNCTUATION_SET.has(domain[domain.length - 1])) {\n trailing = domain[domain.length - 1] + trailing;\n domain = domain.slice(0, -1);\n }\n if (domain.includes(\".\")) {\n yield { kind: \"domain\", text: domain };\n w = trailing;\n continue;\n }\n }\n }\n\n // Numbers\n if (DIGITS.has(w[0]) || (SIGN_PREFIX.has(w[0]) && w.length > 1 && DIGITS.has(w[1]))) {\n const [token, eaten] = parseDigits(w);\n yield token;\n w = w.slice(eaten);\n\n 
// Check for SI unit immediately following\n if (w) {\n const unitMatch = SI_UNITS_REGEX.exec(w);\n if (unitMatch) {\n yield { kind: \"word\", text: unitMatch[1] };\n w = w.slice(unitMatch[1].length);\n }\n }\n continue;\n }\n\n // Words (alphabetic sequences)\n if (/^\\p{L}/u.test(w)) {\n let i = 1;\n const PUNCT_INSIDE = new Set([\".\", \"'\", \"'\", \"´\", \"'\", HYPHEN, \"\\u2013\"]);\n const PUNCT_ENDING = new Set([\"'\", \"²\", \"³\"]);\n\n while (i < w.length) {\n if (/\\p{L}/u.test(w[i])) {\n i++;\n } else if (DIGITS.has(w[i])) {\n // Could be a molecule like H2O - extend to include digits\n i++;\n } else if (PUNCT_INSIDE.has(w[i]) && i + 1 < w.length && /\\p{L}/u.test(w[i + 1])) {\n i++;\n } else {\n break;\n }\n }\n // Allow ending punctuation\n if (i < w.length && PUNCT_ENDING.has(w[i])) {\n i++;\n }\n const wordCandidate = w.slice(0, i);\n\n // Check if this is a chemical molecule (e.g., H2O, CO2, NaCl)\n if (isValidMolecule(wordCandidate)) {\n yield { kind: \"molecule\", text: wordCandidate };\n w = w.slice(i);\n continue;\n }\n\n yield { kind: \"word\", text: wordCandidate };\n w = w.slice(i);\n continue;\n }\n\n // Unknown character - emit as unknown\n yield { kind: \"unknown\", text: w[0] };\n w = w.slice(1);\n }\n}\n\n/**\n * Split text into rough tokens on whitespace, handling paragraph breaks\n */\nfunction* generateRoughTokens(text: string, replaceCompositeGlyphs: boolean): Generator<string> {\n let normalized = replaceCompositeGlyphs ? 
normalizeUnicode(text) : text;\n\n // Split on double newlines (paragraph breaks)\n const paragraphs = normalized.split(/\\n\\s*\\n/);\n let first = true;\n\n for (const paragraph of paragraphs) {\n if (!first) {\n // Yield empty string to signal sentence split\n yield \"\";\n }\n first = false;\n\n // Split on whitespace\n for (const word of paragraph.split(/\\s+/)) {\n if (word) {\n yield word;\n }\n }\n }\n}\n\n/**\n * Lexer: Convert text to initial token stream\n */\nexport function lex(text: string, replaceCompositeGlyphs = true): Token[] {\n const tokens: Token[] = [];\n\n for (const rawToken of generateRoughTokens(text, replaceCompositeGlyphs)) {\n for (const token of parseRawToken(rawToken)) {\n tokens.push(token);\n }\n }\n\n return tokens;\n}\n","/**\n * Common Icelandic abbreviations\n * Format: abbreviation → expansion\n *\n * This is a curated subset (~100) of common abbreviations.\n * The full Miðeind tokenizer has ~1500.\n */\nexport const ABBREVIATIONS: Record<string, string> = {\n // Titles and honorifics\n hr: \"herra\",\n \"hr.\": \"herra\",\n frú: \"frú\",\n \"frú.\": \"frú\",\n sr: \"séra\",\n \"sr.\": \"séra\",\n dr: \"doktor\",\n \"dr.\": \"doktor\",\n prof: \"prófessor\",\n \"prof.\": \"prófessor\",\n\n // Organizations\n hf: \"hlutafélag\",\n \"hf.\": \"hlutafélag\",\n ehf: \"einkahlutafélag\",\n \"ehf.\": \"einkahlutafélag\",\n ohf: \"opinbert hlutafélag\",\n \"ohf.\": \"opinbert hlutafélag\",\n sf: \"sameignarfélag\",\n \"sf.\": \"sameignarfélag\",\n slf: \"samlagsfélag\",\n \"slf.\": \"samlagsfélag\",\n ses: \"sjálfseignarstofnun\",\n \"ses.\": \"sjálfseignarstofnun\",\n\n // Common abbreviations\n ofl: \"og fleiri\",\n \"o.fl.\": \"og fleiri\",\n osfrv: \"og svo framvegis\",\n \"o.s.frv.\": \"og svo framvegis\",\n oþh: \"og þess háttar\",\n \"o.þ.h.\": \"og þess háttar\",\n þe: \"það er\",\n \"þ.e.\": \"það er\",\n þea: \"það er að segja\",\n \"þ.e.a.s.\": \"það er að segja\",\n sbr: \"samanber\",\n \"sbr.\": \"samanber\",\n skv: 
\"samkvæmt\",\n \"skv.\": \"samkvæmt\",\n mtt: \"með tilliti til\",\n \"m.t.t.\": \"með tilliti til\",\n ath: \"athugasemd\",\n \"ath.\": \"athugasemd\",\n gr: \"grein\",\n \"gr.\": \"grein\",\n mgr: \"málsgrein\",\n \"mgr.\": \"málsgrein\",\n tölul: \"töluliður\",\n \"tölul.\": \"töluliður\",\n nr: \"númer\",\n \"nr.\": \"númer\",\n sl: \"síðastliðinn\",\n \"sl.\": \"síðastliðinn\",\n nk: \"næstkomandi\",\n \"n.k.\": \"næstkomandi\",\n\n // Time-related\n kl: \"klukkan\",\n \"kl.\": \"klukkan\",\n ca: \"circa\",\n \"ca.\": \"circa\",\n\n // Academic/Professional\n bs: \"Bachelor of Science\",\n \"B.S.\": \"Bachelor of Science\",\n ms: \"Master of Science\",\n \"M.S.\": \"Master of Science\",\n ba: \"Bachelor of Arts\",\n \"B.A.\": \"Bachelor of Arts\",\n // Note: \"ma\" also means \"milljarður\" - using M.A. for Master of Arts\n \"M.A.\": \"Master of Arts\",\n phd: \"Doctor of Philosophy\",\n \"Ph.D.\": \"Doctor of Philosophy\",\n mba: \"Master of Business Administration\",\n MBA: \"Master of Business Administration\",\n\n // Places\n Rvk: \"Reykjavík\",\n \"Rvk.\": \"Reykjavík\",\n Akr: \"Akranes\",\n \"Akr.\": \"Akranes\",\n Ak: \"Akureyri\",\n \"Ak.\": \"Akureyri\",\n\n // Directions\n n: \"norður\",\n \"n.\": \"norður\",\n s: \"suður\",\n \"s.\": \"suður\",\n a: \"austur\",\n \"a.\": \"austur\",\n v: \"vestur\",\n \"v.\": \"vestur\",\n na: \"norðaustur\",\n \"n.a.\": \"norðaustur\",\n nv: \"norðvestur\",\n \"n.v.\": \"norðvestur\",\n sa: \"suðaustur\",\n \"s.a.\": \"suðaustur\",\n sv: \"suðvestur\",\n \"s.v.\": \"suðvestur\",\n\n // Measurements (that don't have SI unit meanings)\n þús: \"þúsund\",\n \"þús.\": \"þúsund\",\n millj: \"milljón\",\n \"millj.\": \"milljón\",\n mljó: \"milljón\",\n \"mljó.\": \"milljón\",\n ma: \"milljarður\",\n \"ma.\": \"milljarður\",\n mrð: \"milljarður\",\n \"mrð.\": \"milljarður\",\n};\n\n/**\n * Abbreviations that can end a sentence (followed by period)\n */\nexport const FINISHER_ABBREVIATIONS = new Set([\n \"o.fl\",\n 
\"o.s.frv\",\n \"o.þ.h\",\n \"þ.e\",\n \"þ.e.a.s\",\n \"m.t.t\",\n \"n.k\",\n]);\n\n/**\n * Check if an abbreviation exists\n */\nexport function hasAbbreviation(text: string): boolean {\n return text in ABBREVIATIONS;\n}\n\n/**\n * Get the expansion of an abbreviation\n */\nexport function getAbbreviationMeaning(text: string): string | undefined {\n return ABBREVIATIONS[text];\n}\n","/**\n * Particles: Coalesce abbreviations, currency+number, etc.\n *\n * This stage combines tokens that belong together:\n * - Abbreviation + period\n * - Currency symbol + number → amount\n * - Number + currency code → amount\n */\n\nimport type { Token } from \"../types.js\";\nimport { CURRENCY_SYMBOLS, CURRENCY_ABBREV, AMOUNT_ABBREV } from \"../data/units.js\";\nimport { ABBREVIATIONS } from \"../data/abbreviations.js\";\n\n/**\n * Process particles: combine related tokens\n */\nexport function processParticles(tokens: Token[]): Token[] {\n const result: Token[] = [];\n let i = 0;\n\n while (i < tokens.length) {\n const token = tokens[i];\n const next = tokens[i + 1];\n\n // Word + period → check if it's an abbreviation\n if (token.kind === \"word\" && next?.kind === \"punctuation\" && next.text === \".\") {\n const abbrevWithPeriod = token.text + \".\";\n if (abbrevWithPeriod in ABBREVIATIONS || abbrevWithPeriod in AMOUNT_ABBREV) {\n result.push({ kind: \"word\", text: abbrevWithPeriod });\n i += 2;\n continue;\n }\n }\n\n // Currency symbol + number → amount (e.g., $100)\n if (token.kind === \"punctuation\" && token.text in CURRENCY_SYMBOLS && next?.kind === \"number\") {\n const iso = CURRENCY_SYMBOLS[token.text];\n result.push({\n kind: \"amount\",\n text: token.text + next.text,\n value: next.value,\n currency: iso,\n });\n i += 2;\n continue;\n }\n\n // Number + currency code → amount (e.g., 100 USD, 100 kr.)\n if (token.kind === \"number\" && next?.kind === \"word\") {\n const currencyText = next.text;\n if (CURRENCY_ABBREV.has(currencyText)) {\n result.push({\n kind: 
\"amount\",\n text: token.text + \" \" + next.text,\n value: token.value,\n currency: currencyText,\n });\n i += 2;\n continue;\n }\n // Check for ISK abbreviations (kr., m.kr., etc.)\n if (currencyText in AMOUNT_ABBREV) {\n const multiplier = AMOUNT_ABBREV[currencyText];\n result.push({\n kind: \"amount\",\n text: token.text + \" \" + next.text,\n value: token.value * multiplier,\n currency: \"ISK\",\n });\n i += 2;\n continue;\n }\n }\n\n // Percent word after number\n if (\n token.kind === \"number\" &&\n next?.kind === \"word\" &&\n [\"prósent\", \"prósenta\", \"prósenti\", \"hundraðshluti\"].includes(next.text.toLowerCase())\n ) {\n result.push({\n kind: \"percent\",\n text: token.text + \" \" + next.text,\n value: token.value,\n });\n i += 2;\n continue;\n }\n\n // Date + time → timestamp\n if ((token.kind === \"date\" || token.kind === \"dateabs\") && next?.kind === \"time\") {\n result.push({\n kind: \"timestamp\",\n text: token.text + \" \" + next.text,\n year: token.year,\n month: token.month,\n day: token.day,\n hour: next.hour,\n minute: next.minute,\n second: next.second,\n });\n i += 2;\n continue;\n }\n\n // Default: pass through\n result.push(token);\n i++;\n }\n\n return result;\n}\n","/**\n * Sentences: Add sentence boundary markers\n *\n * This stage detects sentence boundaries and inserts S_BEGIN/S_END markers.\n */\n\nimport type { Token } from \"../types.js\";\nimport {\n END_OF_SENTENCE,\n SENTENCE_FINISHERS,\n PUNCT_COMBINATIONS,\n MONTHS,\n} from \"../data/constants.js\";\nimport { isRomanNumeral } from \"../data/patterns.js\";\nimport { CURRENCY_ABBREV } from \"../data/units.js\";\n\n/**\n * Check if the next token could be ending a sentence or starting a new one\n */\nfunction couldBeEndOfSentence(nextToken: Token): boolean {\n // Sentence markers definitely end/start\n if (nextToken.kind === \"s_end\" || nextToken.kind === \"s_split\") {\n return true;\n }\n\n // Uppercase word (except month names and roman numerals)\n if (nextToken.kind 
=== \"word\" && nextToken.text.length > 0) {\n const firstChar = nextToken.text[0];\n if (firstChar === firstChar.toUpperCase() && firstChar !== firstChar.toLowerCase()) {\n // It's capitalized\n const lower = nextToken.text.toLowerCase();\n // Don't treat month names as sentence starters\n if (lower in MONTHS) return false;\n // Don't treat roman numerals as sentence starters\n if (isRomanNumeral(nextToken.text)) return false;\n // Don't treat currency abbreviations as sentence starters\n if (CURRENCY_ABBREV.has(nextToken.text)) return false;\n return true;\n }\n }\n\n return false;\n}\n\n/**\n * Add sentence boundary markers\n */\nexport function addSentenceMarkers(tokens: Token[]): Token[] {\n if (tokens.length === 0) return [];\n\n const result: Token[] = [];\n let inSentence = false;\n let i = 0;\n\n const beginSentence = (): Token => ({ kind: \"s_begin\", text: null });\n const endSentence = (): Token => ({ kind: \"s_end\", text: null });\n\n while (i < tokens.length) {\n const token = tokens[i];\n const next = tokens[i + 1];\n\n // Handle sentence split marker\n if (token.kind === \"s_split\") {\n if (inSentence) {\n result.push(endSentence());\n inSentence = false;\n }\n // Don't emit the split marker itself\n i++;\n continue;\n }\n\n // Start a new sentence if needed\n if (!inSentence) {\n result.push(beginSentence());\n inSentence = true;\n }\n\n // Check for sentence-ending punctuation\n if (token.kind === \"punctuation\" && END_OF_SENTENCE.has(token.normalized)) {\n // Handle ellipsis mid-sentence (don't end if next token doesn't look like sentence start)\n if (token.normalized === \"…\" && next && !couldBeEndOfSentence(next)) {\n result.push(token);\n i++;\n continue;\n }\n\n // Combine consecutive punctuation (??!, etc.)\n let combinedText = token.text;\n let j = i + 1;\n while (j < tokens.length) {\n const nextTok = tokens[j];\n if (nextTok.kind !== \"punctuation\") break;\n if (!PUNCT_COMBINATIONS.has(nextTok.normalized)) break;\n combinedText += 
nextTok.text;\n j++;\n }\n\n // Emit combined punctuation if any\n if (j > i + 1) {\n result.push({ ...token, text: combinedText });\n i = j;\n } else {\n result.push(token);\n i++;\n }\n\n // Collect any sentence finishers (closing quotes, brackets)\n while (i < tokens.length) {\n const tok = tokens[i];\n if (tok.kind !== \"punctuation\") break;\n if (!SENTENCE_FINISHERS.has(tok.normalized)) break;\n result.push(tok);\n i++;\n }\n\n // End the sentence\n result.push(endSentence());\n inSentence = false;\n continue;\n }\n\n // Regular token\n result.push(token);\n i++;\n }\n\n // Close any open sentence\n if (inSentence) {\n result.push(endSentence());\n }\n\n return result;\n}\n","/**\n * Phrases: Combine date+year, ordinal+month, clock+time, compounds, etc.\n *\n * This stage combines tokens that form higher-level constructs:\n * - \"5. mars\" → date\n * - \"2024\" after date → add year to date\n * - \"kl. 14:30\" → time with prefix\n * - \"1920 f.Kr.\" → year BCE\n * - \"stjórnskipunar- og eftirlitsnefnd\" → compound word\n */\n\nimport type { Token } from \"../types.js\";\nimport {\n MONTHS,\n MONTH_BLACKLIST,\n CE,\n BCE,\n CLOCK_ABBREVS,\n CLOCK_NUMBERS,\n HYPHEN,\n EN_DASH,\n} from \"../data/constants.js\";\nimport { FINISHER_ABBREVIATIONS } from \"../data/abbreviations.js\";\n\n/** Hyphens that can appear in compound words */\nconst COMPOSITE_HYPHENS = new Set([HYPHEN, EN_DASH]);\n\n/**\n * Get month number from token, or null if not a month\n */\nfunction getMonth(token: Token, afterOrdinal = false): number | null {\n if (token.kind !== \"word\") return null;\n // Check blacklist (Ágúst as a name)\n if (!afterOrdinal && MONTH_BLACKLIST.has(token.text)) return null;\n const lower = token.text.toLowerCase();\n return MONTHS[lower] ?? 
null;\n}\n\n/**\n * Check if a token is a composite hyphen (- or –)\n */\nfunction isCompositeHyphen(token: Token): boolean {\n return token.kind === \"punctuation\" && COMPOSITE_HYPHENS.has(token.text);\n}\n\n/**\n * Try to parse a compound word pattern starting at index i.\n * Pattern: (word- [,])+ (og|eða) word\n * Examples:\n * - \"stjórnskipunar- og eftirlitsnefnd\"\n * - \"dómsmála-, viðskipta- og iðnaðarráðherra\"\n *\n * Returns [combined token, new index] or null if no match.\n */\nfunction tryParseCompound(tokens: Token[], startIndex: number): [Token, number] | null {\n const prefixes: Token[] = [];\n let i = startIndex;\n\n // Accumulate prefix patterns: word + hyphen [+ comma]\n while (i < tokens.length) {\n const word = tokens[i];\n const hyphen = tokens[i + 1];\n\n // Must be word followed by composite hyphen\n if (word?.kind !== \"word\" || !hyphen || !isCompositeHyphen(hyphen)) {\n break;\n }\n\n prefixes.push(word);\n prefixes.push(hyphen);\n i += 2;\n\n // Check for optional comma\n const maybeComma = tokens[i];\n if (maybeComma?.kind === \"punctuation\" && maybeComma.text === \",\") {\n prefixes.push(maybeComma);\n i++;\n }\n }\n\n // Must have at least one prefix\n if (prefixes.length === 0) {\n return null;\n }\n\n // Next must be \"og\" or \"eða\"\n const conjunction = tokens[i];\n if (\n !conjunction ||\n conjunction.kind !== \"word\" ||\n (conjunction.text.toLowerCase() !== \"og\" && conjunction.text.toLowerCase() !== \"eða\")\n ) {\n return null;\n }\n\n // After conjunction must be a word (the suffix)\n const suffix = tokens[i + 1];\n if (!suffix || suffix.kind !== \"word\") {\n return null;\n }\n\n // Build the combined text\n // Join all parts: \"stjórnskipunar\", \"-\", \"og\", \"eftirlitsnefnd\"\n // Then normalize spacing: remove space before hyphen/comma\n const parts = [...prefixes, conjunction, suffix];\n let text = parts.map((t) => t.text).join(\" \");\n text = text.replace(/ -/g, \"-\").replace(/ ,/g, \",\");\n\n return [{ kind: 
\"word\", text }, i + 2];\n}\n\n/**\n * Process phrases: combine date/time constructs and compound words\n */\nexport function processPhrases(tokens: Token[]): Token[] {\n const result: Token[] = [];\n let i = 0;\n\n while (i < tokens.length) {\n const token = tokens[i];\n const next = tokens[i + 1];\n\n // Try compound word pattern first\n const compound = tryParseCompound(tokens, i);\n if (compound) {\n result.push(compound[0]);\n i = compound[1];\n continue;\n }\n\n // Word + \".\" → check if it's an abbreviation that ends sentences\n if (token.kind === \"word\" && next?.kind === \"punctuation\" && next.text === \".\") {\n const base = token.text.replace(/\\.$/, \"\");\n if (FINISHER_ABBREVIATIONS.has(base)) {\n // Coalesce abbreviation with period\n result.push({ kind: \"word\", text: token.text + \".\" });\n i += 2;\n continue;\n }\n }\n\n // Year/number + \"e.Kr.\" or \"f.Kr.\" → year with era\n if ((token.kind === \"year\" || token.kind === \"number\") && next?.kind === \"word\") {\n const val = token.kind === \"year\" ? token.value : token.value;\n let newVal: number | null = null;\n if (BCE.has(next.text)) {\n newVal = -val;\n } else if (CE.has(next.text)) {\n newVal = val;\n }\n if (newVal !== null) {\n let text = token.text + \" \" + next.text;\n i += 2;\n // Handle trailing period\n if (tokens[i]?.kind === \"punctuation\" && tokens[i].text === \".\") {\n text += \".\";\n i++;\n }\n result.push({ kind: \"year\", text, value: newVal });\n continue;\n }\n }\n\n // Ordinal/number + month name → date\n if ((token.kind === \"ordinal\" || token.kind === \"number\") && next?.kind === \"word\") {\n const month = getMonth(next, true);\n if (month !== null) {\n const day = token.kind === \"ordinal\" ? 
token.value : token.value;\n result.push({\n kind: \"daterel\",\n text: token.text + \" \" + next.text,\n year: 0,\n month,\n day,\n });\n i += 2;\n continue;\n }\n }\n\n // Date + year → add year to date\n if (\n (token.kind === \"date\" || token.kind === \"daterel\") &&\n token.year === 0 &&\n next?.kind === \"number\"\n ) {\n const year = next.value;\n if (year >= 1776 && year <= 2100) {\n result.push({\n kind: \"dateabs\",\n text: token.text + \" \" + next.text,\n year,\n month: token.month,\n day: token.day,\n });\n i += 2;\n continue;\n }\n }\n\n // Date + year token\n if (\n (token.kind === \"date\" || token.kind === \"daterel\") &&\n token.year === 0 &&\n next?.kind === \"year\"\n ) {\n result.push({\n kind: \"dateabs\",\n text: token.text + \" \" + next.text,\n year: next.value,\n month: token.month,\n day: token.day,\n });\n i += 2;\n continue;\n }\n\n // Clock abbreviation + time → time (keep as-is but combine text)\n if (\n token.kind === \"word\" &&\n CLOCK_ABBREVS.has(token.text.toLowerCase()) &&\n next?.kind === \"time\"\n ) {\n result.push({\n ...next,\n text: token.text + \" \" + next.text,\n });\n i += 2;\n continue;\n }\n\n // Clock abbreviation + spelled-out time (kl. 
tvö → time)\n if (\n token.kind === \"word\" &&\n CLOCK_ABBREVS.has(token.text.toLowerCase()) &&\n next?.kind === \"word\"\n ) {\n const timeValue = CLOCK_NUMBERS[next.text.toLowerCase()];\n if (timeValue) {\n result.push({\n kind: \"time\",\n text: token.text + \" \" + next.text,\n hour: timeValue[0],\n minute: timeValue[1],\n second: timeValue[2],\n });\n i += 2;\n continue;\n }\n }\n\n // Default: pass through\n result.push(token);\n i++;\n }\n\n return result;\n}\n","/**\n * Main tokenize function\n *\n * Chains the pipeline stages to produce a stream of tokens.\n */\n\nimport type { Token, TokenizeOptions } from \"./types.js\";\nimport { lex } from \"./pipeline/lexer.js\";\nimport { processParticles } from \"./pipeline/particles.js\";\nimport { addSentenceMarkers } from \"./pipeline/sentences.js\";\nimport { processPhrases } from \"./pipeline/phrases.js\";\n\n/**\n * Tokenize Icelandic text into an array of tokens.\n *\n * @param text - The text to tokenize\n * @param options - Tokenization options\n * @returns Array of tokens\n *\n * @example\n * ```ts\n * const tokens = tokenize(\"Þetta er próf.\");\n * // → [word(\"Þetta\"), word(\"er\"), word(\"próf\"), punctuation(\".\")]\n *\n * // With sentence markers:\n * const tokens = tokenize(\"Þetta er próf.\", { includeSentenceMarkers: true });\n * // → [s_begin, word(\"Þetta\"), word(\"er\"), word(\"próf\"), punctuation(\".\"), s_end]\n * ```\n */\nexport function tokenize(text: string, options: TokenizeOptions = {}): Token[] {\n const { replaceCompositeGlyphs = true, includeSentenceMarkers = false } = options;\n\n // Pipeline:\n // 1. Lexer: split text into initial tokens\n let tokens = lex(text, replaceCompositeGlyphs);\n\n // 2. Particles: coalesce abbreviations, currency+number\n tokens = processParticles(tokens);\n\n // 3. Phrases: combine date+year, ordinal+month, etc.\n tokens = processPhrases(tokens);\n\n // 4. 
Sentences: add boundary markers if requested\n if (includeSentenceMarkers) {\n tokens = addSentenceMarkers(tokens);\n } else {\n // Filter out internal sentence markers (s_split)\n tokens = tokens.filter((t) => t.kind !== \"s_split\");\n }\n\n return tokens;\n}\n","/**\n * Split text into sentences\n *\n * A higher-level function that returns sentence strings rather than tokens.\n */\n\nimport { tokenize } from \"./tokenize.js\";\nimport type { Token } from \"./types.js\";\n\n/**\n * Split Icelandic text into an array of sentence strings.\n *\n * @param text - The text to split\n * @returns Array of sentence strings\n *\n * @example\n * ```ts\n * const sentences = splitIntoSentences(\"Þetta er fyrsta setning. Þetta er önnur.\");\n * // → [\"Þetta er fyrsta setning.\", \"Þetta er önnur.\"]\n * ```\n */\nexport function splitIntoSentences(text: string): string[] {\n const tokens = tokenize(text, { includeSentenceMarkers: true });\n const sentences: string[] = [];\n let currentSentence: string[] = [];\n\n for (const token of tokens) {\n if (token.kind === \"s_begin\") {\n currentSentence = [];\n } else if (token.kind === \"s_end\") {\n if (currentSentence.length > 0) {\n sentences.push(joinTokens(currentSentence));\n }\n currentSentence = [];\n } else if (token.text !== null) {\n currentSentence.push(getTokenText(token));\n }\n }\n\n // Handle any remaining tokens (shouldn't happen with proper markers)\n if (currentSentence.length > 0) {\n sentences.push(joinTokens(currentSentence));\n }\n\n return sentences;\n}\n\n/**\n * Get display text from a token\n */\nfunction getTokenText(token: Token): string {\n if (token.kind === \"punctuation\") {\n return token.normalized;\n }\n return token.text ?? 
\"\";\n}\n\n/**\n * Join token texts with appropriate spacing\n */\nfunction joinTokens(texts: string[]): string {\n if (texts.length === 0) return \"\";\n\n let result = texts[0];\n\n for (let i = 1; i < texts.length; i++) {\n const prev = texts[i - 1];\n const curr = texts[i];\n\n // Determine if we need a space\n const needsSpace = shouldAddSpace(prev, curr);\n if (needsSpace) {\n result += \" \" + curr;\n } else {\n result += curr;\n }\n }\n\n return result;\n}\n\n/**\n * Determine if a space should be added between two tokens\n */\nfunction shouldAddSpace(prev: string, curr: string): boolean {\n if (!prev || !curr) return false;\n\n const lastChar = prev[prev.length - 1];\n const firstChar = curr[0];\n\n // Opening punctuation: no space after\n // ( [ „ ‚ « <\n const openingPunct = new Set([\"(\", \"[\", \"\\u201E\", \"\\u201A\", \"\\u00AB\", \"<\"]);\n if (openingPunct.has(lastChar)) return false;\n\n // Closing/ending punctuation: no space before\n // . , ; : ! ? ) ] \" ' » > …\n const closingPunct = new Set([\n \".\",\n \",\",\n \";\",\n \":\",\n \"!\",\n \"?\",\n \")\",\n \"]\",\n \"\\u201C\", // \"\n \"\\u2019\", // '\n \"\\u00BB\", // »\n \">\",\n \"\\u2026\", // …\n ]);\n if (closingPunct.has(firstChar)) return false;\n\n // Hyphen handling\n if (lastChar === \"-\" || firstChar === \"-\") return false;\n\n return 
true;\n}\n"],"mappings":"AAGA,MAAa,EAA+C,CAE1D,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IAEX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IACX,GAAW,IAEX,IAAU,GACV,IAAU,GACV,IAAU,GACX,CAQY,EAAU,MAQV,EAAmB,eAMA,GAAuB,EAErD,EAAmB,GAnDrB,MAqDa,EAAkB,IAAI,IAAI,qDAAY,CACtC,EAAuB,IAAI,IAAI,EAAiB,CAChD,EAAwB,IAAI,IAAI,qBAAkB,CAClD,EAAuB,IAAI,IAAI,cAAiB,CAMhD,EAAgB,QAEhB,EAAgB,SAahB,EAAkB,IAAI,IAAI,CAAC,IAAK,IAAK,IAAK,IAAS,CAAC,CACpD,EAAqB,IAAI,IAAI,CACxC,IACA,IACA,IACA,IACA,IACA,IACA,IACA,MACD,CAAC,CAeW,EAAqB,IAAI,IAAI,CAAC,IAAK,IAAK,IAAS,CAAC,CAKlD,EAAS,IAAI,IAAI,aAAa,CAC9B,EAAc,IAAI,IAAI,CAAC,IAAK,IAAI,CAAC,CAKjC,EAAiC,CAC5C,OAAe,EACf,QAAgB,EAChB,QAAgB,EAChB,SAAiB,EACjB,KAAM,EACN,MAAc,EACd,OAAe,EACf,IAAY,EACZ,KAAa,EACb,KAAkB,EAClB,MAAmB,EACnB,KAAkB,EAClB,MAAmB,EACnB,MAAmB,EACnB,OAAoB,EACpB,UAAW,EACX,WAAY,EACZ,QAAgB,GAChB,SAAiB,GACjB,SAAiB,GACjB,UAAkB,GAClB,SAAU,GACV,UAAW,GAEX,OAAQ,EACR,OAAQ,EACR,OAAQ,EACR,OAAQ,EACR,OAAa,EACb,OAAa,EACb,MAAY,EACZ,OAAkB,EAClB,OAAQ,EACR,QAAS,EACT,OAAQ,GACR,OAAa,GACb,OAAQ,GACR,IAAK,EACL,IAAK,EACL,IAAK,EACL,IAAK,EACL,IAAY,EACZ,IAAY,EACZ,GAAW,EACX,IAAiB,EACjB,IAAK,EACL,KAAM,EACN,IAAK,GACL,IAAY,GACZ,IAAK,GACN,CAKY,EAAkB,IAAI,IAAI,CAAC,QAAkB,CAAC,CAK9C,EAAgB,CAAC,EAAG,GAAI,GAAI,GAAI,GAAI,GAAI,GAAI,GAAI,GAAI,GAAI,GAAI,GAAI,GAAG,CAKnE,EAAgB,IAAI,IAAI,CAAC,KAAM,MAAO,UAAU,CAAC,CAKjD,GAA0D,CACrE,KAAM,CAAC,EAAG,EAAG,EAAE,CACf,IAAY,CAAC,EAAG,EAAG,EAAE,CACrB,KAAkB,CAAC,EAAG,EAAG,EAAE,CAC3B,OAAe,CAAC,EAAG,EAAG,EAAE,CACxB,KAAM,CAAC,EAAG,EAAG,EAAE,CACf,IAAK,CAAC,EAAG,EAAG,EAAE,CACd,IAAY,CAAC,EAAG,EAAG,EAAE,CACrB,KAAa,CAAC,EAAG,EAAG,EAAE,CACtB,IAAY,CAAC,EAAG,EAAG,EAAE,CACrB,IAAY,CAAC,GAAI,EAAG,EAAE,CACtB,OAAQ,CAAC,GAAI,EAAG,EAAE,CAClB,KAAa,CAAC,GAAI,EAAG,EAAE,CACvB,SAAiB,CAAC,GAAI,GAAI,EAAE,CAC5B,QAAqB,CAAC,EAAG,GAAI,EAAE,CAC/B,SAA2B,CAAC,EAAG,GAAI,EAAE,CACrC,WAAwB,CAAC,EAAG,GAAI,EAAE,CAClC,SAAiB,CAAC,EAAG,GAAI,EAAE,CAC3B,QAAgB,CAAC,EAAG,GAAI,EAAE,CAC1B,QAAqB,CAAC,EA
AG,GAAI,EAAE,CAC/B,SAAsB,CAAC,EAAG,GAAI,EAAE,CAChC,QAAqB,CAAC,EAAG,GAAI,EAAE,CAC/B,QAAqB,CAAC,EAAG,GAAI,EAAE,CAC/B,WAAmB,CAAC,GAAI,GAAI,EAAE,CAC9B,SAAsB,CAAC,GAAI,GAAI,EAAE,CAClC,CAKY,EAAK,IAAI,IAAI,CAAC,OAAQ,QAAQ,CAAC,CAC/B,EAAM,IAAI,IAAI,CAAC,OAAQ,QAAQ,CAAC,CACvB,IAAI,IAAI,CAAC,GAAG,EAAI,GAAG,EAAI,CAAC,CCxN9C,MAAa,EAA2C,CACtD,EAAG,MACH,IAAK,MACL,IAAK,MACL,IAAK,MACL,IAAK,MACN,CAKY,EAAkB,IAAI,IAAI,CACrC,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACA,MACD,CAAC,CAKW,EAAwC,CACnD,MAAO,EACP,GAAI,EACJ,OAAQ,EACR,QAAS,IACT,OAAQ,IACR,UAAW,IACX,SAAU,IACV,QAAS,IACT,OAAQ,IACR,OAAQ,IACR,IAAK,IACL,YAAa,IACb,WAAY,IACZ,SAAU,IACV,QAAS,IACT,WAAY,IACZ,UAAW,IACZ,CAQY,EAA6C,CAExD,EAAG,CAAC,IAAK,EAAI,CACb,GAAI,CAAC,IAAK,KAAO,CACjB,GAAI,CAAC,IAAK,KAAO,CACjB,GAAI,CAAC,IAAK,IAAO,CACjB,GAAI,CAAC,IAAK,IAAO,CACjB,GAAI,CAAC,IAAK,IAAM,CAChB,GAAI,CAAC,IAAK,MAAO,CACjB,GAAI,CAAC,IAAK,QAAQ,CAElB,KAAM,CAAC,KAAM,EAAI,CACjB,GAAI,CAAC,KAAM,EAAI,CACf,MAAO,CAAC,KAAM,IAAM,CACpB,MAAO,CAAC,KAAM,IAAO,CACrB,GAAI,CAAC,KAAM,IAAM,CAEjB,KAAM,CAAC,KAAM,EAAI,CACjB,MAAO,CAAC,KAAM,KAAO,CACrB,MAAO,CAAC,KAAM,IAAM,CACpB,EAAG,CAAC,KAAM,KAAO,CACjB,IAAK,CAAC,KAAM,KAAO,CACnB,GAAI,CAAC,KAAM,KAAO,CAClB,GAAI,CAAC,KAAM,KAAO,CAClB,GAAI,CAAC,KAAM,KAAO,CAClB,IAAK,CAAC,KAAM,UAAW,CACvB,IAAK,CAAC,KAAM,cAAiB,CAE7B,EAAG,CAAC,IAAK,EAAI,CACb,KAAM,CAAC,IAAK,EAAI,CAEhB,EAAG,CAAC,KAAM,KAAO,CACjB,GAAI,CAAC,KAAM,KAAO,CAClB,GAAI,CAAC,KAAM,EAAI,CACf,EAAG,CAAC,KAAM,IAAM,CAChB,GAAI,CAAC,KAAM,KAAO,CAClB,GAAI,CAAC,KAAM,KAAO,CAClB,GAAI,CAAC,KAAM,IAAM,CACjB,GAAI,CAAC,KAAM,QAAS,CAEpB,EAAG,CAAC,IAAK,EAAI,CACb,GAAI,CAAC,IAAK,KAAO,CACjB,GAAI,CAAC,IAAK,KAAO,CACjB,KAAM,CAAC,IAAK,KAAO,CACnB,IAAK,CAAC,IAAK,GAAK,CAEhB,EAAG,CAAC,IAAK,EAAI,CACb,GAAI,CAAC,IAAK,IAAM,CAEhB,GAAI,CAAC,IAAK,EAAI,CACd,EAAG,CAAC,IAAK,EAAI,CACb,GAAI,CAAC,IAAK,IAAM,CAChB,GAAI,CAAC,IAAK,IAAM,CAChB,GAAI,CAAC,IAAK,IAAM,CAChB,GAAI,CAAC,IAAK,aAAO,CACjB,IAAK,CAAC,IAAK,KAAM,CACjB,IAAK,CAAC,IAAK,KAAM,CACjB,KAAM,CAAC,IAAK,KAAM,CAClB,KAAM,CA
AC,IAAK,KAAM,CAClB,KAAM,CAAC,IAAK,KAAK,CACjB,IAAK,CAAC,IAAK,MAAM,CAEjB,EAAG,CAAC,IAAK,EAAI,CACb,GAAI,CAAC,IAAK,KAAO,CACjB,GAAI,CAAC,IAAK,IAAM,CAChB,GAAI,CAAC,IAAK,IAAM,CAChB,GAAI,CAAC,IAAK,IAAM,CAChB,GAAI,CAAC,IAAK,aAAO,CAEjB,EAAG,CAAC,IAAK,EAAI,CACb,GAAI,CAAC,IAAK,KAAO,CACjB,GAAI,CAAC,IAAK,IAAM,CAEhB,EAAG,CAAC,IAAK,EAAI,CACb,GAAI,CAAC,IAAK,KAAO,CAEjB,GAAI,CAAC,KAAM,EAAI,CACf,IAAK,CAAC,KAAM,IAAM,CAClB,IAAK,CAAC,KAAM,IAAM,CAClB,IAAK,CAAC,KAAM,IAAM,CAElB,GAAI,CAAC,KAAM,EAAI,CACf,IAAK,CAAC,KAAM,IAAM,CAElB,IAAK,CAAC,IAAK,EAAI,CAEf,IAAK,CAAC,IAAK,EAAI,CACf,IAAK,CAAC,IAAK,GAAI,CAChB,CAEY,EAAe,IAAI,IAAI,OAAO,KAAK,EAAS,CAAC,CAK1D,SAAS,GAA0B,CAEjC,IAAM,EADQ,OAAO,KAAK,EAAS,CAAC,MAAM,EAAG,IAAM,EAAE,OAAS,EAAE,OAAO,CAChD,IAAK,GAAS,CACnC,IAAM,EAAU,EAAK,QAAQ,sBAAuB,OAAO,CAE3D,OAAO,EAAK,EAAK,OAAS,GAAG,MAAM,WAAW,CAAG,GAAG,EAAQ,SAAW,GACvE,CACF,OAAW,OAAO,KAAK,EAAS,KAAK,IAAI,CAAC,GAAI,IAAI,CAGpD,MAAa,EAAiB,GAAiB,CAK/C,SAAS,IAA6B,CAEpC,IAAM,EADU,OAAO,KAAK,EAAiB,CAAC,MAAM,EAAG,IAAM,EAAE,OAAS,EAAE,OAAO,CACxD,IAAK,GAAM,EAAE,QAAQ,sBAAuB,OAAO,CAAC,CAC7E,OAAW,OAAO,KAAK,EAAS,KAAK,IAAI,CAAC,GAAI,IAAI,CAGtB,IAAoB,CAKlD,SAAS,IAAiC,CAIxC,IAAM,EAHW,CAAC,GAAG,OAAO,KAAK,EAAS,CAAE,GAAG,OAAO,KAAK,EAAiB,CAAC,CAAC,MAC3E,EAAG,IAAM,EAAE,OAAS,EAAE,OACxB,CACyB,IAAK,GAAS,CACtC,IAAM,EAAU,EAAK,QAAQ,sBAAuB,OAAO,CAC3D,OAAO,EAAK,EAAK,OAAS,GAAG,MAAM,WAAW,CAAG,GAAG,EAAQ,SAAW,GACvE,CACF,OAAW,OAAO,IAAI,EAAS,KAAK,IAAI,CAAC,IAAI,CAGrB,IAAwB,CCnMlD,MAAa,GAAc,2CACd,GAAW,mCACX,GAAU,2BAGV,GAAW,uCACX,GAAW,+CACX,GAAU,0BACV,EAAU,4BAWV,EAAqB,0BAGrB,EAAQ,2CAGR,EAAa,mDAGb,EAAS,gEAGT,EAAU,SAGV,EAAW,gBAGX,EAAgB,2DAkF7B,SAAgB,EAAe,EAAoB,CACjD,OAAO,EAAc,KAAK,EAAE,CAM9B,MAAa,EAAgB,+CAMhB,EAAW,sCAIX,EAAM,0BAIN,EAAgB,8BAIhB,EAAgB,qEAChB,EAAmB,6DAInB,EAAiB,6BACjB,EAAgB,4BAChB,EAAgB,4BAKhB,EAAkB,IAAI,IAAI,gWAuHtC,CAAC,CAMF,SAAgB,GAAkB,EAAyB,CAIzD,GAHI,EAAO,SAAW,IAGlB,CAAC,WAAW,KAAK,EAAO,CAAE,MAAO,GAGrC,IAAM,EAAK,SAAS,EAAO,GAAI,GAAG,CAC5B,EAAK,SAAS,EAAO,GAAI,GAAG,CAC5B,EAAK,SAAS,EAAO,GAAI,GAAG,CAC5B,EAAK,SAAS,EAAO,GAAI,GAAG,CAC5B,EAAK,SAAS
,EAAO,GAAI,GAAG,CAC5B,EAAK,SAAS,EAAO,GAAI,GAAG,CAC5B,EAAK,SAAS,EAAO,GAAI,GAAG,CAC5B,EAAK,SAAS,EAAO,GAAI,GAAG,CAC5B,EAAa,SAAS,EAAO,GAAI,GAAG,CACpC,EAAU,SAAS,EAAO,GAAI,GAAG,CAGvC,GAAI,IAAY,GAAK,IAAY,EAAG,MAAO,GAG3C,IAAM,EAAM,EAAK,GAAK,EAChB,EAAQ,EAAK,GAAK,EAIlB,EADY,EAAM,GACM,EAAM,GAAK,EAEzC,GADI,EAAY,GAAK,EAAY,IAC7B,EAAQ,GAAK,EAAQ,GAAI,MAAO,GAIpC,IAAM,GADM,EAAI,EAAK,EAAI,EAAK,EAAI,EAAK,EAAI,EAAK,EAAI,EAAK,EAAI,EAAK,EAAI,EAAK,EAAI,GACvD,GAClB,EAAW,IAAc,EAAI,EAAI,GAAK,EAK5C,OAFI,IAAa,GAAW,GAErB,IAAe,EAQxB,SAAgB,EAAgB,EAAoB,CAElD,GAAI,CAAC,EAAS,KAAK,EAAE,CAAE,MAAO,GAG9B,IAAM,EAAe,sBACjB,EACA,EAAe,EAEnB,MAAQ,EAAQ,EAAa,KAAK,EAAE,IAAM,MAAM,CAC9C,IAAM,EAAU,EAAM,GACtB,GAAI,CAAC,EAAgB,IAAI,EAAQ,CAC/B,MAAO,GAET,IAIF,OAAO,GAAgB,ECpSzB,SAAgB,EAAiB,EAAsB,CACrD,IAAI,EAAS,EACb,IAAK,GAAM,CAAC,EAAM,KAAO,OAAO,QAAQ,EAAqB,CAC3D,EAAS,EAAO,WAAW,EAAM,EAAG,CAEtC,OAAO,EAMT,SAAS,EAAY,EAAW,EAAW,EAAoB,CAS7D,MAJA,EAJI,EAAI,MAAQ,EAAI,MAChB,EAAI,GAAK,EAAI,IACb,EAAI,GAAK,EAAI,EAAc,IAE3B,IAAM,GAAK,IAAM,IAEf,EADY,EAAI,GAAM,GAAK,EAAI,KAAQ,GAAM,EAAI,KAAQ,IASjE,SAAS,GAAa,EAA+B,CAInD,OAHI,EAAqB,IAAI,EAAK,CAAS,OACvC,EAAsB,IAAI,EAAK,CAAS,QACxC,EAAqB,IAAI,EAAK,CAAS,OACpC,SAMT,SAAS,EAAM,EAAc,EAA4B,CACvD,IAAM,EAAO,GAAc,EAE3B,MAAO,CAAE,KAAM,cAAe,OAAM,WAAY,EAAM,SADrC,EAAK,SAAW,EAAI,GAAa,EAAK,CAAG,SACM,CAMlE,SAAS,EAAY,EAA4B,CAE/C,IAAI,EAAQ,EAAc,KAAK,EAAE,CACjC,GAAI,EAAO,CACT,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAK,SAAS,EAAM,GAAI,GAAG,CAC3B,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAK,SAAS,EAAM,GAAI,GAAG,CAC3B,EAAI,SAAS,EAAM,GAAI,GAAG,CAChC,GAAI,EAAY,EAAG,EAAI,EAAE,EAAI,GAAK,GAAK,EAAI,IAAM,GAAM,GAAK,EAAK,IAAM,GAAK,GAAK,EAAI,GACnF,MAAO,CACL,CACE,KAAM,YACN,KAAM,EAAM,GACZ,KAAM,EACN,MAAO,EACP,IAAK,EACL,KAAM,EACN,OAAQ,EACR,OAAQ,EACT,CACD,EAAM,GAAG,OACV,CAML,GADA,EAAQ,EAAiB,KAAK,EAAE,CAC5B,EAAO,CACT,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAK,SAAS,EAAM,GAAI,GAAG,CAC3B,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAK,SAAS,EAAM,GAAI,GAAG,CACjC,GAAI,EAAY,EAAG,EAAI,EAAE,EAAI,GAAK,GAAK,
EAAI,IAAM,GAAM,GAAK,EAAK,GAC/D,MAAO,CACL,CACE,KAAM,YACN,KAAM,EAAM,GACZ,KAAM,EACN,MAAO,EACP,IAAK,EACL,KAAM,EACN,OAAQ,EACR,OAAQ,EACT,CACD,EAAM,GAAG,OACV,CAML,GADA,EAAQ,GAAY,KAAK,EAAE,CACvB,EAAO,CACT,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAChC,GAAI,GAAK,GAAK,EAAI,IAAM,GAAK,GAAK,EAAI,IAAM,GAAK,GAAK,EAAI,GACxD,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAM,EAAM,GAAI,KAAM,EAAG,OAAQ,EAAG,OAAQ,EAAG,CAAE,EAAM,GAAG,OAAO,CAM7F,GADA,EAAQ,GAAS,KAAK,EAAE,CACpB,EAAO,CACT,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAChC,GAAI,GAAK,GAAK,EAAI,IAAM,GAAK,GAAK,EAAI,IAAM,GAAK,GAAK,EAAI,GACxD,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAM,EAAM,GAAI,KAAM,EAAG,OAAQ,EAAG,OAAQ,EAAG,CAAE,EAAM,GAAG,OAAO,CAM7F,GADA,EAAQ,GAAQ,KAAK,EAAE,CACnB,EAAO,CACT,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAChC,GAAI,GAAK,GAAK,EAAI,IAAM,GAAK,GAAK,EAAI,GACpC,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAM,EAAM,GAAI,KAAM,EAAG,OAAQ,EAAG,OAAQ,EAAG,CAAE,EAAM,GAAG,OAAO,CAM7F,GADA,EAAQ,GAAS,KAAK,EAAE,CACpB,EAAO,CACT,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAChC,GAAI,EAAY,EAAG,EAAG,EAAE,CACtB,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAM,EAAM,GAAI,KAAM,EAAG,MAAO,EAAG,IAAK,EAAG,CAAE,EAAM,GAAG,OAAO,CAMzF,GADA,EAAQ,EAAI,KAAK,EAAE,CACf,EAAO,CACT,IAAM,EAAS,EAAM,GAAK,EAAM,GAChC,GAAI,GAAkB,EAAO,CAC3B,MAAO,CAAC,CAAE,KAAM,MAAO,KAAM,EAAM,GAAI,MAAO,EAAQ,CAAE,EAAM,GAAG,OAAO,CAO5E,GADA,EAAQ,EAAc,KAAK,EAAE,CACzB,EACF,MAAO,CAAC,CAAE,KAAM,eAAgB,KAAM,EAAM,GAAI,CAAE,EAAM,GAAG,OAAO,CAKpE,IAAM,EAAa,EAAE,MAAM,iBAAiB,CAC5C,GAAI,EAAY,CACd,IAAM,EAAS,EAAW,GAC1B,MAAO,CAAC,CAAE,KAAM,QAAS,KAAM,EAAW,GAAI,GAAI,GAAI,SAAQ,CAAE,EAAW,GAAG,OAAO,CAKvF,GADA,EAAQ,GAAS,KAAK,EAAE,CACpB,EAAO,CACT,IAAI,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAS9B,GAPI,GAAK,KACP,GAAK,EAAI,GAAK,KAAO,KAGnB,EAAI,IAAM,GAAK,KACjB,CAAC,EAAG,GAAK,CAAC,EAAG,EAAE,EAEb,EAAY,EAAG,EAAG,EAAE,CACtB,MAAO,CAAC,CAAE,KAAM,OAAQ,KAAM,EAAM,GAAI,KA
AM,EAAG,MAAO,EAAG,IAAK,EAAG,CAAE,EAAM,GAAG,OAAO,CAMzF,GADA,EAAQ,GAAQ,KAAK,EAAE,CACnB,EAAO,CACT,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAChC,GAAI,GAAK,GAAK,GAAK,IAAM,GAAK,GAAK,GAAK,EAAc,GACpD,MAAO,CAAC,CAAE,KAAM,UAAW,KAAM,EAAM,GAAI,KAAM,EAAG,MAAO,EAAG,IAAK,EAAG,CAAE,EAAM,GAAG,OAAO,CAM5F,GADA,EAAQ,EAAQ,KAAK,EAAE,CACnB,EAAO,CACT,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAC1B,EAAI,SAAS,EAAM,GAAI,GAAG,CAChC,GAAI,GAAK,MAAQ,GAAK,MAAQ,GAAK,GAAK,GAAK,GAC3C,MAAO,CAAC,CAAE,KAAM,UAAW,KAAM,EAAM,GAAI,KAAM,EAAG,MAAO,EAAG,IAAK,EAAG,CAAE,EAAM,GAAG,OAAO,CAM5F,GADA,EAAQ,EAAmB,KAAK,EAAE,CAC9B,EAAO,CACT,IAAM,EAAS,EAAM,GAErB,GAAI,CAAC,EAAa,IAAI,EAAO,CAAE,CAC7B,IAAM,EAAI,SAAS,EAAM,GAAI,GAAG,CAChC,MAAO,CAAC,CAAE,KAAM,aAAc,KAAM,EAAM,GAAI,MAAO,EAAG,SAAQ,CAAE,EAAM,GAAG,OAAO,EAKtF,IAAM,EAAiB,EAAE,MAAM,mCAAmC,CAClE,GAAI,EAAgB,CAClB,IAAM,EAAU,EAAe,GACzB,EAAO,EAAE,MAAM,EAAQ,OAAO,CAC9B,EAAY,EAAe,KAAK,EAAK,CAC3C,GAAI,EAAW,CACb,IAAM,EAAO,EAAU,GACjB,EAAW,EAAU,EACrB,EAAQ,WAAW,EAAQ,QAAQ,MAAO,GAAG,CAAC,QAAQ,IAAK,IAAI,CAAC,CACtE,GAAI,KAAQ,EAEV,MAAO,CAAC,CAAE,KAAM,SAAU,KAAM,EAAU,QAAO,SADrC,EAAiB,GACmC,CAAE,EAAS,OAAO,CAEpF,GAAM,CAAC,GAAY,EAAS,GAI5B,OAHI,IAAS,KAAO,IAAS,IACpB,CAAC,CAAE,KAAM,UAAW,KAAM,EAAU,QAAO,CAAE,EAAS,OAAO,CAE/D,CAAC,CAAE,KAAM,cAAe,KAAM,EAAU,QAAO,KAAM,EAAU,CAAE,EAAS,OAAO,EAM5F,IAAM,EAAS,EAAE,MAAM,yCAAyC,CAChE,GAAI,GAAU,EAAO,GAAG,SAAS,IAAI,CAAE,CACrC,IAAM,EAAQ,WAAW,EAAO,GAAG,QAAQ,MAAO,GAAG,CAAC,QAAQ,IAAK,IAAI,CAAC,CACxE,MAAO,CAAC,CAAE,KAAM,SAAU,KAAM,EAAO,GAAI,QAAO,CAAE,EAAO,GAAG,OAAO,CAIvE,IAAM,EAAS,EAAE,MAAM,yCAAyC,CAChE,GAAI,IAAW,EAAO,GAAG,SAAS,IAAI,EAAI,EAAO,GAAG,SAAS,IAAI,EAAG,CAClE,IAAM,EAAQ,WAAW,EAAO,GAAG,QAAQ,KAAM,GAAG,CAAC,CACrD,MAAO,CAAC,CAAE,KAAM,SAAU,KAAM,EAAO,GAAI,QAAO,CAAE,EAAO,GAAG,OAAO,CAIvE,IAAM,EAAW,EAAE,MAAM,oBAAoB,CAC7C,GAAI,EAAU,CACZ,IAAM,EAAQ,SAAS,EAAS,GAAI,GAAG,CACvC,MAAO,CAAC,CAAE,KAAM,SAAU,KAAM,EAAS,GAAI,QAAO,CAAE,EAAS,GAAG,OAAO,CAI3E,MAAO,CAAC,CAAE,KAAM,UAAW,KAAM,EAAE,GAAI,CAAE,EAAE,CAM7C,SAAU,GAAc,EAA6B,CAEnD,GAAI,CAAC,EAAG,CACN,KAAM,CAAE,KAAM,UAAW,KAAM,
KAAM,CACrC,OAIF,GAAI,cAAc,KAAK,EAAE,EAAI,EAAa,IAAI,EAAE,CAAE,CAEhD,GAAI,EAAgB,EAAE,CAAE,CACtB,KAAM,CAAE,KAAM,WAAY,KAAM,EAAG,CACnC,OAEF,KAAM,CAAE,KAAM,OAAQ,KAAM,EAAG,CAC/B,OAIF,GAAI,EAAE,WAAW,IAAI,EAAI,EAAE,QAAU,GAAI,CACvC,IAAM,EAAW,EAAc,KAAK,EAAE,CACtC,GAAI,EAAU,CACZ,IAAM,EAAK,EAAS,GACd,EAAS,EAAS,GAAK,EAAS,GAGtC,GAFA,KAAM,CAAE,KAAM,QAAS,KAAM,EAAS,GAAI,KAAI,SAAQ,CACtD,EAAI,EAAE,MAAM,EAAS,GAAG,OAAO,CAC3B,CAAC,EAAG,QAKZ,GAAI,EAAE,OAAS,GAAK,EAAY,IAAI,EAAE,GAAG,EAAI,EAAO,IAAI,EAAE,GAAG,CAAE,CAC7D,GAAM,CAAC,EAAO,GAAS,EAAY,EAAE,CAGrC,GAFA,MAAM,EACN,EAAI,EAAE,MAAM,EAAM,CACd,CAAC,EAAG,OAIV,GAAI,EAAE,OAAS,GAAKC,KAAkB,SAAS,EAAE,GAAG,EAAI,SAAS,KAAK,EAAE,GAAG,CAAE,CAC3E,IAAI,EAAI,EACR,KAAO,EAAI,EAAE,QAAU,SAAS,KAAK,EAAE,GAAG,EAAE,IAC5C,IAAM,EAAO,EAAE,MAAM,EAAG,EAAE,EAExB,EAAK,MAAM,EAAE,CAAC,aAAa,GAAK,EAAK,MAAM,EAAE,EAC5C,EAAI,GAAK,EAAK,MAAM,EAAE,CAAC,aAAa,GAAK,EAAK,MAAM,EAAE,IAEvD,KAAM,CAAE,KAAM,OAAQ,KAAM,EAAM,CAClC,EAAI,EAAE,MAAM,EAAE,EAKlB,GAAI,EAAE,QAAU,EAAG,CACjB,GAAI,EAAc,SAAS,EAAE,GAAG,EAAI,EAAc,SAAS,EAAE,EAAE,OAAS,GAAG,CAAE,CAC3E,IAAM,EAAQ,EAAE,MAAM,EAAG,GAAG,CAC5B,GAAI,cAAc,KAAK,EAAM,CAAE,CAC7B,MAAM,EAAM,EAAE,GAAI,IAAkB,CACpC,KAAM,CAAE,KAAM,OAAQ,KAAM,EAAO,CACnC,MAAM,EAAM,EAAE,EAAE,OAAS,GAAI,IAAmB,CAChD,QAGJ,GAAI,EAAc,SAAS,EAAE,GAAG,EAAI,EAAc,SAAS,EAAE,EAAE,OAAS,GAAG,CAAE,CAC3E,IAAM,EAAQ,EAAE,MAAM,EAAG,GAAG,CAC5B,GAAI,cAAc,KAAK,EAAM,CAAE,CAC7B,MAAM,EAAM,EAAE,GAAI,IAAkB,CACpC,KAAM,CAAE,KAAM,OAAQ,KAAM,EAAO,CACnC,MAAM,EAAM,EAAE,EAAE,OAAS,GAAI,IAAmB,CAChD,SAiBN,IAXI,EAAE,OAAS,IACT,EAAc,SAAS,EAAE,GAAG,EAC9B,MAAM,EAAM,EAAE,GAAI,IAAkB,CACpC,EAAI,EAAE,MAAM,EAAE,EACL,EAAc,SAAS,EAAE,GAAG,GACrC,MAAM,EAAM,EAAE,GAAI,IAAkB,CACpC,EAAI,EAAE,MAAM,EAAE,GAKX,GAAG,CAER,KAAO,GAAK,EAAgB,IAAI,EAAE,GAAG,EAAE,CAErC,GAAI,EAAE,WAAW,IAAI,CAAE,CACrB,IAAM,EAAe,EAAe,KAAK,EAAE,CAC3C,GAAI,EAAc,CAChB,KAAM,CAAE,KAAM,UAAW,KAAM,EAAa,GAAI,CAChD,EAAI,EAAE,MAAM,EAAa,GAAG,OAAO,CACnC,SAEF,IAAM,EAAc,EAAc,KAAK,EAAE,CACzC,GAAI,EAAa,CACf,KAAM,CAAE,KAAM,SAAU,KAAM,EAAY,GAAI,CAC9C,EAAI,EAAE,MAAM,EAAY,GAAG,OAAO,CAClC,SAEF
,IAAM,EAAc,EAAc,KAAK,EAAE,CACzC,GAAI,EAAa,CACf,KAAM,CAAE,KAAM,SAAU,KAAM,EAAY,GAAI,CAC9C,EAAI,EAAE,MAAM,EAAY,GAAG,OAAO,CAClC,UAIJ,GAAI,EAAE,WAAW,QAAQ,CAAE,CACzB,MAAM,EAAM,QAAS,MAAM,CAC3B,EAAI,EAAE,MAAM,EAAE,CACd,SAEF,GAAI,EAAE,WAAW,MAAM,CAAE,CACvB,MAAM,EAAM,MAAM,CAClB,EAAI,EAAE,MAAM,EAAE,CACd,SAEF,GAAI,EAAE,WAAW,MAAM,CAAE,CACvB,IAAI,EAAO,MACP,EAAO,EAAE,MAAM,EAAE,CACrB,KAAO,EAAK,WAAW,IAAI,EACzB,GAAQ,IACR,EAAO,EAAK,MAAM,EAAE,CAEtB,MAAM,EAAM,EAAM,IAAI,CACtB,EAAI,EACJ,SAEF,GAAI,EAAE,WAAW,IAAI,CAAE,CACrB,MAAM,EAAM,IAAI,CAChB,EAAI,EAAE,MAAM,EAAE,CACd,SAGF,GAAI,IAAM,KAAM,CACd,MAAM,EAAM,KAAM,IAAI,CACtB,EAAI,GACJ,SAEF,GAAI,EAAE,WAAW,KAAK,CAAE,CACtB,MAAM,EAAM,KAAM,IAAkB,CACpC,EAAI,EAAE,MAAM,EAAE,CACd,SAGF,GAAI,IAAM,MAAQ,IAAM,KAAM,CAE5B,MAAM,EAAM,EAAE,CACd,EAAI,GACJ,SAGF,GAAI,EAAQ,SAAS,EAAE,GAAG,CAAE,CAC1B,MAAM,EAAM,EAAE,GAAI,IAAO,CACzB,EAAI,EAAE,MAAM,EAAE,CACd,SAGF,GAAI,EAAc,SAAS,EAAE,GAAG,CAAE,CAChC,MAAM,EAAM,EAAE,GAAI,IAAmB,CACrC,EAAI,EAAE,MAAM,EAAE,CACd,SAEF,GAAI,EAAc,SAAS,EAAE,GAAG,CAAE,CAChC,MAAM,EAAM,EAAE,GAAI,IAAmB,CACrC,EAAI,EAAE,MAAM,EAAE,CACd,SAGF,GAAI,EAAE,WAAW,IAAI,EAAI,EAAE,OAAS,EAAG,CACrC,IAAM,EAAY,EAAQ,KAAK,EAAE,CACjC,GAAI,EAAW,CAET,SAAS,KAAK,EAAU,GAAG,CAC7B,KAAM,CACJ,KAAM,UACN,KAAM,EAAU,GAChB,MAAO,SAAS,EAAU,GAAG,MAAM,EAAE,CAAE,GAAG,CAC3C,CAED,KAAM,CAAE,KAAM,UAAW,KAAM,EAAU,GAAI,CAE/C,EAAI,EAAE,MAAM,EAAU,GAAG,OAAO,CAChC,UAIJ,GAAI,EAAE,WAAW,IAAI,EAAI,EAAE,OAAS,EAAG,CACrC,IAAM,EAAY,EAAS,KAAK,EAAE,CAClC,GAAI,EAAW,CACb,KAAM,CAAE,KAAM,WAAY,KAAM,EAAU,GAAI,SAAU,EAAU,GAAG,MAAM,EAAE,CAAE,CAC/E,EAAI,EAAE,MAAM,EAAU,GAAG,OAAO,CAChC,UAIJ,GAAI,EAAE,WAAW,IAAI,EAAI,EAAE,OAAS,GAAK,EAAO,IAAI,EAAE,GAAG,CAAE,CACzD,IAAM,EAAW,EAAc,KAAK,EAAE,CACtC,GAAI,EAAU,CACZ,IAAM,EAAK,EAAS,GACd,EAAS,EAAS,GAAK,EAAS,GACtC,KAAM,CAAE,KAAM,QAAS,KAAM,EAAS,GAAI,KAAI,SAAQ,CACtD,EAAI,EAAE,MAAM,EAAS,GAAG,OAAO,CAC/B,UAIJ,MAAM,EAAM,EAAE,GAAG,CACjB,EAAI,EAAE,MAAM,EAAE,CAGhB,GAAI,CAAC,EAAG,MAGR,GAAI,EAAE,SAAS,IAAI,CAAE,CACnB,IAAM,EAAa,EAAM,KAAK,EAAE,CAChC,GAAI,EAAY,CACd,KAAM,CAAE,KAAM,QAAS,KAAM,EAAW,GAAI,CAC5C,E
AAI,EAAE,MAAM,EAAW,GAAG,OAAO,CACjC,UAKJ,GAAI,EAAW,KAAK,EAAE,CAAE,CAEtB,IAAI,EAAM,EACN,EAAW,GACf,KAAO,GAAO,EAAsB,IAAI,EAAI,EAAI,OAAS,GAAG,EAC1D,EAAW,EAAI,EAAI,OAAS,GAAK,EACjC,EAAM,EAAI,MAAM,EAAG,GAAG,CAExB,KAAM,CAAE,KAAM,MAAO,KAAM,EAAK,CAChC,EAAI,EACJ,SAIF,GAAI,EAAE,QAAU,GAAK,eAAe,KAAK,EAAE,EAAI,EAAE,SAAS,IAAI,CAAE,CAC9D,IAAM,EAAc,EAAO,KAAK,EAAE,CAClC,GAAI,EAAa,CACf,IAAI,EAAS,EAAY,GACrB,EAAW,EAAE,MAAM,EAAO,OAAO,CAErC,KAAO,GAAU,EAAgB,IAAI,EAAO,EAAO,OAAS,GAAG,EAC7D,EAAW,EAAO,EAAO,OAAS,GAAK,EACvC,EAAS,EAAO,MAAM,EAAG,GAAG,CAE9B,GAAI,EAAO,SAAS,IAAI,CAAE,CACxB,KAAM,CAAE,KAAM,SAAU,KAAM,EAAQ,CACtC,EAAI,EACJ,WAMN,GAAI,EAAO,IAAI,EAAE,GAAG,EAAK,EAAY,IAAI,EAAE,GAAG,EAAI,EAAE,OAAS,GAAK,EAAO,IAAI,EAAE,GAAG,CAAG,CACnF,GAAM,CAAC,EAAO,GAAS,EAAY,EAAE,CAKrC,GAJA,MAAM,EACN,EAAI,EAAE,MAAM,EAAM,CAGd,EAAG,CACL,IAAM,EAAY,EAAe,KAAK,EAAE,CACpC,IACF,KAAM,CAAE,KAAM,OAAQ,KAAM,EAAU,GAAI,CAC1C,EAAI,EAAE,MAAM,EAAU,GAAG,OAAO,EAGpC,SAIF,GAAI,UAAU,KAAK,EAAE,CAAE,CACrB,IAAI,EAAI,EACF,EAAe,IAAI,IAAI,CAAC,IAAK,IAAK,IAAK,IAAK,IAAK,IAAQ,IAAS,CAAC,CACnE,EAAe,IAAI,IAAI,CAAC,IAAK,IAAK,IAAI,CAAC,CAE7C,KAAO,EAAI,EAAE,QACX,GAAI,SAAS,KAAK,EAAE,GAAG,CACrB,YACS,EAAO,IAAI,EAAE,GAAG,CAEzB,YACS,EAAa,IAAI,EAAE,GAAG,EAAI,EAAI,EAAI,EAAE,QAAU,SAAS,KAAK,EAAE,EAAI,GAAG,CAC9E,SAEA,MAIA,EAAI,EAAE,QAAU,EAAa,IAAI,EAAE,GAAG,EACxC,IAEF,IAAM,EAAgB,EAAE,MAAM,EAAG,EAAE,CAGnC,GAAI,EAAgB,EAAc,CAAE,CAClC,KAAM,CAAE,KAAM,WAAY,KAAM,EAAe,CAC/C,EAAI,EAAE,MAAM,EAAE,CACd,SAGF,KAAM,CAAE,KAAM,OAAQ,KAAM,EAAe,CAC3C,EAAI,EAAE,MAAM,EAAE,CACd,SAIF,KAAM,CAAE,KAAM,UAAW,KAAM,EAAE,GAAI,CACrC,EAAI,EAAE,MAAM,EAAE,EAOlB,SAAU,GAAoB,EAAc,EAAoD,CAI9F,IAAM,GAHW,EAAyB,EAAiB,EAAK,CAAG,GAGrC,MAAM,UAAU,CAC1C,EAAQ,GAEZ,IAAK,IAAM,KAAa,EAAY,CAC7B,IAEH,KAAM,IAER,EAAQ,GAGR,IAAK,IAAM,KAAQ,EAAU,MAAM,MAAM,CACnC,IACF,MAAM,IASd,SAAgB,GAAI,EAAc,EAAyB,GAAe,CACxE,IAAM,EAAkB,EAAE,CAE1B,IAAK,IAAM,KAAY,GAAoB,EAAM,EAAuB,CACtE,IAAK,IAAM,KAAS,GAAc,EAAS,CACzC,EAAO,KAAK,EAAM,CAItB,OAAO,EClqBT,MAAa,GAAwC,CAEnD,GAAI,QACJ,MAAO,QACP,IAAK,MACL,OAAQ,MACR,GAAI,OACJ,MAAO,OACP,GAAI,SACJ,MAAO,S
ACP,KAAM,YACN,QAAS,YAGT,GAAI,aACJ,MAAO,aACP,IAAK,kBACL,OAAQ,kBACR,IAAK,sBACL,OAAQ,sBACR,GAAI,iBACJ,MAAO,iBACP,IAAK,eACL,OAAQ,eACR,IAAK,sBACL,OAAQ,sBAGR,IAAK,YACL,QAAS,YACT,MAAO,mBACP,WAAY,mBACZ,IAAK,iBACL,SAAU,iBACV,GAAI,SACJ,OAAQ,SACR,IAAK,kBACL,WAAY,kBACZ,IAAK,WACL,OAAQ,WACR,IAAK,WACL,OAAQ,WACR,IAAK,kBACL,SAAU,kBACV,IAAK,aACL,OAAQ,aACR,GAAI,QACJ,MAAO,QACP,IAAK,YACL,OAAQ,YACR,MAAO,YACP,SAAU,YACV,GAAI,QACJ,MAAO,QACP,GAAI,eACJ,MAAO,eACP,GAAI,cACJ,OAAQ,cAGR,GAAI,UACJ,MAAO,UACP,GAAI,QACJ,MAAO,QAGP,GAAI,sBACJ,OAAQ,sBACR,GAAI,oBACJ,OAAQ,oBACR,GAAI,mBACJ,OAAQ,mBAER,OAAQ,iBACR,IAAK,uBACL,QAAS,uBACT,IAAK,oCACL,IAAK,oCAGL,IAAK,YACL,OAAQ,YACR,IAAK,UACL,OAAQ,UACR,GAAI,WACJ,MAAO,WAGP,EAAG,SACH,KAAM,SACN,EAAG,QACH,KAAM,QACN,EAAG,SACH,KAAM,SACN,EAAG,SACH,KAAM,SACN,GAAI,aACJ,OAAQ,aACR,GAAI,aACJ,OAAQ,aACR,GAAI,YACJ,OAAQ,YACR,GAAI,YACJ,OAAQ,YAGR,IAAK,SACL,OAAQ,SACR,MAAO,UACP,SAAU,UACV,KAAM,UACN,QAAS,UACT,GAAI,aACJ,MAAO,aACP,IAAK,aACL,OAAQ,aACT,CAKY,GAAyB,IAAI,IAAI,CAC5C,OACA,UACA,QACA,MACA,UACA,QACA,MACD,CAAC,CCxHF,SAAgB,GAAiB,EAA0B,CACzD,IAAM,EAAkB,EAAE,CACtB,EAAI,EAER,KAAO,EAAI,EAAO,QAAQ,CACxB,IAAM,EAAQ,EAAO,GACf,EAAO,EAAO,EAAI,GAGxB,GAAI,EAAM,OAAS,QAAU,GAAM,OAAS,eAAiB,EAAK,OAAS,IAAK,CAC9E,IAAM,EAAmB,EAAM,KAAO,IACtC,GAAI,KAAoB,IAAiB,KAAoB,EAAe,CAC1E,EAAO,KAAK,CAAE,KAAM,OAAQ,KAAM,EAAkB,CAAC,CACrD,GAAK,EACL,UAKJ,GAAI,EAAM,OAAS,eAAiB,EAAM,QAAQ,GAAoB,GAAM,OAAS,SAAU,CAC7F,IAAM,EAAM,EAAiB,EAAM,MACnC,EAAO,KAAK,CACV,KAAM,SACN,KAAM,EAAM,KAAO,EAAK,KACxB,MAAO,EAAK,MACZ,SAAU,EACX,CAAC,CACF,GAAK,EACL,SAIF,GAAI,EAAM,OAAS,UAAY,GAAM,OAAS,OAAQ,CACpD,IAAM,EAAe,EAAK,KAC1B,GAAI,EAAgB,IAAI,EAAa,CAAE,CACrC,EAAO,KAAK,CACV,KAAM,SACN,KAAM,EAAM,KAAO,IAAM,EAAK,KAC9B,MAAO,EAAM,MACb,SAAU,EACX,CAAC,CACF,GAAK,EACL,SAGF,GAAI,KAAgB,EAAe,CACjC,IAAM,EAAa,EAAc,GACjC,EAAO,KAAK,CACV,KAAM,SACN,KAAM,EAAM,KAAO,IAAM,EAAK,KAC9B,MAAO,EAAM,MAAQ,EACrB,SAAU,MACX,CAAC,CACF,GAAK,EACL,UAKJ,GACE,EAAM,OAAS,UACf,GAAM,OAAS,QACf,CAAC,UAAW,WAAY,WAAY,gBAAgB,CAAC,SAAS,EAAK,KAAK,aAAa,CAAC,CACtF,CACA,EAAO,KAAK,CACV,KAAM,UACN,KAAM,EAAM,KAAO
,IAAM,EAAK,KAC9B,MAAO,EAAM,MACd,CAAC,CACF,GAAK,EACL,SAIF,IAAK,EAAM,OAAS,QAAU,EAAM,OAAS,YAAc,GAAM,OAAS,OAAQ,CAChF,EAAO,KAAK,CACV,KAAM,YACN,KAAM,EAAM,KAAO,IAAM,EAAK,KAC9B,KAAM,EAAM,KACZ,MAAO,EAAM,MACb,IAAK,EAAM,IACX,KAAM,EAAK,KACX,OAAQ,EAAK,OACb,OAAQ,EAAK,OACd,CAAC,CACF,GAAK,EACL,SAIF,EAAO,KAAK,EAAM,CAClB,IAGF,OAAO,EC3FT,SAAS,GAAqB,EAA2B,CAEvD,GAAI,EAAU,OAAS,SAAW,EAAU,OAAS,UACnD,MAAO,GAIT,GAAI,EAAU,OAAS,QAAU,EAAU,KAAK,OAAS,EAAG,CAC1D,IAAM,EAAY,EAAU,KAAK,GACjC,GAAI,IAAc,EAAU,aAAa,EAAI,IAAc,EAAU,aAAa,CAShF,MADA,EANc,EAAU,KAAK,aAAa,GAE7B,GAET,EAAe,EAAU,KAAK,EAE9B,EAAgB,IAAI,EAAU,KAAK,EAK3C,MAAO,GAMT,SAAgB,GAAmB,EAA0B,CAC3D,GAAI,EAAO,SAAW,EAAG,MAAO,EAAE,CAElC,IAAM,EAAkB,EAAE,CACtB,EAAa,GACb,EAAI,EAEF,OAA8B,CAAE,KAAM,UAAW,KAAM,KAAM,EAC7D,OAA4B,CAAE,KAAM,QAAS,KAAM,KAAM,EAE/D,KAAO,EAAI,EAAO,QAAQ,CACxB,IAAM,EAAQ,EAAO,GACf,EAAO,EAAO,EAAI,GAGxB,GAAI,EAAM,OAAS,UAAW,CAC5B,AAEE,KADA,EAAO,KAAK,GAAa,CAAC,CACb,IAGf,IACA,SAUF,GANA,AAEE,KADA,EAAO,KAAK,GAAe,CAAC,CACf,IAIX,EAAM,OAAS,eAAiB,EAAgB,IAAI,EAAM,WAAW,CAAE,CAEzE,GAAI,EAAM,aAAe,KAAO,GAAQ,CAAC,GAAqB,EAAK,CAAE,CACnE,EAAO,KAAK,EAAM,CAClB,IACA,SAIF,IAAI,EAAe,EAAM,KACrB,EAAI,EAAI,EACZ,KAAO,EAAI,EAAO,QAAQ,CACxB,IAAM,EAAU,EAAO,GAEvB,GADI,EAAQ,OAAS,eACjB,CAAC,EAAmB,IAAI,EAAQ,WAAW,CAAE,MACjD,GAAgB,EAAQ,KACxB,IAaF,IATI,EAAI,EAAI,GACV,EAAO,KAAK,CAAE,GAAG,EAAO,KAAM,EAAc,CAAC,CAC7C,EAAI,IAEJ,EAAO,KAAK,EAAM,CAClB,KAIK,EAAI,EAAO,QAAQ,CACxB,IAAM,EAAM,EAAO,GAEnB,GADI,EAAI,OAAS,eACb,CAAC,EAAmB,IAAI,EAAI,WAAW,CAAE,MAC7C,EAAO,KAAK,EAAI,CAChB,IAIF,EAAO,KAAK,GAAa,CAAC,CAC1B,EAAa,GACb,SAIF,EAAO,KAAK,EAAM,CAClB,IAQF,OAJI,GACF,EAAO,KAAK,GAAa,CAAC,CAGrB,EC3GT,MAAM,GAAoB,IAAI,IAAI,CAAC,IAAQ,IAAQ,CAAC,CAKpD,SAAS,GAAS,EAAc,EAAe,GAAsB,CAKnE,OAJI,EAAM,OAAS,QAEf,CAAC,GAAgB,EAAgB,IAAI,EAAM,KAAK,CAAS,KAEtD,EADO,EAAM,KAAK,aAAa,GACd,KAM1B,SAAS,GAAkB,EAAuB,CAChD,OAAO,EAAM,OAAS,eAAiB,GAAkB,IAAI,EAAM,KAAK,CAY1E,SAAS,GAAiB,EAAiB,EAA4C,CACrF,IAAM,EAAoB,EAAE,CACxB,EAAI,EAGR,KAAO,EAAI,EAAO,QAAQ,CACxB,IAAM,EAAO,EAAO,GACd,EAAS,EAAO,EAAI,GAG1B,GAAI,GAAM,OAAS,QAAU,CAAC,GAAU,CA
AC,GAAkB,EAAO,CAChE,MAGF,EAAS,KAAK,EAAK,CACnB,EAAS,KAAK,EAAO,CACrB,GAAK,EAGL,IAAM,EAAa,EAAO,GACtB,GAAY,OAAS,eAAiB,EAAW,OAAS,MAC5D,EAAS,KAAK,EAAW,CACzB,KAKJ,GAAI,EAAS,SAAW,EACtB,OAAO,KAIT,IAAM,EAAc,EAAO,GAC3B,GACE,CAAC,GACD,EAAY,OAAS,QACpB,EAAY,KAAK,aAAa,GAAK,MAAQ,EAAY,KAAK,aAAa,GAAK,MAE/E,OAAO,KAIT,IAAM,EAAS,EAAO,EAAI,GAC1B,GAAI,CAAC,GAAU,EAAO,OAAS,OAC7B,OAAO,KAOT,IAAI,EADU,CAAC,GAAG,EAAU,EAAa,EAAO,CAC/B,IAAK,GAAM,EAAE,KAAK,CAAC,KAAK,IAAI,CAG7C,MAFA,GAAO,EAAK,QAAQ,MAAO,IAAI,CAAC,QAAQ,MAAO,IAAI,CAE5C,CAAC,CAAE,KAAM,OAAQ,OAAM,CAAE,EAAI,EAAE,CAMxC,SAAgB,EAAe,EAA0B,CACvD,IAAM,EAAkB,EAAE,CACtB,EAAI,EAER,KAAO,EAAI,EAAO,QAAQ,CACxB,IAAM,EAAQ,EAAO,GACf,EAAO,EAAO,EAAI,GAGlB,EAAW,GAAiB,EAAQ,EAAE,CAC5C,GAAI,EAAU,CACZ,EAAO,KAAK,EAAS,GAAG,CACxB,EAAI,EAAS,GACb,SAIF,GAAI,EAAM,OAAS,QAAU,GAAM,OAAS,eAAiB,EAAK,OAAS,IAAK,CAC9E,IAAM,EAAO,EAAM,KAAK,QAAQ,MAAO,GAAG,CAC1C,GAAI,GAAuB,IAAI,EAAK,CAAE,CAEpC,EAAO,KAAK,CAAE,KAAM,OAAQ,KAAM,EAAM,KAAO,IAAK,CAAC,CACrD,GAAK,EACL,UAKJ,IAAK,EAAM,OAAS,QAAU,EAAM,OAAS,WAAa,GAAM,OAAS,OAAQ,CAC/E,IAAM,GAAM,EAAM,KAAkB,EAAM,OACtC,EAAwB,KAM5B,GALI,EAAI,IAAI,EAAK,KAAK,CACpB,EAAS,CAAC,EACD,EAAG,IAAI,EAAK,KAAK,GAC1B,EAAS,GAEP,IAAW,KAAM,CACnB,IAAI,EAAO,EAAM,KAAO,IAAM,EAAK,KACnC,GAAK,EAED,EAAO,IAAI,OAAS,eAAiB,EAAO,GAAG,OAAS,MAC1D,GAAQ,IACR,KAEF,EAAO,KAAK,CAAE,KAAM,OAAQ,OAAM,MAAO,EAAQ,CAAC,CAClD,UAKJ,IAAK,EAAM,OAAS,WAAa,EAAM,OAAS,WAAa,GAAM,OAAS,OAAQ,CAClF,IAAM,EAAQ,GAAS,EAAM,GAAK,CAClC,GAAI,IAAU,KAAM,CAClB,IAAM,GAAM,EAAM,KAAqB,EAAM,OAC7C,EAAO,KAAK,CACV,KAAM,UACN,KAAM,EAAM,KAAO,IAAM,EAAK,KAC9B,KAAM,EACN,QACA,MACD,CAAC,CACF,GAAK,EACL,UAKJ,IACG,EAAM,OAAS,QAAU,EAAM,OAAS,YACzC,EAAM,OAAS,GACf,GAAM,OAAS,SACf,CACA,IAAM,EAAO,EAAK,MAClB,GAAI,GAAQ,MAAQ,GAAQ,KAAM,CAChC,EAAO,KAAK,CACV,KAAM,UACN,KAAM,EAAM,KAAO,IAAM,EAAK,KAC9B,OACA,MAAO,EAAM,MACb,IAAK,EAAM,IACZ,CAAC,CACF,GAAK,EACL,UAKJ,IACG,EAAM,OAAS,QAAU,EAAM,OAAS,YACzC,EAAM,OAAS,GACf,GAAM,OAAS,OACf,CACA,EAAO,KAAK,CACV,KAAM,UACN,KAAM,EAAM,KAAO,IAAM,EAAK,KAC9B,KAAM,EAAK,MACX,MAAO,EAAM,MACb,IAAK,EAAM,IACZ,CAAC,CACF,GAAK,EACL,SAIF,GACE
,EAAM,OAAS,QACf,EAAc,IAAI,EAAM,KAAK,aAAa,CAAC,EAC3C,GAAM,OAAS,OACf,CACA,EAAO,KAAK,CACV,GAAG,EACH,KAAM,EAAM,KAAO,IAAM,EAAK,KAC/B,CAAC,CACF,GAAK,EACL,SAIF,GACE,EAAM,OAAS,QACf,EAAc,IAAI,EAAM,KAAK,aAAa,CAAC,EAC3C,GAAM,OAAS,OACf,CACA,IAAM,EAAY,GAAc,EAAK,KAAK,aAAa,EACvD,GAAI,EAAW,CACb,EAAO,KAAK,CACV,KAAM,OACN,KAAM,EAAM,KAAO,IAAM,EAAK,KAC9B,KAAM,EAAU,GAChB,OAAQ,EAAU,GAClB,OAAQ,EAAU,GACnB,CAAC,CACF,GAAK,EACL,UAKJ,EAAO,KAAK,EAAM,CAClB,IAGF,OAAO,ECnOT,SAAgB,EAAS,EAAc,EAA2B,EAAE,CAAW,CAC7E,GAAM,CAAE,yBAAyB,GAAM,yBAAyB,IAAU,EAItE,EAAS,GAAI,EAAM,EAAuB,CAgB9C,MAbA,GAAS,GAAiB,EAAO,CAGjC,EAAS,EAAe,EAAO,CAG/B,AAIE,EAJE,EACO,GAAmB,EAAO,CAG1B,EAAO,OAAQ,GAAM,EAAE,OAAS,UAAU,CAG9C,EC7BT,SAAgB,GAAmB,EAAwB,CACzD,IAAM,EAAS,EAAS,EAAM,CAAE,uBAAwB,GAAM,CAAC,CACzD,EAAsB,EAAE,CAC1B,EAA4B,EAAE,CAElC,IAAK,IAAM,KAAS,EACd,EAAM,OAAS,UACjB,EAAkB,EAAE,CACX,EAAM,OAAS,SACpB,EAAgB,OAAS,GAC3B,EAAU,KAAK,EAAW,EAAgB,CAAC,CAE7C,EAAkB,EAAE,EACX,EAAM,OAAS,MACxB,EAAgB,KAAK,GAAa,EAAM,CAAC,CAS7C,OAJI,EAAgB,OAAS,GAC3B,EAAU,KAAK,EAAW,EAAgB,CAAC,CAGtC,EAMT,SAAS,GAAa,EAAsB,CAI1C,OAHI,EAAM,OAAS,cACV,EAAM,WAER,EAAM,MAAQ,GAMvB,SAAS,EAAW,EAAyB,CAC3C,GAAI,EAAM,SAAW,EAAG,MAAO,GAE/B,IAAI,EAAS,EAAM,GAEnB,IAAK,IAAI,EAAI,EAAG,EAAI,EAAM,OAAQ,IAAK,CACrC,IAAM,EAAO,EAAM,EAAI,GACjB,EAAO,EAAM,GAGA,GAAe,EAAM,EAAK,CAE3C,GAAU,IAAM,EAEhB,GAAU,EAId,OAAO,EAMT,SAAS,GAAe,EAAc,EAAuB,CAC3D,GAAI,CAAC,GAAQ,CAAC,EAAM,MAAO,GAE3B,IAAM,EAAW,EAAK,EAAK,OAAS,GAC9B,EAAY,EAAK,GA6BvB,MAFA,EAvBqB,IAAI,IAAI,CAAC,IAAK,IAAK,IAAU,IAAU,IAAU,IAAI,CAAC,CAC1D,IAAI,EAAS,EAIT,IAAI,IAAI,CAC3B,IACA,IACA,IACA,IACA,IACA,IACA,IACA,IACA,IACA,IACA,IACA,IACA,IACD,CAAC,CACe,IAAI,EAAU,EAG3B,IAAa,KAAO,IAAc"}
package/package.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "name": "tokenize-is",
3
+ "version": "0.1.0",
4
+ "description": "TypeScript tokenizer for Icelandic text",
5
+ "keywords": [
6
+ "icelandic",
7
+ "nlp",
8
+ "sentence-splitting",
9
+ "text-processing",
10
+ "tokenizer"
11
+ ],
12
+ "license": "MIT",
13
+ "author": "Jökull Sólberg",
14
+ "repository": {
15
+ "type": "git",
16
+ "url": "https://github.com/jokull/tokenize-ts"
17
+ },
18
+ "files": [
19
+ "dist"
20
+ ],
21
+ "type": "module",
22
+ "sideEffects": false,
23
+ "exports": {
24
+ ".": {
25
+ "types": "./dist/index.d.mts",
26
+ "import": "./dist/index.mjs"
27
+ },
28
+ "./package.json": "./package.json"
29
+ },
30
+ "scripts": {
31
+ "build": "tsdown",
32
+ "test": "vitest run",
33
+ "test:watch": "vitest",
34
+ "typecheck": "tsc --noEmit",
35
+ "lint": "oxlint --config .config/.oxlintrc.json --type-aware --type-check",
36
+ "lint:fix": "oxlint --config .config/.oxlintrc.json --type-aware --type-check --fix",
37
+ "format": "oxfmt --config .config/.oxfmtrc.json --write .",
38
+ "format:check": "oxfmt --config .config/.oxfmtrc.json --check .",
39
+ "check": "pnpm lint && pnpm format:check && pnpm typecheck",
40
+ "changeset": "changeset",
41
+ "version": "changeset version",
42
+ "release": "pnpm build && changeset publish",
43
+ "prepare": "lefthook install"
44
+ },
45
+ "devDependencies": {
46
+ "@changesets/cli": "^2.29.8",
47
+ "@types/node": "^25.0.10",
48
+ "lefthook": "^2.0.15",
49
+ "oxlint": "^1.41.0",
50
+ "oxlint-tsgolint": "^0.11.1",
51
+ "tsdown": "^0.20.1",
52
+ "typescript": "^5.9.3",
53
+ "vitest": "^4.0.18"
54
+ },
55
+ "packageManager": "pnpm@10.10.0"
56
+ }