@peaceroad/markdown-it-strong-ja 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -0
- package/package.json +8 -8
- package/src/token-compat.js +8 -6
- package/src/token-core.js +54 -35
- package/src/token-postprocess/guards.js +7 -5
- package/src/token-utils.js +68 -2
package/README.md
CHANGED
|
@@ -357,6 +357,83 @@ Supporting visuals:
|
|
|
357
357
|
- `aggressive`:
|
|
358
358
|
`<p>broken **tail <a href="https://x.test">aa<strong>aa</strong><em>Text</em><strong>and<em>More</em>bb</strong>bb</a> after</p>`
|
|
359
359
|
|
|
360
|
+
## Compatibility Notes
|
|
361
|
+
|
|
362
|
+
### `markdown-it-attrs` 5.x parity
|
|
363
|
+
|
|
364
|
+
When `markdown-it-attrs` is installed, strong-ja follows the token stream produced by that plugin and does not reinterpret where `{...}` attributes should be attached. This is intentional: strong-ja should not make attribute syntax mean something different from `markdown-it-attrs` alone.
|
|
365
|
+
|
|
366
|
+
One edge case to be aware of is a tight list item followed by an emphasized line:
|
|
367
|
+
|
|
368
|
+
```markdown
|
|
369
|
+
- e {.li-style}
|
|
370
|
+
*{.ul-style}*
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
With `markdown-it-attrs` 5.x, the first attribute block is consumed as a block-level attribute on the hidden `paragraph_open` inside the tight list. Because that paragraph token is hidden by markdown-it's tight-list rendering, the class is not visible in the final HTML. The second `{.ul-style}` is inside emphasis text, not a suffix after a closed inline token, so it remains literal text:
|
|
374
|
+
|
|
375
|
+
```html
|
|
376
|
+
<ul>
|
|
377
|
+
<li>e
|
|
378
|
+
<em>{.ul-style}</em></li>
|
|
379
|
+
</ul>
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
This output matches `markdown-it-attrs` alone. To attach attributes intentionally, use the syntax owned by `markdown-it-attrs`, for example:
|
|
383
|
+
|
|
384
|
+
```markdown
|
|
385
|
+
- e
|
|
386
|
+
{.ul-style}
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
```html
|
|
390
|
+
<ul class="ul-style">
|
|
391
|
+
<li>e</li>
|
|
392
|
+
</ul>
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
Alternatively, attach inline attributes after the closing inline token:
|
|
396
|
+
|
|
397
|
+
```markdown
|
|
398
|
+
- e
|
|
399
|
+
*x*{.ul-style}
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
```html
|
|
403
|
+
<ul>
|
|
404
|
+
<li>e
|
|
405
|
+
<em class="ul-style">x</em></li>
|
|
406
|
+
</ul>
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
strong-ja keeps this behavior for parity with `markdown-it-attrs` rather than adding a local workaround.
|
|
410
|
+
|
|
411
|
+
### `markdown-it` 14.2 astral delimiter policy
|
|
412
|
+
|
|
413
|
+
`markdown-it` 14.2 recognizes astral characters (surrogate pairs) as full Unicode code points when scanning emphasis delimiters. strong-ja keeps `compatible` mode aligned with that upstream behavior.
|
|
414
|
+
|
|
415
|
+
In Japanese modes, strong-ja still adds its own delimiter relaxation only when Japanese/CJK context is present. Astral Han characters, such as those in CJK Unified Ideographs Extension B, are treated as CJK context:
|
|
416
|
+
|
|
417
|
+
```markdown
|
|
418
|
+
*𠀋?*abc*
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
```html
|
|
422
|
+
<p><em>𠀋?</em>abc*</p>
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
Emoji or symbol-only English contexts remain aligned with `markdown-it` and are not promoted just because they are astral characters:
|
|
426
|
+
|
|
427
|
+
```markdown
|
|
428
|
+
*😀?*abc*
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
```html
|
|
432
|
+
<p>*😀?<em>abc</em></p>
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
Symbols inside Japanese prose may still be emphasized by the existing Japanese-context rule; for example, `**😀**です` can render as `<p><strong>😀</strong>です</p>`. Use `mode: 'compatible'` when exact `markdown-it` 14.2 delimiter behavior is required.
|
|
436
|
+
|
|
360
437
|
## Options
|
|
361
438
|
|
|
362
439
|
### `mode`
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@peaceroad/markdown-it-strong-ja",
|
|
3
3
|
"description": "Extends asterisk emphasis handling for Japanese text while keeping markdown-it behavior as close as practical.",
|
|
4
|
-
"version": "0.9.
|
|
4
|
+
"version": "0.9.2",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"files": [
|
|
@@ -34,16 +34,16 @@
|
|
|
34
34
|
"author": "peaceroad <peaceroad@gmail.com>",
|
|
35
35
|
"license": "MIT",
|
|
36
36
|
"dependencies": {
|
|
37
|
-
"markdown-it": "^14.
|
|
37
|
+
"markdown-it": "^14.2.0"
|
|
38
38
|
},
|
|
39
39
|
"devDependencies": {
|
|
40
|
-
"@peaceroad/markdown-it-cjk-breaks-mod": "^0.1.
|
|
41
|
-
"@peaceroad/markdown-it-hr-sandwiched-semantic-container": "^0.
|
|
42
|
-
"@peaceroad/markdown-it-renderer-image": "^0.
|
|
43
|
-
"@peaceroad/markdown-it-renderer-inline-text": "^0.8.
|
|
44
|
-
"markdown-it-attrs": "^
|
|
40
|
+
"@peaceroad/markdown-it-cjk-breaks-mod": "^0.1.11",
|
|
41
|
+
"@peaceroad/markdown-it-hr-sandwiched-semantic-container": "^0.12.0",
|
|
42
|
+
"@peaceroad/markdown-it-renderer-image": "^0.16.0",
|
|
43
|
+
"@peaceroad/markdown-it-renderer-inline-text": "^0.8.1",
|
|
44
|
+
"markdown-it-attrs": "^5.0.0",
|
|
45
45
|
"markdown-it-sub": "^2.0.0",
|
|
46
46
|
"markdown-it-sup": "^2.0.0",
|
|
47
|
-
"p7d-markdown-it-p-captions": "^0.
|
|
47
|
+
"p7d-markdown-it-p-captions": "^0.23.0"
|
|
48
48
|
}
|
|
49
49
|
}
|
package/src/token-compat.js
CHANGED
|
@@ -3,6 +3,8 @@ import {
|
|
|
3
3
|
REG_ATTRS,
|
|
4
4
|
isJapaneseChar,
|
|
5
5
|
isAsciiWordCode,
|
|
6
|
+
codePointAtSafe,
|
|
7
|
+
codePointBeforeSafe,
|
|
6
8
|
hasCjkBreaksRule,
|
|
7
9
|
isCjkBreaksRuleName,
|
|
8
10
|
getRuntimeOpt,
|
|
@@ -119,8 +121,8 @@ const registerTokenCompat = (md, baseOpt) => {
|
|
|
119
121
|
if (!prevToken || !nextToken) continue
|
|
120
122
|
if (prevToken.type !== 'text' || !prevToken.content) continue
|
|
121
123
|
if (nextToken.type !== 'text' || !nextToken.content) continue
|
|
122
|
-
const prevCharCode = prevToken.content
|
|
123
|
-
const nextCharCode = nextToken.content
|
|
124
|
+
const prevCharCode = codePointBeforeSafe(prevToken.content, prevToken.content.length, 0)
|
|
125
|
+
const nextCharCode = codePointAtSafe(nextToken.content, 0, 0)
|
|
124
126
|
const isAsciiWord = isAsciiWordCode(nextCharCode)
|
|
125
127
|
const shouldReplace = isAsciiWord &&
|
|
126
128
|
isJapaneseChar(prevCharCode) && !isJapaneseChar(nextCharCode)
|
|
@@ -138,8 +140,8 @@ const registerTokenCompat = (md, baseOpt) => {
|
|
|
138
140
|
for (let idx = 0; idx < child.content.length; idx++) {
|
|
139
141
|
const ch = child.content[idx]
|
|
140
142
|
if (ch === '\n') {
|
|
141
|
-
const prevCharCode =
|
|
142
|
-
const nextCharCode =
|
|
143
|
+
const prevCharCode = codePointBeforeSafe(child.content, idx, 0)
|
|
144
|
+
const nextCharCode = codePointAtSafe(child.content, idx + 1, 0)
|
|
143
145
|
const isAsciiWord = isAsciiWordCode(nextCharCode)
|
|
144
146
|
const shouldReplace = isAsciiWord &&
|
|
145
147
|
isJapaneseChar(prevCharCode) && !isJapaneseChar(nextCharCode)
|
|
@@ -187,7 +189,7 @@ const registerTokenCompat = (md, baseOpt) => {
|
|
|
187
189
|
if (!prevTextCharCode || !isJapaneseChar(prevTextCharCode)) continue
|
|
188
190
|
const next = children[j + 1]
|
|
189
191
|
if (!next || next.type !== 'text' || !next.content) continue
|
|
190
|
-
const nextCharCode = next.content
|
|
192
|
+
const nextCharCode = codePointAtSafe(next.content, 0, 0)
|
|
191
193
|
if (nextCharCode !== 0x7B) continue
|
|
192
194
|
child.type = 'softbreak'
|
|
193
195
|
child.tag = ''
|
|
@@ -196,7 +198,7 @@ const registerTokenCompat = (md, baseOpt) => {
|
|
|
196
198
|
child.info = ''
|
|
197
199
|
continue
|
|
198
200
|
}
|
|
199
|
-
prevTextCharCode = child.content
|
|
201
|
+
prevTextCharCode = codePointBeforeSafe(child.content, child.content.length, 0)
|
|
200
202
|
}
|
|
201
203
|
}
|
|
202
204
|
}
|
package/src/token-core.js
CHANGED
|
@@ -3,6 +3,10 @@ import Token from 'markdown-it/lib/token.mjs'
|
|
|
3
3
|
import {
|
|
4
4
|
CHAR_ASTERISK,
|
|
5
5
|
CHAR_NEWLINE,
|
|
6
|
+
codePointAtSafe,
|
|
7
|
+
codePointBeforeSafe,
|
|
8
|
+
codePointStartBefore,
|
|
9
|
+
codePointSize,
|
|
6
10
|
isJapaneseChar,
|
|
7
11
|
isAsciiWordCode,
|
|
8
12
|
isSoftSpaceCode,
|
|
@@ -280,24 +284,34 @@ const buildScanDelimsLookupCache = (src) => {
|
|
|
280
284
|
|
|
281
285
|
let prev = -1
|
|
282
286
|
for (let i = 0; i < len; i++) {
|
|
283
|
-
const code = src
|
|
287
|
+
const code = codePointAtSafe(src, i)
|
|
284
288
|
if (code === CHAR_NEWLINE) {
|
|
285
289
|
prev = -1
|
|
286
290
|
continue
|
|
287
291
|
}
|
|
292
|
+
const size = codePointSize(code)
|
|
288
293
|
if (!isSoftSpaceCode(code)) prev = i
|
|
289
294
|
prevNonSpaceSameLine[i] = prev
|
|
295
|
+
if (size === 2 && i + 1 < len) {
|
|
296
|
+
prevNonSpaceSameLine[i + 1] = prev
|
|
297
|
+
i++
|
|
298
|
+
}
|
|
290
299
|
}
|
|
291
300
|
|
|
292
301
|
let next = -1
|
|
293
302
|
for (let i = len - 1; i >= 0; i--) {
|
|
294
|
-
const
|
|
303
|
+
const cpStart = codePointStartBefore(src, i + 1)
|
|
304
|
+
const code = cpStart === -1 ? 0 : codePointAtSafe(src, cpStart)
|
|
295
305
|
if (code === CHAR_NEWLINE) {
|
|
296
306
|
next = -1
|
|
297
307
|
continue
|
|
298
308
|
}
|
|
299
|
-
if (!isSoftSpaceCode(code)) next =
|
|
309
|
+
if (!isSoftSpaceCode(code)) next = cpStart
|
|
300
310
|
nextNonSpaceSameLine[i] = next
|
|
311
|
+
if (cpStart !== i) {
|
|
312
|
+
nextNonSpaceSameLine[cpStart] = next
|
|
313
|
+
i = cpStart
|
|
314
|
+
}
|
|
301
315
|
}
|
|
302
316
|
|
|
303
317
|
return {
|
|
@@ -323,11 +337,13 @@ const findPrevNonSpaceIndex = (src, start, lookupCache = null) => {
|
|
|
323
337
|
start < lookupCache.prevNonSpaceSameLine.length) {
|
|
324
338
|
return lookupCache.prevNonSpaceSameLine[start]
|
|
325
339
|
}
|
|
326
|
-
for (let i = start; i >= 0;
|
|
327
|
-
const
|
|
340
|
+
for (let i = start; i >= 0;) {
|
|
341
|
+
const cpStart = codePointStartBefore(src, i + 1)
|
|
342
|
+
if (cpStart === -1) return -1
|
|
343
|
+
const code = codePointAtSafe(src, cpStart)
|
|
328
344
|
if (code === CHAR_NEWLINE) return -1
|
|
329
|
-
if (isSoftSpaceCode(code))
|
|
330
|
-
|
|
345
|
+
if (!isSoftSpaceCode(code)) return cpStart
|
|
346
|
+
i = cpStart - 1
|
|
331
347
|
}
|
|
332
348
|
return -1
|
|
333
349
|
}
|
|
@@ -340,11 +356,11 @@ const findNextNonSpaceIndex = (src, start, max, lookupCache = null) => {
|
|
|
340
356
|
const next = lookupCache.nextNonSpaceSameLine[start]
|
|
341
357
|
return next !== -1 && next < max ? next : -1
|
|
342
358
|
}
|
|
343
|
-
for (let i = start; i < max;
|
|
344
|
-
const code = src
|
|
359
|
+
for (let i = start; i < max;) {
|
|
360
|
+
const code = codePointAtSafe(src, i)
|
|
345
361
|
if (code === CHAR_NEWLINE) return -1
|
|
346
|
-
if (isSoftSpaceCode(code))
|
|
347
|
-
|
|
362
|
+
if (!isSoftSpaceCode(code)) return i
|
|
363
|
+
i += codePointSize(code)
|
|
348
364
|
}
|
|
349
365
|
return -1
|
|
350
366
|
}
|
|
@@ -353,26 +369,26 @@ const hasAsciiStartAfterOptionalOpenWrappers = (src, index, max, lookupCache = n
|
|
|
353
369
|
let i = index
|
|
354
370
|
// Two wrappers are enough for common shapes: * [ "word" ]*
|
|
355
371
|
for (let wrappers = 0; wrappers < 2 && i >= 0 && i < max; wrappers++) {
|
|
356
|
-
const code = src
|
|
372
|
+
const code = codePointAtSafe(src, i)
|
|
357
373
|
if (!isAsciiGuardOpenWrapper(code)) break
|
|
358
374
|
i = findNextNonSpaceIndex(src, i + 1, max, lookupCache)
|
|
359
375
|
if (i === -1) return false
|
|
360
376
|
}
|
|
361
377
|
if (i < 0 || i >= max) return false
|
|
362
|
-
return isAsciiWordCode(src
|
|
378
|
+
return isAsciiWordCode(codePointAtSafe(src, i))
|
|
363
379
|
}
|
|
364
380
|
|
|
365
381
|
const hasAsciiEndBeforeOptionalCloseWrappers = (src, index, lookupCache = null) => {
|
|
366
382
|
let i = index
|
|
367
383
|
// Two wrappers are enough for common shapes: *["word"] *
|
|
368
384
|
for (let wrappers = 0; wrappers < 2 && i >= 0; wrappers++) {
|
|
369
|
-
const code = src
|
|
385
|
+
const code = codePointAtSafe(src, i)
|
|
370
386
|
if (!isAsciiGuardCloseWrapper(code)) break
|
|
371
387
|
i = findPrevNonSpaceIndex(src, i - 1, lookupCache)
|
|
372
388
|
if (i === -1) return false
|
|
373
389
|
}
|
|
374
390
|
if (i < 0) return false
|
|
375
|
-
return isAsciiWordCode(src
|
|
391
|
+
return isAsciiWordCode(codePointAtSafe(src, i))
|
|
376
392
|
}
|
|
377
393
|
|
|
378
394
|
const isMarkdownStructuralOpenWrapper = (code) => {
|
|
@@ -409,18 +425,20 @@ const findPrevNonSpaceLimited = (src, start, maxLook, lookupCache = null) => {
|
|
|
409
425
|
start < lookupCache.prevNonSpaceSameLine.length) {
|
|
410
426
|
const prev = lookupCache.prevNonSpaceSameLine[start]
|
|
411
427
|
if (prev !== -1 && (start - prev) < maxLook) {
|
|
412
|
-
return src
|
|
428
|
+
return codePointAtSafe(src, prev)
|
|
413
429
|
}
|
|
414
430
|
return 0
|
|
415
431
|
}
|
|
416
432
|
let looked = 0
|
|
417
|
-
for (let i = start; i >= 0;
|
|
433
|
+
for (let i = start; i >= 0;) {
|
|
418
434
|
if (looked >= maxLook) break
|
|
419
|
-
const
|
|
420
|
-
|
|
435
|
+
const cpStart = codePointStartBefore(src, i + 1)
|
|
436
|
+
if (cpStart === -1) break
|
|
437
|
+
const code = codePointAtSafe(src, cpStart)
|
|
438
|
+
looked += i - cpStart + 1
|
|
421
439
|
if (code === CHAR_NEWLINE) return 0
|
|
422
|
-
if (isSoftSpaceCode(code))
|
|
423
|
-
|
|
440
|
+
if (!isSoftSpaceCode(code)) return code
|
|
441
|
+
i = cpStart - 1
|
|
424
442
|
}
|
|
425
443
|
return 0
|
|
426
444
|
}
|
|
@@ -432,18 +450,19 @@ const findNextNonSpaceLimited = (src, start, max, maxLook, lookupCache = null) =
|
|
|
432
450
|
start < lookupCache.nextNonSpaceSameLine.length) {
|
|
433
451
|
const next = lookupCache.nextNonSpaceSameLine[start]
|
|
434
452
|
if (next !== -1 && next < max && (next - start) < maxLook) {
|
|
435
|
-
return src
|
|
453
|
+
return codePointAtSafe(src, next)
|
|
436
454
|
}
|
|
437
455
|
return 0
|
|
438
456
|
}
|
|
439
457
|
let looked = 0
|
|
440
|
-
for (let i = start; i < max;
|
|
458
|
+
for (let i = start; i < max;) {
|
|
441
459
|
if (looked >= maxLook) break
|
|
442
|
-
const code = src
|
|
443
|
-
|
|
460
|
+
const code = codePointAtSafe(src, i)
|
|
461
|
+
const size = codePointSize(code)
|
|
462
|
+
looked += size
|
|
444
463
|
if (code === CHAR_NEWLINE) return 0
|
|
445
|
-
if (isSoftSpaceCode(code))
|
|
446
|
-
|
|
464
|
+
if (!isSoftSpaceCode(code)) return code
|
|
465
|
+
i += size
|
|
447
466
|
}
|
|
448
467
|
return 0
|
|
449
468
|
}
|
|
@@ -462,8 +481,8 @@ const hasJapaneseContextForBracketWrapper = (src, start, pos, max, lastChar, nex
|
|
|
462
481
|
|
|
463
482
|
const scanPrevSingleStarContextFlags = (src, start) => {
|
|
464
483
|
let hasJapaneseBetween = false
|
|
465
|
-
for (let i = start
|
|
466
|
-
const code = src
|
|
484
|
+
for (let i = codePointStartBefore(src, start); i >= 0; i = codePointStartBefore(src, i)) {
|
|
485
|
+
const code = codePointAtSafe(src, i)
|
|
467
486
|
if (code === CHAR_NEWLINE) break
|
|
468
487
|
if (isSentenceBoundaryStop(code) && i < start - 1) break
|
|
469
488
|
if (code !== CHAR_ASTERISK) {
|
|
@@ -475,8 +494,8 @@ const scanPrevSingleStarContextFlags = (src, start) => {
|
|
|
475
494
|
backslashCount++
|
|
476
495
|
}
|
|
477
496
|
if ((backslashCount % 2) === 1) continue
|
|
478
|
-
const prevCode = i
|
|
479
|
-
const nextCode =
|
|
497
|
+
const prevCode = codePointBeforeSafe(src, i, 0)
|
|
498
|
+
const nextCode = codePointAtSafe(src, i + 1, 0)
|
|
480
499
|
if (prevCode === CHAR_ASTERISK || nextCode === CHAR_ASTERISK) continue
|
|
481
500
|
return hasJapaneseBetween ? PREV_STAR_HAS_OPENER | PREV_STAR_HAS_JP_BETWEEN : PREV_STAR_HAS_OPENER
|
|
482
501
|
}
|
|
@@ -778,12 +797,12 @@ const patchScanDelims = (md) => {
|
|
|
778
797
|
const aggressiveMode = (modeFlags & MODE_FLAG_AGGRESSIVE) !== 0
|
|
779
798
|
const max = this.posMax
|
|
780
799
|
let lookupCache = null
|
|
781
|
-
const lastChar =
|
|
800
|
+
const lastChar = codePointBeforeSafe(src, start, 0x20)
|
|
782
801
|
|
|
783
802
|
const count = base && base.length ? base.length : 1
|
|
784
803
|
const pos = start + count
|
|
785
804
|
|
|
786
|
-
const nextChar =
|
|
805
|
+
const nextChar = codePointAtSafe(src, pos, 0x20)
|
|
787
806
|
let prevStarFlags = -1
|
|
788
807
|
|
|
789
808
|
const leftJapanese = isJapaneseChar(lastChar)
|
|
@@ -819,7 +838,7 @@ const patchScanDelims = (md) => {
|
|
|
819
838
|
lookupCache || (lookupCache = getScanDelimsLookupCache(this))
|
|
820
839
|
)
|
|
821
840
|
if (prevNonSpaceIdx !== -1) {
|
|
822
|
-
const prevNonSpaceLocal = src
|
|
841
|
+
const prevNonSpaceLocal = codePointAtSafe(src, prevNonSpaceIdx)
|
|
823
842
|
const plusStrictAsciiBoundary = plusMode &&
|
|
824
843
|
hasAsciiEndBeforeOptionalCloseWrappers(src, prevNonSpaceIdx, lookupCache)
|
|
825
844
|
if (prevNonSpaceLocal !== CHAR_ASTERISK && !plusStrictAsciiBoundary) {
|
|
@@ -835,7 +854,7 @@ const patchScanDelims = (md) => {
|
|
|
835
854
|
lookupCache || (lookupCache = getScanDelimsLookupCache(this))
|
|
836
855
|
)
|
|
837
856
|
if (nextNonSpaceIdx !== -1) {
|
|
838
|
-
const nextNonSpace = src
|
|
857
|
+
const nextNonSpace = codePointAtSafe(src, nextNonSpaceIdx)
|
|
839
858
|
const plusStrictAsciiBoundary = plusMode &&
|
|
840
859
|
hasAsciiStartAfterOptionalOpenWrappers(src, nextNonSpaceIdx, max, lookupCache)
|
|
841
860
|
if (nextNonSpace !== CHAR_ASTERISK && !plusStrictAsciiBoundary) {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { isJapaneseChar } from '../token-utils.js'
|
|
1
|
+
import { codePointAtSafe, codePointBeforeSafe, codePointSize, isJapaneseChar } from '../token-utils.js'
|
|
2
2
|
|
|
3
3
|
const CHAR_ASTERISK = 0x2A // *
|
|
4
4
|
const INLINE_REPAIR_EM_OUTER_STRONG_SEQUENCE = 1 << 0
|
|
@@ -46,11 +46,13 @@ const tokenHasJapaneseChars = (token) => {
|
|
|
46
46
|
return token.__strongJaHasJapaneseChar
|
|
47
47
|
}
|
|
48
48
|
let hasJapanese = false
|
|
49
|
-
for (let i = 0; i < content.length;
|
|
50
|
-
|
|
49
|
+
for (let i = 0; i < content.length;) {
|
|
50
|
+
const code = codePointAtSafe(content, i)
|
|
51
|
+
if (isJapaneseChar(code)) {
|
|
51
52
|
hasJapanese = true
|
|
52
53
|
break
|
|
53
54
|
}
|
|
55
|
+
i += codePointSize(code)
|
|
54
56
|
}
|
|
55
57
|
token.__strongJaJapaneseSource = content
|
|
56
58
|
token.__strongJaHasJapaneseChar = hasJapanese
|
|
@@ -103,9 +105,9 @@ const countDelimiterLikeStrongRuns = (content, from = 0, limit = 0) => {
|
|
|
103
105
|
continue
|
|
104
106
|
}
|
|
105
107
|
const pos = at
|
|
106
|
-
const prevCode = pos
|
|
108
|
+
const prevCode = codePointBeforeSafe(content, pos, 0)
|
|
107
109
|
const nextPos = pos + 2
|
|
108
|
-
const nextCode =
|
|
110
|
+
const nextCode = codePointAtSafe(content, nextPos, 0)
|
|
109
111
|
const prevSameMarker = prevCode === CHAR_ASTERISK
|
|
110
112
|
const nextSameMarker = nextCode === CHAR_ASTERISK
|
|
111
113
|
if (prevSameMarker || nextSameMarker) {
|
package/src/token-utils.js
CHANGED
|
@@ -18,10 +18,69 @@ const VALID_CANONICAL_MODES = new Set([
|
|
|
18
18
|
])
|
|
19
19
|
const REG_JAPANESE = /[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}\u3000-\u303F\uFF00-\uFFEF]/u
|
|
20
20
|
const REG_ATTRS = /{[^{}\n!@#%^&*()]+?}$/
|
|
21
|
+
const CHAR_REPLACEMENT = 0xFFFD
|
|
22
|
+
|
|
23
|
+
const isHighSurrogate = (code) => code >= 0xD800 && code <= 0xDBFF
|
|
24
|
+
const isLowSurrogate = (code) => code >= 0xDC00 && code <= 0xDFFF
|
|
25
|
+
|
|
26
|
+
const combineSurrogates = (high, low) => {
|
|
27
|
+
return 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
const codePointAtSafe = (src, index, fallback = 0) => {
|
|
31
|
+
if (typeof src !== 'string' || index < 0 || index >= src.length) return fallback
|
|
32
|
+
const first = src.charCodeAt(index)
|
|
33
|
+
if (first < 0xD800 || first > 0xDFFF) return first
|
|
34
|
+
if (first <= 0xDBFF) {
|
|
35
|
+
const second = index + 1 < src.length ? src.charCodeAt(index + 1) : 0
|
|
36
|
+
return isLowSurrogate(second) ? combineSurrogates(first, second) : CHAR_REPLACEMENT
|
|
37
|
+
}
|
|
38
|
+
return CHAR_REPLACEMENT
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
const codePointBeforeSafe = (src, index, fallback = 0) => {
|
|
42
|
+
if (typeof src !== 'string' || index <= 0 || index > src.length) return fallback
|
|
43
|
+
const last = src.charCodeAt(index - 1)
|
|
44
|
+
if (last < 0xD800 || last > 0xDFFF) return last
|
|
45
|
+
if (last >= 0xDC00) {
|
|
46
|
+
const first = index - 2 >= 0 ? src.charCodeAt(index - 2) : 0
|
|
47
|
+
return isHighSurrogate(first) ? combineSurrogates(first, last) : CHAR_REPLACEMENT
|
|
48
|
+
}
|
|
49
|
+
return CHAR_REPLACEMENT
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
const codePointStartBefore = (src, index) => {
|
|
53
|
+
if (typeof src !== 'string' || index <= 0 || index > src.length) return -1
|
|
54
|
+
const lastIdx = index - 1
|
|
55
|
+
const last = src.charCodeAt(lastIdx)
|
|
56
|
+
if (isLowSurrogate(last) && lastIdx - 1 >= 0 && isHighSurrogate(src.charCodeAt(lastIdx - 1))) {
|
|
57
|
+
return lastIdx - 1
|
|
58
|
+
}
|
|
59
|
+
return lastIdx
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
const codePointSize = (code) => code > 0xFFFF ? 2 : 1
|
|
63
|
+
|
|
64
|
+
const isAstralJapaneseCode = (code) => {
|
|
65
|
+
return (code >= 0x1AFF0 && code <= 0x1AFFF) || // Kana Extended-B
|
|
66
|
+
(code >= 0x1B000 && code <= 0x1B0FF) || // Kana Supplement
|
|
67
|
+
(code >= 0x1B100 && code <= 0x1B12F) || // Kana Extended-A
|
|
68
|
+
(code >= 0x1B130 && code <= 0x1B16F) || // Small Kana Extension
|
|
69
|
+
(code >= 0x20000 && code <= 0x2A6DF) || // CJK Unified Ideographs Extension B
|
|
70
|
+
(code >= 0x2A700 && code <= 0x2B73F) || // Extension C
|
|
71
|
+
(code >= 0x2B740 && code <= 0x2B81F) || // Extension D
|
|
72
|
+
(code >= 0x2B820 && code <= 0x2CEAF) || // Extension E
|
|
73
|
+
(code >= 0x2CEB0 && code <= 0x2EBEF) || // Extension F
|
|
74
|
+
(code >= 0x2EBF0 && code <= 0x2EE5F) || // Extension I
|
|
75
|
+
(code >= 0x2F800 && code <= 0x2FA1F) || // CJK Compatibility Ideographs Supplement
|
|
76
|
+
(code >= 0x30000 && code <= 0x3134F) || // Extension G
|
|
77
|
+
(code >= 0x31350 && code <= 0x323AF) // Extension H
|
|
78
|
+
}
|
|
21
79
|
|
|
22
80
|
const isJapaneseChar = (ch) => {
|
|
23
81
|
if (!ch) return false
|
|
24
|
-
const code = typeof ch === 'string' ? ch.
|
|
82
|
+
const code = typeof ch === 'string' ? ch.codePointAt(0) : ch
|
|
83
|
+
if (!Number.isFinite(code)) return false
|
|
25
84
|
if (code < 128) return false
|
|
26
85
|
if (code >= 0x3040 && code <= 0x309F) return true
|
|
27
86
|
if (code >= 0x30A0 && code <= 0x30FF) return true
|
|
@@ -32,7 +91,10 @@ const isJapaneseChar = (ch) => {
|
|
|
32
91
|
if (code >= 0xF900 && code <= 0xFAFF) return true
|
|
33
92
|
if (code >= 0x3000 && code <= 0x303F) return true
|
|
34
93
|
if (code >= 0xFF00 && code <= 0xFFEF) return true
|
|
35
|
-
|
|
94
|
+
if (code > 0x10FFFF) return false
|
|
95
|
+
if (code >= 0x10000 && isAstralJapaneseCode(code)) return true
|
|
96
|
+
if (code >= 0x10000 && code < 0x20000) return false
|
|
97
|
+
return REG_JAPANESE.test(String.fromCodePoint(code))
|
|
36
98
|
}
|
|
37
99
|
|
|
38
100
|
const isAsciiWordCode = (code) => {
|
|
@@ -257,6 +319,10 @@ export {
|
|
|
257
319
|
CHAR_NEWLINE,
|
|
258
320
|
CHAR_IDEOGRAPHIC_SPACE,
|
|
259
321
|
REG_ATTRS,
|
|
322
|
+
codePointAtSafe,
|
|
323
|
+
codePointBeforeSafe,
|
|
324
|
+
codePointStartBefore,
|
|
325
|
+
codePointSize,
|
|
260
326
|
isJapaneseChar,
|
|
261
327
|
isAsciiWordCode,
|
|
262
328
|
isSoftSpaceCode,
|