npm - @peaceroad/markdown-it-strong-ja - Versions diffs - 0.9.1 → 0.9.2 - Mend

@peaceroad/markdown-it-strong-ja 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/README.md +77 -0
package/package.json +8 -8
package/src/token-compat.js +8 -6
package/src/token-core.js +54 -35
package/src/token-postprocess/guards.js +7 -5
package/src/token-utils.js +68 -2

package/README.md CHANGED Viewed

@@ -357,6 +357,83 @@ Supporting visuals:
 - `aggressive`:
   `<p>broken **tail <a href="https://x.test">aa<strong>aa</strong><em>Text</em><strong>and<em>More</em>bb</strong>bb</a> after</p>`
+## Compatibility Notes
+### `markdown-it-attrs` 5.x parity
+When `markdown-it-attrs` is installed, strong-ja follows the token stream produced by that plugin and does not reinterpret where `{...}` attributes should be attached. This is intentional: strong-ja should not make attribute syntax mean something different from `markdown-it-attrs` alone.
+One edge case to be aware of is a tight list item followed by an emphasized line:
+```markdown
+- e {.li-style}
+*{.ul-style}*
+```
+With `markdown-it-attrs` 5.x, the first attribute block is consumed as a block-level attribute on the hidden `paragraph_open` inside the tight list. Because that paragraph token is hidden by markdown-it's tight-list rendering, the class is not visible in the final HTML. The second `{.ul-style}` is inside emphasis text, not a suffix after a closed inline token, so it remains literal text:
+```html
+<ul>
+<li>e
+<em>{.ul-style}</em></li>
+</ul>
+```
+This output matches `markdown-it-attrs` alone. To attach attributes intentionally, use the syntax owned by `markdown-it-attrs`, for example:
+```markdown
+- e
+{.ul-style}
+```
+```html
+<ul class="ul-style">
+<li>e</li>
+</ul>
+```
+or attach inline attributes after the closing inline token:
+```markdown
+- e
+*x*{.ul-style}
+```
+```html
+<ul>
+<li>e
+<em class="ul-style">x</em></li>
+</ul>
+```
+strong-ja keeps this as dependency parity rather than adding a local workaround.
+### `markdown-it` 14.2 astral delimiter policy
+`markdown-it` 14.2 recognizes astral characters (surrogate pairs) as full Unicode code points when scanning emphasis delimiters. strong-ja keeps `compatible` mode aligned with that upstream behavior.
+In Japanese modes, strong-ja still only adds its own delimiter relaxation when Japanese/CJK context is present. Astral Han characters, such as CJK Extension B, are treated as CJK context:
+```markdown
+*𠀋?*abc*
+```
+```html
+<p><em>𠀋?</em>abc*</p>
+```
+Emoji or symbol-only English contexts remain aligned with `markdown-it` and are not promoted just because they are astral characters:
+```markdown
+*😀?*abc*
+```
+```html
+<p>*😀?<em>abc</em></p>
+```
+Symbols inside Japanese prose may still be emphasized by the existing Japanese-context rule, for example `**😀**です` can render as `<p><strong>😀</strong>です</p>`. Use `mode: 'compatible'` when exact `markdown-it` 14.2 delimiter behavior is required.
 ## Options
 ### `mode`

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@peaceroad/markdown-it-strong-ja",
   "description": "Extends asterisk emphasis handling for Japanese text while keeping markdown-it behavior as close as practical.",
-  "version": "0.9.1",
+  "version": "0.9.2",
   "main": "index.js",
   "type": "module",
   "files": [
@@ -34,16 +34,16 @@
   "author": "peaceroad <peaceroad@gmail.com>",
   "license": "MIT",
   "dependencies": {
-    "markdown-it": "^14.1.0"
+    "markdown-it": "^14.2.0"
   },
   "devDependencies": {
-    "@peaceroad/markdown-it-cjk-breaks-mod": "^0.1.10",
-    "@peaceroad/markdown-it-hr-sandwiched-semantic-container": "^0.11.0",
-    "@peaceroad/markdown-it-renderer-image": "^0.12.0",
-    "@peaceroad/markdown-it-renderer-inline-text": "^0.8.0",
-    "markdown-it-attrs": "^4.3.1",
+    "@peaceroad/markdown-it-cjk-breaks-mod": "^0.1.11",
+    "@peaceroad/markdown-it-hr-sandwiched-semantic-container": "^0.12.0",
+    "@peaceroad/markdown-it-renderer-image": "^0.16.0",
+    "@peaceroad/markdown-it-renderer-inline-text": "^0.8.1",
+    "markdown-it-attrs": "^5.0.0",
     "markdown-it-sub": "^2.0.0",
     "markdown-it-sup": "^2.0.0",
-    "p7d-markdown-it-p-captions": "^0.21.0"
+    "p7d-markdown-it-p-captions": "^0.23.0"
   }
 }

package/src/token-compat.js CHANGED Viewed

@@ -3,6 +3,8 @@ import {
   REG_ATTRS,
   isJapaneseChar,
   isAsciiWordCode,
+  codePointAtSafe,
+  codePointBeforeSafe,
   hasCjkBreaksRule,
   isCjkBreaksRuleName,
   getRuntimeOpt,
@@ -119,8 +121,8 @@ const registerTokenCompat = (md, baseOpt) => {
             if (!prevToken || !nextToken) continue
             if (prevToken.type !== 'text' || !prevToken.content) continue
             if (nextToken.type !== 'text' || !nextToken.content) continue
-            const prevCharCode = prevToken.content.charCodeAt(prevToken.content.length - 1)
-            const nextCharCode = nextToken.content.charCodeAt(0)
+            const prevCharCode = codePointBeforeSafe(prevToken.content, prevToken.content.length, 0)
+            const nextCharCode = codePointAtSafe(nextToken.content, 0, 0)
             const isAsciiWord = isAsciiWordCode(nextCharCode)
             const shouldReplace = isAsciiWord &&
               isJapaneseChar(prevCharCode) && !isJapaneseChar(nextCharCode)
@@ -138,8 +140,8 @@ const registerTokenCompat = (md, baseOpt) => {
           for (let idx = 0; idx < child.content.length; idx++) {
             const ch = child.content[idx]
             if (ch === '\n') {
-              const prevCharCode = idx > 0 ? child.content.charCodeAt(idx - 1) : 0
-              const nextCharCode = idx + 1 < child.content.length ? child.content.charCodeAt(idx + 1) : 0
+              const prevCharCode = codePointBeforeSafe(child.content, idx, 0)
+              const nextCharCode = codePointAtSafe(child.content, idx + 1, 0)
               const isAsciiWord = isAsciiWordCode(nextCharCode)
               const shouldReplace = isAsciiWord &&
                 isJapaneseChar(prevCharCode) && !isJapaneseChar(nextCharCode)
@@ -187,7 +189,7 @@ const registerTokenCompat = (md, baseOpt) => {
             if (!prevTextCharCode || !isJapaneseChar(prevTextCharCode)) continue
             const next = children[j + 1]
             if (!next || next.type !== 'text' || !next.content) continue
-            const nextCharCode = next.content.charCodeAt(0)
+            const nextCharCode = codePointAtSafe(next.content, 0, 0)
             if (nextCharCode !== 0x7B) continue
             child.type = 'softbreak'
             child.tag = ''
@@ -196,7 +198,7 @@ const registerTokenCompat = (md, baseOpt) => {
             child.info = ''
             continue
           }
-          prevTextCharCode = child.content.charCodeAt(child.content.length - 1)
+          prevTextCharCode = codePointBeforeSafe(child.content, child.content.length, 0)
         }
       }
     }

package/src/token-core.js CHANGED Viewed

@@ -3,6 +3,10 @@ import Token from 'markdown-it/lib/token.mjs'
 import {
   CHAR_ASTERISK,
   CHAR_NEWLINE,
+  codePointAtSafe,
+  codePointBeforeSafe,
+  codePointStartBefore,
+  codePointSize,
   isJapaneseChar,
   isAsciiWordCode,
   isSoftSpaceCode,
@@ -280,24 +284,34 @@ const buildScanDelimsLookupCache = (src) => {
   let prev = -1
   for (let i = 0; i < len; i++) {
-    const code = src.charCodeAt(i)
+    const code = codePointAtSafe(src, i)
     if (code === CHAR_NEWLINE) {
       prev = -1
       continue
     }
+    const size = codePointSize(code)
     if (!isSoftSpaceCode(code)) prev = i
     prevNonSpaceSameLine[i] = prev
+    if (size === 2 && i + 1 < len) {
+      prevNonSpaceSameLine[i + 1] = prev
+      i++
+    }
   }
   let next = -1
   for (let i = len - 1; i >= 0; i--) {
-    const code = src.charCodeAt(i)
+    const cpStart = codePointStartBefore(src, i + 1)
+    const code = cpStart === -1 ? 0 : codePointAtSafe(src, cpStart)
     if (code === CHAR_NEWLINE) {
       next = -1
       continue
     }
-    if (!isSoftSpaceCode(code)) next = i
+    if (!isSoftSpaceCode(code)) next = cpStart
     nextNonSpaceSameLine[i] = next
+    if (cpStart !== i) {
+      nextNonSpaceSameLine[cpStart] = next
+      i = cpStart
+    }
   }
   return {
@@ -323,11 +337,13 @@ const findPrevNonSpaceIndex = (src, start, lookupCache = null) => {
       start < lookupCache.prevNonSpaceSameLine.length) {
     return lookupCache.prevNonSpaceSameLine[start]
   }
-  for (let i = start; i >= 0; i--) {
-    const code = src.charCodeAt(i)
+  for (let i = start; i >= 0;) {
+    const cpStart = codePointStartBefore(src, i + 1)
+    if (cpStart === -1) return -1
+    const code = codePointAtSafe(src, cpStart)
     if (code === CHAR_NEWLINE) return -1
-    if (isSoftSpaceCode(code)) continue
-    return i
+    if (!isSoftSpaceCode(code)) return cpStart
+    i = cpStart - 1
   }
   return -1
 }
@@ -340,11 +356,11 @@ const findNextNonSpaceIndex = (src, start, max, lookupCache = null) => {
     const next = lookupCache.nextNonSpaceSameLine[start]
     return next !== -1 && next < max ? next : -1
   }
-  for (let i = start; i < max; i++) {
-    const code = src.charCodeAt(i)
+  for (let i = start; i < max;) {
+    const code = codePointAtSafe(src, i)
     if (code === CHAR_NEWLINE) return -1
-    if (isSoftSpaceCode(code)) continue
-    return i
+    if (!isSoftSpaceCode(code)) return i
+    i += codePointSize(code)
   }
   return -1
 }
@@ -353,26 +369,26 @@ const hasAsciiStartAfterOptionalOpenWrappers = (src, index, max, lookupCache = n
   let i = index
   // Two wrappers are enough for common shapes: * [ "word" ]*
   for (let wrappers = 0; wrappers < 2 && i >= 0 && i < max; wrappers++) {
-    const code = src.charCodeAt(i)
+    const code = codePointAtSafe(src, i)
     if (!isAsciiGuardOpenWrapper(code)) break
     i = findNextNonSpaceIndex(src, i + 1, max, lookupCache)
     if (i === -1) return false
   }
   if (i < 0 || i >= max) return false
-  return isAsciiWordCode(src.charCodeAt(i))
+  return isAsciiWordCode(codePointAtSafe(src, i))
 }
 const hasAsciiEndBeforeOptionalCloseWrappers = (src, index, lookupCache = null) => {
   let i = index
   // Two wrappers are enough for common shapes: *["word"] *
   for (let wrappers = 0; wrappers < 2 && i >= 0; wrappers++) {
-    const code = src.charCodeAt(i)
+    const code = codePointAtSafe(src, i)
     if (!isAsciiGuardCloseWrapper(code)) break
     i = findPrevNonSpaceIndex(src, i - 1, lookupCache)
     if (i === -1) return false
   }
   if (i < 0) return false
-  return isAsciiWordCode(src.charCodeAt(i))
+  return isAsciiWordCode(codePointAtSafe(src, i))
 }
 const isMarkdownStructuralOpenWrapper = (code) => {
@@ -409,18 +425,20 @@ const findPrevNonSpaceLimited = (src, start, maxLook, lookupCache = null) => {
       start < lookupCache.prevNonSpaceSameLine.length) {
     const prev = lookupCache.prevNonSpaceSameLine[start]
     if (prev !== -1 && (start - prev) < maxLook) {
-      return src.charCodeAt(prev)
+      return codePointAtSafe(src, prev)
     }
     return 0
   }
   let looked = 0
-  for (let i = start; i >= 0; i--) {
+  for (let i = start; i >= 0;) {
     if (looked >= maxLook) break
-    const code = src.charCodeAt(i)
-    looked++
+    const cpStart = codePointStartBefore(src, i + 1)
+    if (cpStart === -1) break
+    const code = codePointAtSafe(src, cpStart)
+    looked += i - cpStart + 1
     if (code === CHAR_NEWLINE) return 0
-    if (isSoftSpaceCode(code)) continue
-    return code
+    if (!isSoftSpaceCode(code)) return code
+    i = cpStart - 1
   }
   return 0
 }
@@ -432,18 +450,19 @@ const findNextNonSpaceLimited = (src, start, max, maxLook, lookupCache = null) =
       start < lookupCache.nextNonSpaceSameLine.length) {
     const next = lookupCache.nextNonSpaceSameLine[start]
     if (next !== -1 && next < max && (next - start) < maxLook) {
-      return src.charCodeAt(next)
+      return codePointAtSafe(src, next)
     }
     return 0
   }
   let looked = 0
-  for (let i = start; i < max; i++) {
+  for (let i = start; i < max;) {
     if (looked >= maxLook) break
-    const code = src.charCodeAt(i)
-    looked++
+    const code = codePointAtSafe(src, i)
+    const size = codePointSize(code)
+    looked += size
     if (code === CHAR_NEWLINE) return 0
-    if (isSoftSpaceCode(code)) continue
-    return code
+    if (!isSoftSpaceCode(code)) return code
+    i += size
   }
   return 0
 }
@@ -462,8 +481,8 @@ const hasJapaneseContextForBracketWrapper = (src, start, pos, max, lastChar, nex
 const scanPrevSingleStarContextFlags = (src, start) => {
   let hasJapaneseBetween = false
-  for (let i = start - 1; i >= 0; i--) {
-    const code = src.charCodeAt(i)
+  for (let i = codePointStartBefore(src, start); i >= 0; i = codePointStartBefore(src, i)) {
+    const code = codePointAtSafe(src, i)
     if (code === CHAR_NEWLINE) break
     if (isSentenceBoundaryStop(code) && i < start - 1) break
     if (code !== CHAR_ASTERISK) {
@@ -475,8 +494,8 @@ const scanPrevSingleStarContextFlags = (src, start) => {
       backslashCount++
     }
     if ((backslashCount % 2) === 1) continue
-    const prevCode = i > 0 ? src.charCodeAt(i - 1) : 0
-    const nextCode = i + 1 < src.length ? src.charCodeAt(i + 1) : 0
+    const prevCode = codePointBeforeSafe(src, i, 0)
+    const nextCode = codePointAtSafe(src, i + 1, 0)
     if (prevCode === CHAR_ASTERISK || nextCode === CHAR_ASTERISK) continue
     return hasJapaneseBetween ? PREV_STAR_HAS_OPENER | PREV_STAR_HAS_JP_BETWEEN : PREV_STAR_HAS_OPENER
   }
@@ -778,12 +797,12 @@ const patchScanDelims = (md) => {
     const aggressiveMode = (modeFlags & MODE_FLAG_AGGRESSIVE) !== 0
     const max = this.posMax
     let lookupCache = null
-    const lastChar = start > 0 ? src.charCodeAt(start - 1) : 0x20
+    const lastChar = codePointBeforeSafe(src, start, 0x20)
     const count = base && base.length ? base.length : 1
     const pos = start + count
-    const nextChar = pos < max ? src.charCodeAt(pos) : 0x20
+    const nextChar = codePointAtSafe(src, pos, 0x20)
     let prevStarFlags = -1
     const leftJapanese = isJapaneseChar(lastChar)
@@ -819,7 +838,7 @@ const patchScanDelims = (md) => {
         lookupCache || (lookupCache = getScanDelimsLookupCache(this))
       )
       if (prevNonSpaceIdx !== -1) {
-        const prevNonSpaceLocal = src.charCodeAt(prevNonSpaceIdx)
+        const prevNonSpaceLocal = codePointAtSafe(src, prevNonSpaceIdx)
         const plusStrictAsciiBoundary = plusMode &&
           hasAsciiEndBeforeOptionalCloseWrappers(src, prevNonSpaceIdx, lookupCache)
         if (prevNonSpaceLocal !== CHAR_ASTERISK && !plusStrictAsciiBoundary) {
@@ -835,7 +854,7 @@ const patchScanDelims = (md) => {
         lookupCache || (lookupCache = getScanDelimsLookupCache(this))
       )
       if (nextNonSpaceIdx !== -1) {
-        const nextNonSpace = src.charCodeAt(nextNonSpaceIdx)
+        const nextNonSpace = codePointAtSafe(src, nextNonSpaceIdx)
         const plusStrictAsciiBoundary = plusMode &&
           hasAsciiStartAfterOptionalOpenWrappers(src, nextNonSpaceIdx, max, lookupCache)
         if (nextNonSpace !== CHAR_ASTERISK && !plusStrictAsciiBoundary) {

package/src/token-postprocess/guards.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { isJapaneseChar } from '../token-utils.js'
+import { codePointAtSafe, codePointBeforeSafe, codePointSize, isJapaneseChar } from '../token-utils.js'
 const CHAR_ASTERISK = 0x2A // *
 const INLINE_REPAIR_EM_OUTER_STRONG_SEQUENCE = 1 << 0
@@ -46,11 +46,13 @@ const tokenHasJapaneseChars = (token) => {
     return token.__strongJaHasJapaneseChar
   }
   let hasJapanese = false
-  for (let i = 0; i < content.length; i++) {
-    if (isJapaneseChar(content.charCodeAt(i))) {
+  for (let i = 0; i < content.length;) {
+    const code = codePointAtSafe(content, i)
+    if (isJapaneseChar(code)) {
       hasJapanese = true
       break
     }
+    i += codePointSize(code)
   }
   token.__strongJaJapaneseSource = content
   token.__strongJaHasJapaneseChar = hasJapanese
@@ -103,9 +105,9 @@ const countDelimiterLikeStrongRuns = (content, from = 0, limit = 0) => {
       continue
     }
     const pos = at
-    const prevCode = pos > 0 ? content.charCodeAt(pos - 1) : 0
+    const prevCode = codePointBeforeSafe(content, pos, 0)
     const nextPos = pos + 2
-    const nextCode = nextPos < len ? content.charCodeAt(nextPos) : 0
+    const nextCode = codePointAtSafe(content, nextPos, 0)
     const prevSameMarker = prevCode === CHAR_ASTERISK
     const nextSameMarker = nextCode === CHAR_ASTERISK
     if (prevSameMarker || nextSameMarker) {

package/src/token-utils.js CHANGED Viewed

@@ -18,10 +18,69 @@ const VALID_CANONICAL_MODES = new Set([
 ])
 const REG_JAPANESE = /[\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}\u3000-\u303F\uFF00-\uFFEF]/u
 const REG_ATTRS = /{[^{}\n!@#%^&*()]+?}$/
+const CHAR_REPLACEMENT = 0xFFFD
+const isHighSurrogate = (code) => code >= 0xD800 && code <= 0xDBFF
+const isLowSurrogate = (code) => code >= 0xDC00 && code <= 0xDFFF
+const combineSurrogates = (high, low) => {
+  return 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
+}
+const codePointAtSafe = (src, index, fallback = 0) => {
+  if (typeof src !== 'string' || index < 0 || index >= src.length) return fallback
+  const first = src.charCodeAt(index)
+  if (first < 0xD800 || first > 0xDFFF) return first
+  if (first <= 0xDBFF) {
+    const second = index + 1 < src.length ? src.charCodeAt(index + 1) : 0
+    return isLowSurrogate(second) ? combineSurrogates(first, second) : CHAR_REPLACEMENT
+  }
+  return CHAR_REPLACEMENT
+}
+const codePointBeforeSafe = (src, index, fallback = 0) => {
+  if (typeof src !== 'string' || index <= 0 || index > src.length) return fallback
+  const last = src.charCodeAt(index - 1)
+  if (last < 0xD800 || last > 0xDFFF) return last
+  if (last >= 0xDC00) {
+    const first = index - 2 >= 0 ? src.charCodeAt(index - 2) : 0
+    return isHighSurrogate(first) ? combineSurrogates(first, last) : CHAR_REPLACEMENT
+  }
+  return CHAR_REPLACEMENT
+}
+const codePointStartBefore = (src, index) => {
+  if (typeof src !== 'string' || index <= 0 || index > src.length) return -1
+  const lastIdx = index - 1
+  const last = src.charCodeAt(lastIdx)
+  if (isLowSurrogate(last) && lastIdx - 1 >= 0 && isHighSurrogate(src.charCodeAt(lastIdx - 1))) {
+    return lastIdx - 1
+  }
+  return lastIdx
+}
+const codePointSize = (code) => code > 0xFFFF ? 2 : 1
+const isAstralJapaneseCode = (code) => {
+  return (code >= 0x1AFF0 && code <= 0x1AFFF) || // Kana Extended-B
+    (code >= 0x1B000 && code <= 0x1B0FF) || // Kana Supplement
+    (code >= 0x1B100 && code <= 0x1B12F) || // Kana Extended-A
+    (code >= 0x1B130 && code <= 0x1B16F) || // Small Kana Extension
+    (code >= 0x20000 && code <= 0x2A6DF) || // CJK Unified Ideographs Extension B
+    (code >= 0x2A700 && code <= 0x2B73F) || // Extension C
+    (code >= 0x2B740 && code <= 0x2B81F) || // Extension D
+    (code >= 0x2B820 && code <= 0x2CEAF) || // Extension E
+    (code >= 0x2CEB0 && code <= 0x2EBEF) || // Extension F
+    (code >= 0x2EBF0 && code <= 0x2EE5F) || // Extension I
+    (code >= 0x2F800 && code <= 0x2FA1F) || // CJK Compatibility Ideographs Supplement
+    (code >= 0x30000 && code <= 0x3134F) || // Extension G
+    (code >= 0x31350 && code <= 0x323AF) // Extension H
+}
 const isJapaneseChar = (ch) => {
   if (!ch) return false
-  const code = typeof ch === 'string' ? ch.charCodeAt(0) : ch
+  const code = typeof ch === 'string' ? ch.codePointAt(0) : ch
+  if (!Number.isFinite(code)) return false
   if (code < 128) return false
   if (code >= 0x3040 && code <= 0x309F) return true
   if (code >= 0x30A0 && code <= 0x30FF) return true
@@ -32,7 +91,10 @@ const isJapaneseChar = (ch) => {
   if (code >= 0xF900 && code <= 0xFAFF) return true
   if (code >= 0x3000 && code <= 0x303F) return true
   if (code >= 0xFF00 && code <= 0xFFEF) return true
-  return REG_JAPANESE.test(String.fromCharCode(code))
+  if (code > 0x10FFFF) return false
+  if (code >= 0x10000 && isAstralJapaneseCode(code)) return true
+  if (code >= 0x10000 && code < 0x20000) return false
+  return REG_JAPANESE.test(String.fromCodePoint(code))
 }
 const isAsciiWordCode = (code) => {
@@ -257,6 +319,10 @@ export {
   CHAR_NEWLINE,
   CHAR_IDEOGRAPHIC_SPACE,
   REG_ATTRS,
+  codePointAtSafe,
+  codePointBeforeSafe,
+  codePointStartBefore,
+  codePointSize,
   isJapaneseChar,
   isAsciiWordCode,
   isSoftSpaceCode,