PyPI - wcwidth - Versions diffs - 0.2.12__tar.gz → 0.2.13__tar.gz - Mend

wcwidth 0.2.12__tar.gz → 0.2.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of wcwidth might be problematic. Click here for more details.

Files changed (52) hide show

{wcwidth-0.2.12/wcwidth.egg-info → wcwidth-0.2.13}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wcwidth
-Version: 0.2.12
+Version: 0.2.13
 Summary: Measures the displayed width of unicode strings in a terminal
 Home-page: https://github.com/jquast/wcwidth
 Author: Jeff Quast
@@ -63,7 +63,7 @@ Example
    >>>  text = u'コンニチハ'
 Python **incorrectly** uses the *string length* of 5 codepoints rather than the
-*printible length* of 10 cells, so that when using the `rjust` function, the
+*printable length* of 10 cells, so that when using the `rjust` function, the
 output length is wrong::
     >>> print(len('コンニチハ'))
@@ -247,8 +247,12 @@ Other Languages
 =======
 History
 =======
+0.2.13 *2024-01-06*
+  * **Bugfix** zero-width support for Hangul Jamo (Korean)
 0.2.12 *2023-11-21*
-  * re-release to remove .pyi file misplaced in wheel files `Issue #101`.
+  * re-release to remove .pyi file misplaced in wheel files `Issue #101`_.
 0.2.11 *2023-11-20*
   * Include tests files in the source distribution (`PR #98`_, `PR #100`_).

{wcwidth-0.2.12 → wcwidth-0.2.13}/README.rst RENAMED Viewed

@@ -32,7 +32,7 @@ Example
    >>>  text = u'コンニチハ'
 Python **incorrectly** uses the *string length* of 5 codepoints rather than the
-*printible length* of 10 cells, so that when using the `rjust` function, the
+*printable length* of 10 cells, so that when using the `rjust` function, the
 output length is wrong::
     >>> print(len('コンニチハ'))
@@ -216,8 +216,12 @@ Other Languages
 =======
 History
 =======
+0.2.13 *2024-01-06*
+  * **Bugfix** zero-width support for Hangul Jamo (Korean)
 0.2.12 *2023-11-21*
-  * re-release to remove .pyi file misplaced in wheel files `Issue #101`.
+  * re-release to remove .pyi file misplaced in wheel files `Issue #101`_.
 0.2.11 *2023-11-20*
   * Include tests files in the source distribution (`PR #98`_, `PR #100`_).

{wcwidth-0.2.12 → wcwidth-0.2.13}/bin/update-tables.py RENAMED Viewed

@@ -54,6 +54,19 @@ FETCH_BLOCKSIZE = int(os.environ.get('FETCH_BLOCKSIZE', '4096'))
 MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '6'))
 BACKOFF_FACTOR = float(os.environ.get('BACKOFF_FACTOR', '0.1'))
+# Hangul Jamo is a decomposed form of Hangul Syllables, see
+#     https://www.unicode.org/faq/korean.html#3
+#     https://github.com/ridiculousfish/widecharwidth/pull/17
+#     https://github.com/jquast/ucs-detect/issues/9
+#     https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
+# "Conjoining Jamo are divided into three classes: L, V, T (Leading
+#  consonant, Vowel, Trailing consonant). A Hangul Syllable consists of
+#  <LV> or <LVT> sequences."
+HANGUL_JAMO_ZEROWIDTH = (
+    *range(0x1160, 0x1200),  # Hangul Jungseong Filler .. Hangul Jongseong Ssangnieun
+    *range(0xD7B0, 0xD800),  # Hangul Jungseong O-Yeo  .. Undefined Character of Hangul Jamo Extended-B
+)
 def _bisearch(ucs, table):
     """A copy of wcwidth._bisearch, to prevent having issues when depending on code that imports
@@ -112,11 +125,11 @@ class TableEntry:
     properties: tuple[str, ...]
     comment: str
-    def filter_by_category(self, category_codes: str, wide: int) -> bool:
+    def filter_by_category_width(self, wide: int) -> bool:
         """
-        Return whether entry matches given category code and displayed width.
+        Return whether entry matches displayed width.
-        Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
+        Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
         """
         if self.code_range is None:
             return False
@@ -146,13 +159,12 @@ class TableEntry:
         return wide == 1
     @staticmethod
-    def parse_category_values(category_codes: str,
-                              table_iter: Iterator[TableEntry],
-                              wide: int) -> set[tuple[int, int]]:
+    def parse_width_category_values(table_iter: Iterator[TableEntry],
+                                    wide: int) -> set[tuple[int, int]]:
         """Parse value ranges of unicode data files, by given category and width."""
         return {n
                 for entry in table_iter
-                if entry.filter_by_category(category_codes, wide)
+                if entry.filter_by_category_width(wide)
                 for n in list(range(entry.code_range[0], entry.code_range[1]))}
@@ -326,18 +338,19 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
     for version in fetch_unicode_versions():
         # parse typical 'wide' characters by categories 'W' and 'F',
         table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
-                                        category_codes=('W', 'F'),
                                         wide=2)
         # subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
         # but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
-        table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                                     category_codes=('Mn', 'Mc'),
-                                                     wide=0).values)
+        table[version].values = table[version].values.difference(parse_category(
+            fname=UnicodeDataFile.DerivedGeneralCategory(version),
+            wide=0).values)
+        # Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
+        table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)
         # finally, join with atypical 'wide' characters defined by category 'Sk',
         table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                                    category_codes=('Sk',),
                                                     wide=2).values)
     return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
@@ -352,11 +365,13 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
     for version in fetch_unicode_versions():
         # Determine values of zero-width character lookup table by the following category codes
         table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                        category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
                                         wide=0)
-        # And, include NULL
+        # Include NULL
         table[version].values.add(0)
+        # Add Hangul Jamo Vowels and Hangul Trailing Consonants
+        table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
     return UnicodeTableRenderCtx('ZERO_WIDTH', table)
@@ -501,9 +516,9 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
 @functools.cache
-def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
+def parse_category(fname: str, wide: int) -> TableDef:
     """Parse value ranges of unicode data files, by given categories into string tables."""
-    print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
+    print(f'parsing {fname}, wide={wide}: ', end='', flush=True)
     with open(fname, encoding='utf-8') as f:
         table_iter = parse_unicode_table(f)
@@ -512,7 +527,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
         version = next(table_iter).comment.strip()
         # and "date string" from second line
         date = next(table_iter).comment.split(':', 1)[1].strip()
-        values = TableEntry.parse_category_values(category_codes, table_iter, wide)
+        values = TableEntry.parse_width_category_values(table_iter, wide)
     print('ok')
     return TableDef(version, date, values)

{wcwidth-0.2.12 → wcwidth-0.2.13}/bin/verify-table-integrity.py RENAMED Viewed

@@ -63,9 +63,30 @@ Category code was changed from 'Mc' to 'Lo':
 import logging
+def bisearch_pair(ucs, table):
+    """
+    A copy of wcwidth._bisearch() but also returns the range of matched values.
+    """
+    lbound = 0
+    ubound = len(table) - 1
+    if ucs < table[0][0] or ucs > table[ubound][1]:
+        return (0, None, None)
+    while ubound >= lbound:
+        mid = (lbound + ubound) // 2
+        if ucs > table[mid][1]:
+            lbound = mid + 1
+        elif ucs < table[mid][0]:
+            ubound = mid - 1
+        else:
+            return (1, table[mid][0], table[mid][1])
+    return (0, None, None)
 def main(log: logging.Logger):
-    # local
-    from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
+    from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions
     reversed_uni_versions = list(reversed(list_versions()))
     tables = {'ZERO_WIDTH': ZERO_WIDTH,
               'WIDE_EASTASIAN': WIDE_EASTASIAN}
@@ -81,14 +102,21 @@ def main(log: logging.Logger):
             other_table = tables[other_table_name][version]
             for start_range, stop_range in curr_table:
                 for unichar_n in range(start_range, stop_range):
-                    if not _bisearch(unichar_n, next_table):
-                        log.info(f'value {hex(unichar_n)} in table_name={table_name}'
-                                 f' version={version} is not defined in next_version={next_version}'
-                                 f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
-                    if _bisearch(unichar_n, other_table):
-                        log.error(f'value {hex(unichar_n)} in table_name={table_name}'
-                                  f' version={version} is duplicated in other_table_name={other_table_name}'
-                                  f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
+                    result, _, _ = bisearch_pair(unichar_n, next_table)
+                    if not result:
+                        log.info(
+                            f'value 0x{unichar_n:05x} in table_name={table_name}'
+                            f' version={version} is not defined in next_version={next_version}'
+                            f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
+                        )
+                    result, lbound, ubound = bisearch_pair(unichar_n, other_table)
+                    if result:
+                        log.error(
+                            f'value 0x{unichar_n:05x} in table_name={table_name}'
+                            f' version={version} is duplicated in other_table_name={other_table_name}'
+                            f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
+                            f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
+                        )
                         errors += 1
     if errors:
         log.error(f'{errors} errors, exit 1')

{wcwidth-0.2.12 → wcwidth-0.2.13}/bin/wcwidth-browser.py RENAMED Viewed

@@ -116,7 +116,7 @@ class WcCombinedCharacterGenerator(object):
         """
         self.characters = []
         letters_o = ('o' * width)
-        for (begin, end) in ZERO_WIDTH[unicode_version]:
+        for (begin, end) in ZERO_WIDTH[_wcmatch_version(unicode_version)]:
             for val in [_val for _val in
                         range(begin, end + 1)
                         if _val <= LIMIT_UCS]:

{wcwidth-0.2.12 → wcwidth-0.2.13}/docs/intro.rst RENAMED Viewed

@@ -32,7 +32,7 @@ Example
    >>>  text = u'コンニチハ'
 Python **incorrectly** uses the *string length* of 5 codepoints rather than the
-*printible length* of 10 cells, so that when using the `rjust` function, the
+*printable length* of 10 cells, so that when using the `rjust` function, the
 output length is wrong::
     >>> print(len('コンニチハ'))
@@ -216,8 +216,12 @@ Other Languages
 =======
 History
 =======
+0.2.13 *2024-01-06*
+  * **Bugfix** zero-width support for Hangul Jamo (Korean)
 0.2.12 *2023-11-21*
-  * re-release to remove .pyi file misplaced in wheel files `Issue #101`.
+  * re-release to remove .pyi file misplaced in wheel files `Issue #101`_.
 0.2.11 *2023-11-20*
   * Include tests files in the source distribution (`PR #98`_, `PR #100`_).

wcwidth-0.2.13/docs/specs.rst ADDED Viewed

@@ -0,0 +1,79 @@
+.. _Specification:
+=============
+Specification
+=============
+This document defines how the wcwidth library measures the printable width
+of characters of a string.
+Width of -1
+-----------
+The following have a column width of -1 for function :func:`wcwidth.wcwidth`
+- ``C0`` control characters (`U+0001`_ through `U+001F`_).
+- ``C1`` control characters and ``DEL`` (`U+007F`_ up to, but not including, `U+00A0`_).
+If any character in a sequence is a ``C0`` or ``C1`` control character, the final
+return value of :func:`wcwidth.wcswidth` is -1.
+Width of 0
+----------
+Any characters defined by category codes in `DerivedGeneralCategory.txt`_ files:
+- 'Me': Enclosing Combining Mark, approx. 13 characters.
+- 'Mn': Nonspacing Combining Mark, approx. 1,839 characters.
+- 'Mc': Spacing Mark, approx. 443 characters.
+- 'Cf': Format control character, approx. 161 characters.
+- 'Zl': `U+2028`_ LINE SEPARATOR only
+- 'Zp': `U+2029`_ PARAGRAPH SEPARATOR only
+- 'Sk': Modifier Symbol, approx. 4 characters of only those where phrase
+  ``'EMOJI MODIFIER'`` is present in comment of unicode data file.
+The NULL character (`U+0000`_).
+Any character following ZWJ (`U+200D`_) when in sequence by
+function :func:`wcwidth.wcswidth`.
+Hangul Jamo Jungseong and "Extended-B" code blocks, `U+1160`_ through
+`U+11FF`_ and `U+D7B0`_ through `U+D7FF`_.
+Width of 1
+----------
+Characters are measured as having a width of 1 when they are not
+measured as `Width of 0`_ or `Width of 2`_.
+Width of 2
+----------
+Any character defined by East Asian Fullwidth (``F``) or Wide (``W``)
+properties in `EastAsianWidth.txt`_ files, except those that are defined by the
+Category codes of Nonspacing Mark (``Mn``) and Spacing Mark (``Mc``).
+Any characters of Modifier Symbol category, ``'Sk'`` where ``'FULLWIDTH'`` is
+present in comment of unicode data file, approx. 3 characters.
+Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
+`emoji-variation-sequences.txt`_ as ``emoji style``.
+.. _`U+0000`: https://codepoints.net/U+0000
+.. _`U+0001`: https://codepoints.net/U+0001
+.. _`U+001F`: https://codepoints.net/U+001F
+.. _`U+007F`: https://codepoints.net/U+007F
+.. _`U+00A0`: https://codepoints.net/U+00A0
+.. _`U+1160`: https://codepoints.net/U+1160
+.. _`U+11FF`: https://codepoints.net/U+11FF
+.. _`U+200D`: https://codepoints.net/U+200D
+.. _`U+2028`: https://codepoints.net/U+2028
+.. _`U+2029`: https://codepoints.net/U+2029
+.. _`U+D7B0`: https://codepoints.net/U+D7B0
+.. _`U+D7FF`: https://codepoints.net/U+D7FF
+.. _`U+FE0F`: https://codepoints.net/U+FE0F
+.. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
+.. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+.. _`emoji-variation-sequences.txt`: https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-variation-sequences.txt

{wcwidth-0.2.12 → wcwidth-0.2.13}/setup.py RENAMED Viewed

@@ -44,7 +44,7 @@ def main():
     setuptools.setup(
         name='wcwidth',
         # NOTE: manually manage __version__ in wcwidth/__init__.py !
-        version='0.2.12',
+        version='0.2.13',
         description=(
             "Measures the displayed width of unicode strings in a terminal"),
         long_description=codecs.open(

{wcwidth-0.2.12 → wcwidth-0.2.13}/tests/test_core.py RENAMED Viewed

@@ -222,17 +222,48 @@ def test_balinese_script():
     assert length_phrase == expect_length_phrase
+def test_kr_jamo():
+    """
+    Test basic combining of HANGUL CHOSEONG and JUNGSEONG
+    Example from Raymond Chen's blog post,
+    https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
+    """
+    # This is an example where both characters are "wide" when displayed alone.
+    #
+    # But JUNGSEONG (vowel) is designed for combination with a CHOSEONG (consonant).
+    #
+    # This wcwidth library understands their width only when in combination,
+    # and not by independent display, like other zero-width characters that may
+    # only combine with an appropriate preceding character.
+    phrase = (
+        u"\u1100"  # ᄀ HANGUL CHOSEONG KIYEOK (consonant)
+        u"\u1161"  # ᅡ HANGUL JUNGSEONG A (vowel)
+    )
+    expect_length_each = (2, 0)
+    expect_length_phrase = 2
+    # exercise,
+    length_each = tuple(map(wcwidth.wcwidth, phrase))
+    length_phrase = wcwidth.wcswidth(phrase)
+    # verify.
+    assert length_each == expect_length_each
+    assert length_phrase == expect_length_phrase
 def test_kr_jamo_filler():
     u"""
     Jamo filler is 0 width.
-    According to https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf this character and others
-    like it, ``\uffa0``, ``\u1160``, ``\u115f``, ``\u1160``, are not commonly viewed with a terminal,
-    seems it doesn't matter whether it is implemented or not, they are not typically used !
+    Example from https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf
     """
-    phrase = u"\u1100\u1160"
-    expect_length_each = (2, 1)
-    expect_length_phrase = 3
+    phrase = (
+        u"\u1100"  # HANGUL CHOSEONG KIYEOK (consonant)
+        u"\u1160"  # HANGUL JUNGSEONG FILLER (vowel)
+    )
+    expect_length_each = (2, 0)
+    expect_length_phrase = 2
     # exercise,
     length_each = tuple(map(wcwidth.wcwidth, phrase))
@@ -355,3 +386,17 @@ def test_kannada_script_2():
     # verify.
     assert length_each == expect_length_each
     assert length_phrase == expect_length_phrase
+def test_zero_wide_conflict():
+    # Test characters considered both "wide" and "zero" width
+    # -  (0x03000, 0x0303e,),  # Ideographic Space       ..Ideographic Variation In
+    # +  (0x03000, 0x03029,),  # Ideographic Space       ..Hangzhou Numeral Nine
+    assert wcwidth.wcwidth(unichr(0x03029), unicode_version='4.1.0') == 2
+    assert wcwidth.wcwidth(unichr(0x0302a), unicode_version='4.1.0') == 0
+    # - (0x03099, 0x030ff,),  # Combining Katakana-hirag..Katakana Digraph Koto
+    # + (0x0309b, 0x030ff,),  # Katakana-hiragana Voiced..Katakana Digraph Koto
+    assert wcwidth.wcwidth(unichr(0x03099), unicode_version='4.1.0') == 0
+    assert wcwidth.wcwidth(unichr(0x0309a), unicode_version='4.1.0') == 0
+    assert wcwidth.wcwidth(unichr(0x0309b), unicode_version='4.1.0') == 2

wcwidth-0.2.13/tests/test_table_integrity.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""
+Executes verify-table-integrity.py as a unit test.
+"""
+import os
+import sys
+import subprocess
+import pytest
+@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
+def test_verify_table_integrity():
+    subprocess.check_output([sys.executable, os.path.join(os.path.dirname(__file__),
+                                                          os.path.pardir,
+                                                          'bin',
+                                                          'verify-table-integrity.py')])

{wcwidth-0.2.12 → wcwidth-0.2.13}/wcwidth/__init__.py RENAMED Viewed

@@ -26,4 +26,4 @@ __all__ = ('wcwidth', 'wcswidth', 'list_versions')
 # We also used pkg_resources to load unicode version tables from version.json,
 # generated by bin/update-tables.py, but some environments are unable to
 # import pkg_resources for one reason or another, yikes!
-__version__ = '0.2.12'
+__version__ = '0.2.13'

wcwidth 0.2.12__tar.gz → 0.2.13__tar.gz

Potentially problematic release.

wcwidth 0.2.12__tar.gz → 0.2.13__tar.gz