sentencex 0.5.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sentencex might be problematic. Click here for more details.

Files changed (84) hide show
  1. {sentencex-0.5.1 → sentencex-0.6.0}/PKG-INFO +1 -1
  2. {sentencex-0.5.1 → sentencex-0.6.0}/pyproject.toml +1 -1
  3. sentencex-0.6.0/sentencex/terminators.py +165 -0
  4. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_am.py +1 -1
  5. sentencex-0.5.1/sentencex/terminators.py +0 -143
  6. {sentencex-0.5.1 → sentencex-0.6.0}/.github/workflows/publish.yaml +0 -0
  7. {sentencex-0.5.1 → sentencex-0.6.0}/.github/workflows/tests.yaml +0 -0
  8. {sentencex-0.5.1 → sentencex-0.6.0}/.gitignore +0 -0
  9. {sentencex-0.5.1 → sentencex-0.6.0}/LICENSE.txt +0 -0
  10. {sentencex-0.5.1 → sentencex-0.6.0}/README.md +0 -0
  11. {sentencex-0.5.1 → sentencex-0.6.0}/benchmarks/__init__.py +0 -0
  12. {sentencex-0.5.1 → sentencex-0.6.0}/benchmarks/accuracy.py +0 -0
  13. {sentencex-0.5.1 → sentencex-0.6.0}/benchmarks/benchmark_speed.sh +0 -0
  14. {sentencex-0.5.1 → sentencex-0.6.0}/benchmarks/en_golden_rules.py +0 -0
  15. {sentencex-0.5.1 → sentencex-0.6.0}/benchmarks/requirements.txt +0 -0
  16. {sentencex-0.5.1 → sentencex-0.6.0}/benchmarks/speed.py +0 -0
  17. {sentencex-0.5.1 → sentencex-0.6.0}/docs/index.html +0 -0
  18. {sentencex-0.5.1 → sentencex-0.6.0}/requirements.txt +0 -0
  19. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/__init__.py +0 -0
  20. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/__main__.py +0 -0
  21. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/base.py +0 -0
  22. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/fallbacks.py +0 -0
  23. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/__init__.py +0 -0
  24. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/am.py +0 -0
  25. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/ar.py +0 -0
  26. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/bg.py +0 -0
  27. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/bn.py +0 -0
  28. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/ca.py +0 -0
  29. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/da.py +0 -0
  30. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/de.py +0 -0
  31. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/el.py +0 -0
  32. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/en.py +0 -0
  33. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/es.py +0 -0
  34. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/fi.py +0 -0
  35. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/fr.py +0 -0
  36. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/gu.py +0 -0
  37. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/hi.py +0 -0
  38. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/hy.py +0 -0
  39. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/it.py +0 -0
  40. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/kk.py +0 -0
  41. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/kn.py +0 -0
  42. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/ml.py +0 -0
  43. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/mr.py +0 -0
  44. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/my.py +0 -0
  45. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/nl.py +0 -0
  46. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/or_.py +0 -0
  47. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/pa.py +0 -0
  48. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/pl.py +0 -0
  49. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/pt.py +0 -0
  50. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/ru.py +0 -0
  51. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/sk.py +0 -0
  52. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/ta.py +0 -0
  53. {sentencex-0.5.1 → sentencex-0.6.0}/sentencex/languages/te.py +0 -0
  54. {sentencex-0.5.1 → sentencex-0.6.0}/test/__init__.py +0 -0
  55. {sentencex-0.5.1 → sentencex-0.6.0}/test/pytest.ini +0 -0
  56. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_ar.py +0 -0
  57. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_bg.py +0 -0
  58. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_da.py +0 -0
  59. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_de.py +0 -0
  60. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_el.py +0 -0
  61. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_en.py +0 -0
  62. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_es.py +0 -0
  63. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_fa.py +0 -0
  64. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_fallbacks.py +0 -0
  65. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_fi.py +0 -0
  66. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_fr.py +0 -0
  67. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_gu.py +0 -0
  68. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_hi.py +0 -0
  69. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_hy.py +0 -0
  70. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_it.py +0 -0
  71. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_ja.py +0 -0
  72. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_kk.py +0 -0
  73. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_ml.py +0 -0
  74. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_mr.py +0 -0
  75. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_my.py +0 -0
  76. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_nl.py +0 -0
  77. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_pa.py +0 -0
  78. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_pl.py +0 -0
  79. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_pt.py +0 -0
  80. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_ru.py +0 -0
  81. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_sk.py +0 -0
  82. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_ur.py +0 -0
  83. {sentencex-0.5.1 → sentencex-0.6.0}/test/unit/test_zh.py +0 -0
  84. {sentencex-0.5.1 → sentencex-0.6.0}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sentencex
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: Sentence segmenter that supports ~300 languages
5
5
  Project-URL: Homepage, https://github.com/santhoshtr/sentencex
6
6
  Project-URL: Changelog, https://github.com/santhoshtr/sentencex/releases
@@ -29,7 +29,7 @@ classifiers = [
29
29
  "License :: OSI Approved :: MIT License",
30
30
  "Operating System :: OS Independent",
31
31
  ]
32
- version = "0.5.1"
32
+ version = "0.6.0"
33
33
  requires-python = ">=3.8"
34
34
  dependencies = []
35
35
 
@@ -0,0 +1,165 @@
1
+ # unicode code points generated with Unicode::Tussle perl script:
2
+ # unichars -aBbs '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' | awk '$2="\""$2"\", #"'
3
+ # ruff: noqa: E501
4
+ GLOBAL_SENTENCE_TERMINATORS = (
5
+ [
6
+ "!", # U+00021 BC=ON BLK=Basic_Latin SC=Common EXCLAMATION MARK
7
+ ".", # U+0002E BC=CS BLK=Basic_Latin SC=Common FULL STOP
8
+ "?", # U+0003F BC=ON BLK=Basic_Latin SC=Common QUESTION MARK
9
+ "։", # U+00589 BC=L BLK=Armenian SC=Armenian ARMENIAN FULL STOP
10
+ "؝", # U+0061D BC=AL BLK=Arabic SC=Arabic ARABIC END OF TEXT MARK
11
+ "؞", # U+0061E BC=AL BLK=Arabic SC=Arabic ARABIC TRIPLE DOT PUNCTUATION MARK
12
+ "؟", # U+0061F BC=AL BLK=Arabic SC=Common ARABIC QUESTION MARK
13
+ "۔", # U+006D4 BC=AL BLK=Arabic SC=Arabic ARABIC FULL STOP
14
+ "܀", # U+00700 BC=AL BLK=Syriac SC=Syriac SYRIAC END OF PARAGRAPH
15
+ "܁", # U+00701 BC=AL BLK=Syriac SC=Syriac SYRIAC SUPRALINEAR FULL STOP
16
+ "܂", # U+00702 BC=AL BLK=Syriac SC=Syriac SYRIAC SUBLINEAR FULL STOP
17
+ "߹", # U+007F9 BC=ON BLK=NKo SC=Nko NKO EXCLAMATION MARK
18
+ "࠷", # U+00837 BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION MELODIC QITSA
19
+ "࠹", # U+00839 BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION QITSA
20
+ "࠽", # U+0083D BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION SOF MASHFAAT
21
+ "࠾", # U+0083E BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION ANNAAU
22
+ "।", # U+00964 BC=L BLK=Devanagari SC=Common DEVANAGARI DANDA
23
+ "॥", # U+00965 BC=L BLK=Devanagari SC=Common DEVANAGARI DOUBLE DANDA
24
+ "၊", # U+0104A BC=L BLK=Myanmar SC=Myanmar MYANMAR SIGN LITTLE SECTION
25
+ "။", # U+0104B BC=L BLK=Myanmar SC=Myanmar MYANMAR SIGN SECTION
26
+ "።", # U+01362 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC FULL STOP
27
+ "፧", # U+01367 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC QUESTION MARK
28
+ "፨", # U+01368 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC PARAGRAPH SEPARATOR
29
+ "᙮", # U+0166E BC=L BLK=Unified_Canadian_Aboriginal_Syllabics SC=Canadian_Aboriginal CANADIAN SYLLABICS FULL STOP
30
+ "᜵", # U+01735 BC=L BLK=Hanunoo SC=Common PHILIPPINE SINGLE PUNCTUATION
31
+ "᜶", # U+01736 BC=L BLK=Hanunoo SC=Common PHILIPPINE DOUBLE PUNCTUATION
32
+ "᠃", # U+01803 BC=ON BLK=Mongolian SC=Common MONGOLIAN FULL STOP
33
+ "᠉", # U+01809 BC=ON BLK=Mongolian SC=Mongolian MONGOLIAN MANCHU FULL STOP
34
+ "᥄", # U+01944 BC=ON BLK=Limbu SC=Limbu LIMBU EXCLAMATION MARK
35
+ "᥅", # U+01945 BC=ON BLK=Limbu SC=Limbu LIMBU QUESTION MARK
36
+ "᪨", # U+01AA8 BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN KAAN
37
+ "᪩", # U+01AA9 BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN KAANKUU
38
+ "᪪", # U+01AAA BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN SATKAAN
39
+ "᪫", # U+01AAB BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN SATKAANKUU
40
+ "᭚", # U+01B5A BC=L BLK=Balinese SC=Balinese BALINESE PANTI
41
+ "᭛", # U+01B5B BC=L BLK=Balinese SC=Balinese BALINESE PAMADA
42
+ "᭞", # U+01B5E BC=L BLK=Balinese SC=Balinese BALINESE CARIK SIKI
43
+ "᭟", # U+01B5F BC=L BLK=Balinese SC=Balinese BALINESE CARIK PAREREN
44
+ "᭽", # U+01B7D BC=L BLK=Balinese SC=Balinese BALINESE PANTI LANTANG
45
+ "᭾", # U+01B7E BC=L BLK=Balinese SC=Balinese BALINESE PAMADA LANTANG
46
+ "᰻", # U+01C3B BC=L BLK=Lepcha SC=Lepcha LEPCHA PUNCTUATION TA-ROL
47
+ "᰼", # U+01C3C BC=L BLK=Lepcha SC=Lepcha LEPCHA PUNCTUATION NYET THYOOM TA-ROL
48
+ "᱾", # U+01C7E BC=L BLK=Ol_Chiki SC=Ol_Chiki OL CHIKI PUNCTUATION MUCAAD
49
+ "᱿", # U+01C7F BC=L BLK=Ol_Chiki SC=Ol_Chiki OL CHIKI PUNCTUATION DOUBLE MUCAAD
50
+ "․", # U+02024 BC=ON BLK=General_Punctuation SC=Common ONE DOT LEADER
51
+ "‼", # U+0203C BC=ON BLK=General_Punctuation SC=Common DOUBLE EXCLAMATION MARK
52
+ "‽", # U+0203D BC=ON BLK=General_Punctuation SC=Common INTERROBANG
53
+ "⁇", # U+02047 BC=ON BLK=General_Punctuation SC=Common DOUBLE QUESTION MARK
54
+ "⁈", # U+02048 BC=ON BLK=General_Punctuation SC=Common QUESTION EXCLAMATION MARK
55
+ "⁉", # U+02049 BC=ON BLK=General_Punctuation SC=Common EXCLAMATION QUESTION MARK
56
+ "⸮", # U+02E2E BC=ON BLK=Supplemental_Punctuation SC=Common REVERSED QUESTION MARK
57
+ "⸼", # U+02E3C BC=ON BLK=Supplemental_Punctuation SC=Common STENOGRAPHIC FULL STOP
58
+ "⹓", # U+02E53 BC=ON BLK=Supplemental_Punctuation SC=Common MEDIEVAL EXCLAMATION MARK
59
+ "⹔", # U+02E54 BC=ON BLK=Supplemental_Punctuation SC=Common MEDIEVAL QUESTION MARK
60
+ "꓿", # U+0A4FF BC=L BLK=Lisu SC=Lisu LISU PUNCTUATION FULL STOP
61
+ "꘎", # U+0A60E BC=ON BLK=Vai SC=Vai VAI FULL STOP
62
+ "꘏", # U+0A60F BC=ON BLK=Vai SC=Vai VAI QUESTION MARK
63
+ "꛳", # U+0A6F3 BC=L BLK=Bamum SC=Bamum BAMUM FULL STOP
64
+ "꛷", # U+0A6F7 BC=L BLK=Bamum SC=Bamum BAMUM QUESTION MARK
65
+ "꡶", # U+0A876 BC=ON BLK=Phags-pa SC=Phags_Pa PHAGS-PA MARK SHAD
66
+ "꡷", # U+0A877 BC=ON BLK=Phags-pa SC=Phags_Pa PHAGS-PA MARK DOUBLE SHAD
67
+ "꣎", # U+0A8CE BC=L BLK=Saurashtra SC=Saurashtra SAURASHTRA DANDA
68
+ "꣏", # U+0A8CF BC=L BLK=Saurashtra SC=Saurashtra SAURASHTRA DOUBLE DANDA
69
+ "꤯", # U+0A92F BC=L BLK=Kayah_Li SC=Kayah_Li KAYAH LI SIGN SHYA
70
+ "꧈", # U+0A9C8 BC=L BLK=Javanese SC=Javanese JAVANESE PADA LINGSA
71
+ "꧉", # U+0A9C9 BC=L BLK=Javanese SC=Javanese JAVANESE PADA LUNGSI
72
+ "꩝", # U+0AA5D BC=L BLK=Cham SC=Cham CHAM PUNCTUATION DANDA
73
+ "꩞", # U+0AA5E BC=L BLK=Cham SC=Cham CHAM PUNCTUATION DOUBLE DANDA
74
+ "꩟", # U+0AA5F BC=L BLK=Cham SC=Cham CHAM PUNCTUATION TRIPLE DANDA
75
+ "꫰", # U+0AAF0 BC=L BLK=Meetei_Mayek_Extensions SC=Meetei_Mayek MEETEI MAYEK CHEIKHAN
76
+ "꫱", # U+0AAF1 BC=L BLK=Meetei_Mayek_Extensions SC=Meetei_Mayek MEETEI MAYEK AHANG KHUDAM
77
+ "꯫", # U+0ABEB BC=L BLK=Meetei_Mayek SC=Meetei_Mayek MEETEI MAYEK CHEIKHEI
78
+ "﹒", # U+0FE52 BC=CS BLK=Small_Form_Variants SC=Common SMALL FULL STOP
79
+ "﹖", # U+0FE56 BC=ON BLK=Small_Form_Variants SC=Common SMALL QUESTION MARK
80
+ "﹗", # U+0FE57 BC=ON BLK=Small_Form_Variants SC=Common SMALL EXCLAMATION MARK
81
+ "！", # U+0FF01 BC=ON BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH EXCLAMATION MARK
82
+ "．", # U+0FF0E BC=CS BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH FULL STOP
83
+ "？", # U+0FF1F BC=ON BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH QUESTION MARK
84
+ "𐩖", # U+10A56 BC=R BLK=Kharoshthi SC=Kharoshthi KHAROSHTHI PUNCTUATION DANDA
85
+ "𐩗", # U+10A57 BC=R BLK=Kharoshthi SC=Kharoshthi KHAROSHTHI PUNCTUATION DOUBLE DANDA
86
+ "𐽕", # U+10F55 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO VERTICAL BARS
87
+ "𐽖", # U+10F56 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO VERTICAL BARS WITH DOTS
88
+ "𐽗", # U+10F57 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION CIRCLE WITH DOT
89
+ "𐽘", # U+10F58 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO CIRCLES WITH DOTS
90
+ "𐽙", # U+10F59 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
91
+ "𐾆", # U+10F86 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION BAR
92
+ "𐾇", # U+10F87 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION TWO BARS
93
+ "𐾈", # U+10F88 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION TWO DOTS
94
+ "𐾉", # U+10F89 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION FOUR DOTS
95
+ "𑁇", # U+11047 BC=L BLK=Brahmi SC=Brahmi BRAHMI DANDA
96
+ "𑁈", # U+11048 BC=L BLK=Brahmi SC=Brahmi BRAHMI DOUBLE DANDA
97
+ "𑂾", # U+110BE BC=L BLK=Kaithi SC=Kaithi KAITHI SECTION MARK
98
+ "𑂿", # U+110BF BC=L BLK=Kaithi SC=Kaithi KAITHI DOUBLE SECTION MARK
99
+ "𑃀", # U+110C0 BC=L BLK=Kaithi SC=Kaithi KAITHI DANDA
100
+ "𑃁", # U+110C1 BC=L BLK=Kaithi SC=Kaithi KAITHI DOUBLE DANDA
101
+ "𑅁", # U+11141 BC=L BLK=Chakma SC=Chakma CHAKMA DANDA
102
+ "𑅂", # U+11142 BC=L BLK=Chakma SC=Chakma CHAKMA DOUBLE DANDA
103
+ "𑅃", # U+11143 BC=L BLK=Chakma SC=Chakma CHAKMA QUESTION MARK
104
+ "𑇅", # U+111C5 BC=L BLK=Sharada SC=Sharada SHARADA DANDA
105
+ "𑇆", # U+111C6 BC=L BLK=Sharada SC=Sharada SHARADA DOUBLE DANDA
106
+ "𑇍", # U+111CD BC=L BLK=Sharada SC=Sharada SHARADA SUTRA MARK
107
+ "𑇞", # U+111DE BC=L BLK=Sharada SC=Sharada SHARADA SECTION MARK-1
108
+ "𑇟", # U+111DF BC=L BLK=Sharada SC=Sharada SHARADA SECTION MARK-2
109
+ "𑈸", # U+11238 BC=L BLK=Khojki SC=Khojki KHOJKI DANDA
110
+ "𑈹", # U+11239 BC=L BLK=Khojki SC=Khojki KHOJKI DOUBLE DANDA
111
+ "𑈻", # U+1123B BC=L BLK=Khojki SC=Khojki KHOJKI SECTION MARK
112
+ "𑈼", # U+1123C BC=L BLK=Khojki SC=Khojki KHOJKI DOUBLE SECTION MARK
113
+ "𑊩", # U+112A9 BC=L BLK=Multani SC=Multani MULTANI SECTION MARK
114
+ "𑑋", # U+1144B BC=L BLK=Newa SC=Newa NEWA DANDA
115
+ "𑑌", # U+1144C BC=L BLK=Newa SC=Newa NEWA DOUBLE DANDA
116
+ "𑗂", # U+115C2 BC=L BLK=Siddham SC=Siddham SIDDHAM DANDA
117
+ "𑗃", # U+115C3 BC=L BLK=Siddham SC=Siddham SIDDHAM DOUBLE DANDA
118
+ "𑗉", # U+115C9 BC=L BLK=Siddham SC=Siddham SIDDHAM END OF TEXT MARK
119
+ "𑗊", # U+115CA BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIDENT AND U-SHAPED ORNAMENTS
120
+ "𑗋", # U+115CB BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIDENT AND DOTTED CRESCENTS
121
+ "𑗌", # U+115CC BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED CRESCENTS
122
+ "𑗍", # U+115CD BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED DOUBLE CRESCENTS
123
+ "𑗎", # U+115CE BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED TRIPLE CRESCENTS
124
+ "𑗏", # U+115CF BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK DOUBLE RING
125
+ "𑗐", # U+115D0 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK DOUBLE RING WITH RAYS
126
+ "𑗑", # U+115D1 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH DOUBLE CRESCENTS
127
+ "𑗒", # U+115D2 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIPLE CRESCENTS
128
+ "𑗓", # U+115D3 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH QUADRUPLE CRESCENTS
129
+ "𑗔", # U+115D4 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH SEPTUPLE CRESCENTS
130
+ "𑗕", # U+115D5 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND RAYS
131
+ "𑗖", # U+115D6 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND TWO ENCLOSURES
132
+ "𑗗", # U+115D7 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
133
+ "𑙁", # U+11641 BC=L BLK=Modi SC=Modi MODI DANDA
134
+ "𑙂", # U+11642 BC=L BLK=Modi SC=Modi MODI DOUBLE DANDA
135
+ "𑜼", # U+1173C BC=L BLK=Ahom SC=Ahom AHOM SIGN SMALL SECTION
136
+ "𑜽", # U+1173D BC=L BLK=Ahom SC=Ahom AHOM SIGN SECTION
137
+ "𑜾", # U+1173E BC=L BLK=Ahom SC=Ahom AHOM SIGN RULAI
138
+ "𑥄", # U+11944 BC=L BLK=Dives_Akuru SC=Dives_Akuru DIVES AKURU DOUBLE DANDA
139
+ "𑥆", # U+11946 BC=L BLK=Dives_Akuru SC=Dives_Akuru DIVES AKURU END OF TEXT MARK
140
+ "𑩂", # U+11A42 BC=L BLK=Zanabazar_Square SC=Zanabazar_Square ZANABAZAR SQUARE MARK SHAD
141
+ "𑩃", # U+11A43 BC=L BLK=Zanabazar_Square SC=Zanabazar_Square ZANABAZAR SQUARE MARK DOUBLE SHAD
142
+ "𑪛", # U+11A9B BC=L BLK=Soyombo SC=Soyombo SOYOMBO MARK SHAD
143
+ "𑪜", # U+11A9C BC=L BLK=Soyombo SC=Soyombo SOYOMBO MARK DOUBLE SHAD
144
+ "𑱁", # U+11C41 BC=L BLK=Bhaiksuki SC=Bhaiksuki BHAIKSUKI DANDA
145
+ "𑱂", # U+11C42 BC=L BLK=Bhaiksuki SC=Bhaiksuki BHAIKSUKI DOUBLE DANDA
146
+ "𑻷", # U+11EF7 BC=L BLK=Makasar SC=Makasar MAKASAR PASSIMBANG
147
+ "𑻸", # U+11EF8 BC=L BLK=Makasar SC=Makasar MAKASAR END OF SECTION
148
+ "𑽃", # U+11F43 BC=L BLK=Kawi SC=Kawi KAWI DANDA
149
+ "𑽄", # U+11F44 BC=L BLK=Kawi SC=Kawi KAWI DOUBLE DANDA
150
+ "𖩮", # U+16A6E BC=L BLK=Mro SC=Mro MRO DANDA
151
+ "𖩯", # U+16A6F BC=L BLK=Mro SC=Mro MRO DOUBLE DANDA
152
+ "𖫵", # U+16AF5 BC=L BLK=Bassa_Vah SC=Bassa_Vah BASSA VAH FULL STOP
153
+ "𖬷", # U+16B37 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN VOS THOM
154
+ "𖬸", # U+16B38 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN VOS TSHAB CEEB
155
+ "𖭄", # U+16B44 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN XAUS
156
+ "𖺘", # U+16E98 BC=L BLK=Medefaidrin SC=Medefaidrin MEDEFAIDRIN FULL STOP
157
+ "𛲟", # U+1BC9F BC=L BLK=Duployan SC=Duployan DUPLOYAN PUNCTUATION CHINOOK FULL STOP
158
+ "𝪈", # U+1DA88 BC=L BLK=Sutton_SignWriting SC=SignWriting SIGNWRITING FULL STOP
159
+ ]
160
+ + [
161
+ # Additional manual entries.
162
+ "…", # U+2026 HORIZONTAL ELLIPSIS
163
+ "。", # U+3002 IDEOGRAPHIC FULL STOP
164
+ ]
165
+ )
@@ -9,7 +9,7 @@ tests = [
9
9
  ),
10
10
  (
11
11
  "ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተማ በስተ ምዕራብ ተወለዱ።",
12
- ["ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻ", "ርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተማ በስተ ምዕራብ ተወለዱ።"],
12
+ ["ቴዎድሮስ ጥር ፮ ቀን ፲፰፻፲፩ ዓ.ም. ሻርጌ በተባለ ቦታ ቋራ ውስጥ፣ ከጎንደር ከተማ በስተ ምዕራብ ተወለዱ።"],
13
13
  ),
14
14
  ]
15
15
 
@@ -1,143 +0,0 @@
1
- # unicode code points with the \p{Sentence_Break=STerm} or \p{Sentence_Break=ATerm} properties that
2
- # also have the \p{Terminal_Punctuation} property generated with Unicode::Tussle perl script and
3
- # additional fullstops in unicode character sets : https://www.fileformat.info/info/unicode/char/search.htm?q=.&
4
- # preview=entity
5
- GLOBAL_SENTENCE_TERMINATORS = [
6
- "...", # Horizontal Ellipsis
7
- "!", # Exclamation Mark
8
- ".", # Full Stop
9
- "?", # Question Mark
10
- "։", # Armenian Full Stop
11
- "؞", # Arabic Sign Sallallahou Alayhe Wasallam
12
- "؟", # Arabic Question Mark
13
- "۔", # Arabic Full Stop
14
- "܀", # Syriac End of Paragraph
15
- "܁", # Syriac Supralinear Colon
16
- "܂", # Syriac Sublinear Colon
17
- "߹", # Nko Symbol Doorye
18
- "࠷", # Samarkan Letter Do
19
- "࠹", # Samarkan Letter Jho
20
- "࠽", # Samarkan Letter Ro
21
- "࠾", # Samarkan Letter Lo
22
- "।", # Devanagari Danda
23
- "॥", # Devanagari Double Danda
24
- "၊", # Myanmar Sign Myanmar Phrase Stop
25
- "။", # Myanmar Sign Myanmar Paragraph
26
- "።", # Ethiopic Full Stop
27
- "፧", # Ethiopic Colon
28
- "፨", # Ethiopic Preface Colon
29
- "᙮", # Ethiopic Question Mark
30
- "᜵", # Buginese Vowel Sign E
31
- "᜶", # Buginese Vowel Sign O
32
- "᠃", # Mongolian Full Stop
33
- "᠉", # Mongolian Birga
34
- "᥄", # Buhid Virama
35
- "᥅", # Buhid Punctuation Mark
36
- "᪨", # Tai Tham Consonant Sign Medial Ra
37
- "᪩", # Tai Tham Consonant Sign Medial La
38
- "᪪", # Tai Tham Consonant Sign La Taa
39
- "᪫", # Tai Tham Sign Mai Sak
40
- "᭚", # Balinese Pameneng
41
- "᭛", # Balinese Musical Symbol Combining Jublag
42
- "᭞", # Sundanese Padasan Agung
43
- "᭟", # Sundanese Paneken
44
- "᰻", # Buhid Pamudpod
45
- "᰼", # Buhid Pamudpod Han
46
- "᱾", # Limbu Question Mark
47
- "᱿", # Limbu Exclamation Mark
48
- "‼", # Double Exclamation Mark
49
- "‽", # Interrobang
50
- "⁇", # Double Question Mark
51
- "⁈", # Question Exclamation Mark
52
- "⁉", # Exclamation Question Mark
53
- "⸮", # Reversed Question Mark
54
- "⸼", # Armenian Parenthesis Right
55
- "꓿", # Yi Punctuation Small Comma
56
- "꘎", # Vai Comma
57
- "꘏", # Vai Full Stop
58
- "꛳", # Batak Apostrophe
59
- "꛷", # Batak Pangolat
60
- "꡶", # Lanna Punctation Phrase
61
- "꡷", # Lanna Punctation Paragraph
62
- "꣎", # Ol Chiki Punctuation Mucaad
63
- "꣏", # Ol Chiki Punctuation Double
64
- "꤯", # Chakma Sign Visarga
65
- "꧈", # Balinese Musical Symbol Left-Hand Open Dug
66
- "꧉", # Balinese Musical Symbol Right-Hand Open Dug
67
- "꩝", # Cham Consonant Sign Final H
68
- "꩞", # Cham Consonant Sign Glottal Stop
69
- "꩟", # Cham Consonant Sign M
70
- "꫰", # Tai Viet Mai Khit
71
- "꫱", # Tai Viet Vowel Ia
72
- "꯫", # Meetei Mayek Cheikhei
73
- "﹒", # Small Full Stop
74
- "﹖", # Small Question Mark
75
- "﹗", # Small Exclamation Mark
76
- "！", # Fullwidth Exclamation Mark
77
- "．", # Fullwidth Full Stop
78
- "？", # Fullwidth Question Mark
79
- "ཕ", # Tibetan Letter Pha
80
- "བ", # Tibetan Letter Ba
81
- "བྷ", # Tibetan Letter Bha
82
- "མ", # Tibetan Letter Ma
83
- "ཙ", # Tibetan Letter Tsa
84
- "၇", # Myanmar Digit Seven
85
- "၈", # Myanmar Digit Eight
86
- "Ⴞ", # Georgian Letter Har
87
- "Ⴟ", # Georgian Letter Hae
88
- "Ⴠ", # Georgian Letter Hoe
89
- "Ⴡ", # Georgian Letter Yu
90
- "ᅁ", # Hangul Letter Yeorin Hieuh
91
- "ᅂ", # Hangul Letter Yeorin Simeum
92
- "ᅃ", # Hangul Letter Yeorin Cieuc
93
- "ᇅ", # Hangul Letter Phieuph-Pieup
94
- "ᇆ", # Hangul Letter Kapyeounphieuph
95
- "ᇍ", # Hangul Letter Kapyeounhieuh
96
- "ᇞ", # Hangul Letter Yang-Hieuh
97
- "ᇟ", # Hangul Letter Yo-Yae
98
- "ሸ", # Ethiopic Syllable Shee
99
- "ሹ", # Ethiopic Syllable Shuu
100
- "ሻ", # Ethiopic Syllable Shaa
101
- "ሼ", # Ethiopic Syllable She
102
- "ኩ", # Ethiopic Syllable Ku
103
- "ᑋ", # Canadian Syllabics We
104
- "ᑌ", # Canadian Syllabics West-Cree Pa
105
- "ᗂ", # Canadian Syllabics South Slavey Lo
106
- "ᗃ", # Canadian Syllabics South Slavey Lu
107
- "ᗉ", # Canadian Syllabics Carrier Syllabic Yay
108
- "ᗊ", # Canadian Syllabics Carrier Syllabic Yaa
109
- "ᗋ", # Canadian Syllabics Carrier Syllabic Ywe
110
- "ᗌ", # Canadian Syllabics Carrier Syllabic Ywi
111
- "ᗍ", # Canadian Syllabics Carrier Syllabic Ywii
112
- "ᗎ", # Canadian Syllabics Carrier Syllabic Ywo
113
- "ᗏ", # Canadian Syllabics Carrier Syllabic Ywoo
114
- "ᗐ", # Canadian Syllabics Carrier Syllabic Ywi
115
- "ᗗ", # Canadian Syllabics Cree-Cha
116
- "ᙁ", # Canadian Syllabics Slavey She
117
- "ᙂ", # Canadian Syllabics Chipewyan Ga
118
- "᥄", # Ethiopic Syllable Gwa
119
- "᥆", # Ethiopic Syllable Gwo
120
- "ᩂ", # Tai Tham Consonant Sign Low Ha
121
- "ᩃ", # Tai Tham Consonant Sign High Ha
122
- "᱁", # Ethiopic Syllable Hoa
123
- "᱂", # Ethiopic Syllable Hoa
124
- "ỷ", # Latin Small Letter Y With Tilde
125
- "Ỹ", # Latin Capital Letter Y With Tilde
126
- "橮", # CJK Unified Ideograph-6AEE
127
- "橯", # CJK Unified Ideograph-6AEF
128
- "櫵", # CJK Unified Ideograph-6AF5
129
- "欷", # CJK Unified Ideograph-6B37
130
- "欸", # CJK Unified Ideograph-6B38
131
- "歄", # CJK Unified Ideograph-6B84
132
- "溘", # CJK Unified Ideograph-6E98
133
- "벟", # Hangul Syllable Eq
134
- "⳹", # Greek Small Letter Ous
135
- "⳾", # Greek Small Letter Psi
136
- "。", # Ideographic Full Stop
137
- "︒", # Presentation Form For Vertical Ideographic Full Stop
138
- "。", # Halfwidth Katakana Middle Dot
139
- "𖫵", # Mongolian Vowel Separator
140
- "𖺘", # Mongolian Letter Ali Gali U
141
- "𛲟", # Hanifi Rohingya Sign Harbahay
142
- "𝪈", # Mathematical Bold Capital U
143
- ] # 150 symbols
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes