sentencex 1.0.4.tar.gz → 1.0.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sentencex might be problematic.

Files changed (125)
  1. {sentencex-1.0.4 → sentencex-1.0.5}/Cargo.lock +4 -4
  2. {sentencex-1.0.4 → sentencex-1.0.5}/PKG-INFO +1 -1
  3. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/Cargo.toml +1 -1
  4. sentencex-1.0.5/patient.txt +4 -0
  5. {sentencex-1.0.4 → sentencex-1.0.5}/pyproject.toml +1 -1
  6. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/de.rs +0 -4
  7. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/it.rs +0 -4
  8. sentencex-1.0.5/src/languages/language.rs +415 -0
  9. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/ta.rs +0 -4
  10. {sentencex-1.0.4 → sentencex-1.0.5}/src/lib.rs +2 -17
  11. {sentencex-1.0.4 → sentencex-1.0.5}/tests/en.txt +14 -1
  12. sentencex-1.0.4/src/languages/language.rs +0 -288
  13. {sentencex-1.0.4 → sentencex-1.0.5}/.github/workflows/node.yaml +0 -0
  14. {sentencex-1.0.4 → sentencex-1.0.5}/.github/workflows/python.yaml +0 -0
  15. {sentencex-1.0.4 → sentencex-1.0.5}/.github/workflows/rust.yml +0 -0
  16. {sentencex-1.0.4 → sentencex-1.0.5}/.github/workflows/wasm.yaml +0 -0
  17. {sentencex-1.0.4 → sentencex-1.0.5}/.gitignore +0 -0
  18. {sentencex-1.0.4 → sentencex-1.0.5}/100-0.txt +0 -0
  19. {sentencex-1.0.4 → sentencex-1.0.5}/11-0.txt +0 -0
  20. {sentencex-1.0.4 → sentencex-1.0.5}/1661-0.txt +0 -0
  21. {sentencex-1.0.4 → sentencex-1.0.5}/LICENSE +0 -0
  22. {sentencex-1.0.4 → sentencex-1.0.5}/README.md +0 -0
  23. {sentencex-1.0.4 → sentencex-1.0.5}/TODO.md +0 -0
  24. {sentencex-1.0.4 → sentencex-1.0.5}/benches/segment_benchmark.rs +0 -0
  25. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/.gitignore +0 -0
  26. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/.python-version +0 -0
  27. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/Cargo.lock +0 -0
  28. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/README.md +0 -0
  29. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/example.py +0 -0
  30. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/publish.sh +0 -0
  31. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/src/lib.rs +0 -0
  32. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/tests/__init__.py +0 -0
  33. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/tests/test_sentencex.py +0 -0
  34. {sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/uv.lock +0 -0
  35. {sentencex-1.0.4 → sentencex-1.0.5}/demo/index.html +0 -0
  36. {sentencex-1.0.4 → sentencex-1.0.5}/examples/rust_example.rs +0 -0
  37. {sentencex-1.0.4 → sentencex-1.0.5}/oxygen.txt +0 -0
  38. {sentencex-1.0.4 → sentencex-1.0.5}/paris.txt +0 -0
  39. {sentencex-1.0.4 → sentencex-1.0.5}/src/constants.rs +0 -0
  40. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/am.txt +0 -0
  41. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/ar.txt +0 -0
  42. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/bg.txt +0 -0
  43. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/bn.txt +0 -0
  44. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/da.txt +0 -0
  45. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/de.txt +0 -0
  46. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/el.txt +0 -0
  47. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/en.txt +0 -0
  48. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/es.txt +0 -0
  49. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/fi.txt +0 -0
  50. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/fr.txt +0 -0
  51. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/gu.txt +0 -0
  52. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/hi.txt +0 -0
  53. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/it.txt +0 -0
  54. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/kk.txt +0 -0
  55. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/kn.txt +0 -0
  56. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/ml.txt +0 -0
  57. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/nl.txt +0 -0
  58. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/pa.txt +0 -0
  59. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/pl.txt +0 -0
  60. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/pt.txt +0 -0
  61. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/ru.txt +0 -0
  62. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/sk.txt +0 -0
  63. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/ta.txt +0 -0
  64. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/abbrev/te.txt +0 -0
  65. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/am.rs +0 -0
  66. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/ar.rs +0 -0
  67. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/bg.rs +0 -0
  68. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/bn.rs +0 -0
  69. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/ca.rs +0 -0
  70. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/da.rs +0 -0
  71. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/el.rs +0 -0
  72. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/en.rs +0 -0
  73. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/es.rs +0 -0
  74. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/fallbacks.yaml +0 -0
  75. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/fi.rs +0 -0
  76. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/fr.rs +0 -0
  77. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/gu.rs +0 -0
  78. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/hi.rs +0 -0
  79. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/hy.rs +0 -0
  80. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/ja.rs +0 -0
  81. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/kk.rs +0 -0
  82. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/kn.rs +0 -0
  83. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/ml.rs +0 -0
  84. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/mod.rs +0 -0
  85. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/mr.rs +0 -0
  86. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/my.rs +0 -0
  87. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/nl.rs +0 -0
  88. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/pa.rs +0 -0
  89. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/pl.rs +0 -0
  90. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/pt.rs +0 -0
  91. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/ru.rs +0 -0
  92. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/sk.rs +0 -0
  93. {sentencex-1.0.4 → sentencex-1.0.5}/src/languages/te.rs +0 -0
  94. {sentencex-1.0.4 → sentencex-1.0.5}/src/main.rs +0 -0
  95. {sentencex-1.0.4 → sentencex-1.0.5}/tests/am.txt +0 -0
  96. {sentencex-1.0.4 → sentencex-1.0.5}/tests/ar.txt +0 -0
  97. {sentencex-1.0.4 → sentencex-1.0.5}/tests/bg.txt +0 -0
  98. {sentencex-1.0.4 → sentencex-1.0.5}/tests/bn.txt +0 -0
  99. {sentencex-1.0.4 → sentencex-1.0.5}/tests/ca.txt +0 -0
  100. {sentencex-1.0.4 → sentencex-1.0.5}/tests/da.txt +0 -0
  101. {sentencex-1.0.4 → sentencex-1.0.5}/tests/de.txt +0 -0
  102. {sentencex-1.0.4 → sentencex-1.0.5}/tests/el.txt +0 -0
  103. {sentencex-1.0.4 → sentencex-1.0.5}/tests/es.txt +0 -0
  104. {sentencex-1.0.4 → sentencex-1.0.5}/tests/fi.txt +0 -0
  105. {sentencex-1.0.4 → sentencex-1.0.5}/tests/fr.txt +0 -0
  106. {sentencex-1.0.4 → sentencex-1.0.5}/tests/gu.txt +0 -0
  107. {sentencex-1.0.4 → sentencex-1.0.5}/tests/hi.txt +0 -0
  108. {sentencex-1.0.4 → sentencex-1.0.5}/tests/hy.txt +0 -0
  109. {sentencex-1.0.4 → sentencex-1.0.5}/tests/it.txt +0 -0
  110. {sentencex-1.0.4 → sentencex-1.0.5}/tests/ja.txt +0 -0
  111. {sentencex-1.0.4 → sentencex-1.0.5}/tests/kk.txt +0 -0
  112. {sentencex-1.0.4 → sentencex-1.0.5}/tests/kn.txt +0 -0
  113. {sentencex-1.0.4 → sentencex-1.0.5}/tests/ml.txt +0 -0
  114. {sentencex-1.0.4 → sentencex-1.0.5}/tests/mr.txt +0 -0
  115. {sentencex-1.0.4 → sentencex-1.0.5}/tests/my.txt +0 -0
  116. {sentencex-1.0.4 → sentencex-1.0.5}/tests/nl.txt +0 -0
  117. {sentencex-1.0.4 → sentencex-1.0.5}/tests/pa.txt +0 -0
  118. {sentencex-1.0.4 → sentencex-1.0.5}/tests/pl.txt +0 -0
  119. {sentencex-1.0.4 → sentencex-1.0.5}/tests/pt.txt +0 -0
  120. {sentencex-1.0.4 → sentencex-1.0.5}/tests/ru.txt +0 -0
  121. {sentencex-1.0.4 → sentencex-1.0.5}/tests/sk.txt +0 -0
  122. {sentencex-1.0.4 → sentencex-1.0.5}/tests/ta.txt +0 -0
  123. {sentencex-1.0.4 → sentencex-1.0.5}/tests/te.txt +0 -0
  124. {sentencex-1.0.4 → sentencex-1.0.5}/tests/ur.txt +0 -0
  125. {sentencex-1.0.4 → sentencex-1.0.5}/tests/zh.txt +0 -0

{sentencex-1.0.4 → sentencex-1.0.5}/Cargo.lock
@@ -653,7 +653,7 @@ checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73"
 
 [[package]]
 name = "sentencex"
-version = "0.1.4"
+version = "0.1.5"
 dependencies = [
  "clap",
  "criterion",
@@ -666,7 +666,7 @@ dependencies = [
 
 [[package]]
 name = "sentencex-js"
-version = "1.0.4"
+version = "1.0.5"
 dependencies = [
  "neon",
  "neon-build",
@@ -675,7 +675,7 @@ dependencies = [
 
 [[package]]
 name = "sentencex-py"
-version = "0.1.4"
+version = "0.1.5"
 dependencies = [
  "pyo3",
  "sentencex",
@@ -683,7 +683,7 @@ dependencies = [
 
 [[package]]
 name = "sentencex-wasm"
-version = "0.1.4"
+version = "0.1.5"
 dependencies = [
  "sentencex",
  "serde",

{sentencex-1.0.4 → sentencex-1.0.5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sentencex
-Version: 1.0.4
+Version: 1.0.5
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
 Classifier: Topic :: Text Processing

{sentencex-1.0.4 → sentencex-1.0.5}/bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sentencex-py"
-version = "0.1.4"
+version = "0.1.5"
 edition = "2024"
 description = "Sentence segmentation library with wide language support optimized for speed and utility."
 authors = ["Santhosh Thottingal <santhosh.thottingal@gmail.com>"]

sentencex-1.0.5/patient.txt (new file)
@@ -0,0 +1,4 @@
+Dr. Smith, who arrived at 3:00 p.m., said, "The patient's condition is stable; however, further tests (e.g., MRI, CT scan) are required.". Meanwhile, the nurse—who had been on duty since 7:00 a.m.—noted that the patient's vitals (BP: 120/80, HR: 75) were within normal limits. "Let's proceed with the tests ASAP," she added.
+
+
+Hello!?..!..!

{sentencex-1.0.4 → sentencex-1.0.5}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sentencex"
-version = "1.0.4"
+version = "1.0.5"
 requires-python = ">=3.10"
 description = "Sentence segmenter that supports ~300 languages"
 authors = [{name = "Santhosh Thottingal", email = "santhosh.thottingal@gmail.com"}]

{sentencex-1.0.4 → sentencex-1.0.5}/src/languages/de.rs
@@ -75,10 +75,6 @@ impl Language for Deutch {
 
         false
     }
-
-    fn is_punctuation_between_quotes(&self) -> bool {
-        true
-    }
 }
 
 #[cfg(test)]

{sentencex-1.0.4 → sentencex-1.0.5}/src/languages/it.rs
@@ -17,10 +17,6 @@ impl Language for Italian {
         &ITALIAN_ABBREVIATIONS
     }
 
-    fn is_punctuation_between_quotes(&self) -> bool {
-        false
-    }
-
     fn get_last_word<'a>(&self, text: &'a str) -> &'a str {
         let words: Vec<&str> = text
             .split(|c: char| c.is_whitespace() || c == '.')

sentencex-1.0.5/src/languages/language.rs (new file)
@@ -0,0 +1,415 @@
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use crate::SentenceBoundary;
+use crate::constants::EMAIL_REGEX;
+use crate::constants::EXCLAMATION_WORDS;
+use crate::constants::GLOBAL_SENTENCE_TERMINATORS;
+use crate::constants::PARENS_REGEX;
+use crate::constants::QUOTES_REGEX;
+
+lazy_static::lazy_static! {
+    static ref SENTENCE_BREAK_REGEX_CACHE: Mutex<HashMap<String, Regex>> = Mutex::new(HashMap::new());
+    static ref CONTINUE_REGEX: Regex = Regex::new(r"^[0-9a-z]").unwrap();
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum SkippableRangeType {
+    Quote,
+    Parentheses,
+    Email,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct SkippableRange {
+    pub start: usize,
+    pub end: usize,
+    pub range_type: SkippableRangeType,
+}
+
+impl SkippableRange {
+    pub fn new(start: usize, end: usize, range_type: SkippableRangeType) -> Self {
+        Self {
+            start,
+            end,
+            range_type,
+        }
+    }
+
+    pub fn contains(&self, position: usize) -> bool {
+        position > self.start && position < self.end
+    }
+
+    pub fn is_quote(&self) -> bool {
+        self.range_type == SkippableRangeType::Quote
+    }
+}
+
+pub trait Language {
+    /// Returns a compiled regex pattern that matches sentence terminating punctuation.
+    /// This regex is used to identify potential sentence boundaries in text.
+    /// The pattern is cached for performance and includes all global sentence terminators
+    /// like periods, exclamation marks, and question marks.
+    fn get_sentence_break_regex(&self) -> Regex {
+        let pattern = format!("[{}]+", GLOBAL_SENTENCE_TERMINATORS.join(""));
+
+        // Try to get from cache first
+        {
+            let cache = SENTENCE_BREAK_REGEX_CACHE.lock().unwrap();
+            if let Some(regex) = cache.get(&pattern) {
+                return regex.clone();
+            }
+        }
+
+        // Create new regex and cache it
+        let regex = Regex::new(&pattern).unwrap();
+        {
+            let mut cache = SENTENCE_BREAK_REGEX_CACHE.lock().unwrap();
+            cache.insert(pattern, regex.clone());
+        }
+
+        regex
+    }
+
+    /// Analyzes the input text and returns a vector of sentence boundaries.
+    /// This is the main method for sentence segmentation that:
+    /// 1. Splits text into paragraphs at double newlines
+    /// 2. Identifies potential sentence breaks using regex patterns
+    /// 3. Filters out false positives (abbreviations, quotes, etc.)
+    /// 4. Returns structured boundary information including start/end positions and boundary symbols
+    /// Each boundary contains the sentence text, position indices, and metadata about the boundary type.
+    fn get_sentence_boundaries<'a>(&self, text: &'a str) -> Vec<SentenceBoundary<'a>> {
+        // Pre-allocate boundaries with estimated capacity (rough estimate: 1 sentence per 50 characters)
+        let estimated_sentences = (text.len() / 50).max(1);
+        let mut boundaries = Vec::with_capacity(estimated_sentences);
+
+        // Split by paragraph breaks (one or more newlines with optional whitespace)
+        let para_split_re = Regex::new(r"\n[\r]*\n").unwrap();
+        let paragraphs: Vec<&str> = para_split_re.split(text).collect();
+
+        // Pre-calculate all paragraph offsets in one pass
+        let mut paragraph_offsets = Vec::with_capacity(paragraphs.len());
+        let mut current_offset = 0;
+        for (i, paragraph) in paragraphs.iter().enumerate() {
+            paragraph_offsets.push(current_offset);
+            current_offset += paragraph.len();
+            if i < paragraphs.len() - 1 {
+                current_offset += 2; // for "\n\n"
+            }
+        }
+
+        // Pre-allocate sentence_boundaries once and reuse for all paragraphs
+        let estimated_paragraph_sentences = 10; // reasonable default for typical paragraphs
+        let mut sentence_boundaries = Vec::with_capacity(estimated_paragraph_sentences);
+
+        for (pindex, paragraph) in paragraphs.iter().enumerate() {
+            if pindex > 0 {
+                let paragraph_start = paragraph_offsets[pindex];
+                boundaries.push(SentenceBoundary {
+                    start_index: paragraph_start,
+                    end_index: paragraph_start + 2,
+                    text: "\n\n",
+                    boundary_symbol: None,
+                    is_paragraph_break: true,
+                });
+            }
+
+            let paragraph_start_offset = if pindex == 0 {
+                0
+            } else {
+                paragraph_offsets[pindex] + 2
+            };
+            sentence_boundaries.clear();
+            sentence_boundaries.push(0);
+
+            let matches: Vec<(usize, usize)> = self
+                .get_sentence_break_regex()
+                .find_iter(paragraph)
+                .map(|m| (m.start(), m.end()))
+                .collect();
+            let skippable_ranges = self.get_skippable_ranges(paragraph);
+
+            for (start, end) in matches {
+                let mut boundary = self
+                    .find_boundary(paragraph, start, end)
+                    .unwrap_or(usize::MAX);
+
+                if boundary == usize::MAX {
+                    continue;
+                }
+
+                let mut in_range = false;
+
+                for range in &skippable_ranges {
+                    if range.contains(boundary) {
+                        let next_word = self.get_next_word_approx(text, range.end);
+                        let boundary_extend = self.get_boundary_extend(next_word);
+                        if range.is_quote() && boundary + 1 == range.end && boundary_extend >= 0 {
+                            boundary = range.end + boundary_extend as usize;
+                            in_range = false;
+                        } else {
+                            in_range = true;
+                        }
+                        break;
+                    }
+                }
+
+                if in_range {
+                    continue;
+                }
+
+                sentence_boundaries.push(boundary);
+            }
+
+            if *sentence_boundaries.last().unwrap() != paragraph.len() {
+                sentence_boundaries.push(paragraph.len());
+            }
+
+            for i in 0..sentence_boundaries.len() - 1 {
+                let start = sentence_boundaries[i];
+                let end = sentence_boundaries[i + 1];
+
+                if start >= paragraph.len() || end > paragraph.len() || start > end {
+                    continue;
+                }
+
+                let sentence_text = &paragraph[start..end];
+                let boundary_symbol = if end > 0 && end <= paragraph.len() {
+                    // Use char_indices for more efficient character iteration
+                    paragraph[..end]
+                        .char_indices()
+                        .next_back()
+                        .and_then(|(idx, _)| {
+                            let char_str = &paragraph[idx..end];
+                            if GLOBAL_SENTENCE_TERMINATORS.contains(&char_str) {
+                                Some(char_str.to_string())
+                            } else {
+                                None
+                            }
+                        })
+                } else {
+                    None
+                };
+
+                boundaries.push(SentenceBoundary {
+                    start_index: paragraph_start_offset + start,
+                    end_index: paragraph_start_offset + end,
+                    text: sentence_text,
+                    boundary_symbol,
+                    is_paragraph_break: false,
+                });
+            }
+        }
+
+        boundaries
+    }
+
+    /// Segments the input text into individual sentences and returns them as string slices.
+    /// This is a convenience method that builds on get_sentence_boundaries() but returns
+    /// only the sentence text content without the additional boundary metadata.
+    /// Used when you only need the segmented sentences and not their position information.
+    fn segment<'a>(&self, text: &'a str) -> Vec<&'a str> {
+        // Pre-allocate with estimated capacity based on text length
+        let estimated_sentences = (text.len() / 50).max(1);
+        let mut sentences = Vec::with_capacity(estimated_sentences);
+
+        let boundaries = self.get_sentence_boundaries(text);
+        for boundary in boundaries {
+            if !boundary.text.is_empty() {
+                sentences.push(boundary.text);
+            }
+        }
+
+        sentences
+    }
+
+    /// Returns the character used to mark abbreviations in this language.
+    /// By default returns "." (period), but should be overridden by specific languages
+    /// that use different abbreviation markers. Used by the abbreviation detection logic
+    /// to determine if a potential sentence boundary is actually an abbreviation.
+    fn get_abbreviation_char(&self) -> &str {
+        "."
+    }
+
+    /// Returns a list of known abbreviations for this language.
+    /// These are used to prevent false sentence breaks at abbreviation periods.
+    /// For example, "Dr." or "etc." should not trigger a sentence boundary.
+    /// Languages should override this to provide their specific abbreviation lists.
+    /// Returns an empty slice by default.
+    fn get_abbreviations(&self) -> &[String] {
+        &[]
+    }
+
+    /// Determines how many characters to extend a boundary when continuing into the next word.
+    /// Returns -1 if the word indicates the boundary should not be created (continuation case).
+    /// Returns 0 or positive number indicating how many whitespace/punctuation characters
+    /// to skip when positioning the boundary. Used to handle cases like quoted sentences
+    /// where the boundary should include trailing punctuation and whitespace.
+    fn get_boundary_extend(&self, word: &str) -> i8 {
+        if self.continue_in_next_word(word.trim()) {
+            // not a boundary.
+            return -1;
+        }
+
+        let mut count = 0i8;
+        for ch in word.chars() {
+            if ch.is_whitespace() || GLOBAL_SENTENCE_TERMINATORS.contains(&ch.to_string().as_str())
+            {
+                count += 1;
+                if count == i8::MAX {
+                    break; // Prevent overflow
+                }
+            } else {
+                break;
+            }
+        }
+
+        word.ceil_char_boundary(count as usize) as i8
+    }
+
+    /// Checks if a potential sentence boundary is actually part of an abbreviation.
+    /// Examines the text before the separator to see if it ends with a known abbreviation.
+    /// Returns true if this appears to be an abbreviation (and thus not a sentence boundary),
+    /// false if it's likely a genuine sentence end. Used to prevent breaking sentences
+    /// at abbreviations like "Dr. Smith" or "etc."
+    fn is_abbreviation(&self, head: &str, _tail: &str, separator: &str) -> bool {
+        if self.get_abbreviation_char() != separator {
+            return false;
+        }
+
+        let last_word = self.get_last_word(head);
+
+        if last_word.is_empty() {
+            return false;
+        }
+
+        let abbreviations = self.get_abbreviations();
+        let is_abbrev = abbreviations.contains(&last_word.to_string());
+        let is_abbrev_lower = abbreviations.contains(&last_word.to_lowercase());
+        let is_abbrev_upper = abbreviations.contains(&last_word.to_uppercase());
+
+        is_abbrev || is_abbrev_lower || is_abbrev_upper
+    }
+
+    /// Extracts the last word from the given text by splitting on whitespace and periods.
+    /// Used primarily by abbreviation detection to check if the word before a potential
+    /// sentence boundary is a known abbreviation. Returns an empty string if no words
+    /// are found. This is a performance-optimized version that avoids collecting all words.
+    fn get_last_word<'a>(&self, text: &'a str) -> &'a str {
+        // Find the last word without collecting all words
+        text.split(|c: char| c.is_whitespace() || c == '.')
+            .last()
+            .unwrap_or("")
+    }
+
+    /// Checks if a potential sentence boundary is actually an exclamation word that shouldn't
+    /// trigger a sentence break. Examines the last word before the boundary and checks if
+    /// it's in the list of known exclamation words (like "Hey!" or "Wow!").
+    /// Returns true if this is an exclamation that should not break the sentence.
+    fn is_exclamation(&self, head: &str, _tail: &str) -> bool {
+        let last_word = self.get_last_word(head);
+        let exclamation_word = format!("{}!", last_word);
+        EXCLAMATION_WORDS.contains(&exclamation_word.as_str())
+    }
+
+    /// Returns an approximate substring of the next word(s) starting from the given position.
+    /// Limited to a maximum of 30 characters for performance. Used to analyze context
+    /// after a potential sentence boundary to determine if the boundary should be created.
+    /// Handles UTF-8 character boundaries safely to avoid panics on non-ASCII text.
+    fn get_next_word_approx<'a>(&self, text: &'a str, start: usize) -> &'a str {
+        if start >= text.len() {
+            return "";
+        }
+
+        let max_chars = 30;
+        let safe_start = text.floor_char_boundary(start);
+        let end_pos = (start + max_chars).min(text.len());
+        &text[safe_start..text.floor_char_boundary(end_pos)]
+    }
+
+    /// Analyzes a potential sentence boundary and determines the exact position where
+    /// the sentence should end, or returns None if this shouldn't be a boundary.
+    /// Considers abbreviations, exclamations, numbered references, and continuation patterns.
+    /// This is the core logic that distinguishes true sentence boundaries from false positives
+    /// like abbreviations or mid-sentence punctuation.
+    fn find_boundary(&self, text: &str, start: usize, end: usize) -> Option<usize> {
+        let head = &text[..start];
+        let next_index = text.ceil_char_boundary(start + 1);
+
+        let next_word_approx = self.get_next_word_approx(text, next_index);
+
+        if let Some(number_ref_match) =
+            crate::constants::NUMBERED_REFERENCE_REGEX.find(next_word_approx)
+        {
+            return Some(next_index + number_ref_match.end());
+        }
+
+        if self.continue_in_next_word(next_word_approx) {
+            return None;
+        }
+
+        if self.is_abbreviation(head, next_word_approx, &text[start..end]) {
+            return None;
+        }
+
+        if self.is_exclamation(head, next_word_approx) {
+            return None;
+        }
+
+        if let Some(space_after_sep_match) =
+            crate::constants::SPACE_AFTER_SEPARATOR.find(next_word_approx)
+        {
+            return Some(next_index + space_after_sep_match.end());
+        }
+
+        Some(end)
+    }
+
+    /// Determines if the text after a potential boundary indicates the sentence should continue.
+    /// Returns true if the next word starts with a lowercase letter or number, suggesting
+    /// the sentence is continuing rather than starting a new one. This helps avoid breaking
+    /// sentences at abbreviations or in the middle of compound sentences.
+    fn continue_in_next_word(&self, text_after_boundary: &str) -> bool {
+        CONTINUE_REGEX.is_match(text_after_boundary)
+    }
+
+    /// Identifies ranges of text that should be skipped during sentence boundary detection.
+    /// This includes quoted text, parenthetical expressions, and email addresses where
+    /// internal punctuation should not trigger sentence breaks. Returns a sorted vector
+    /// of ranges that can be efficiently checked during boundary detection to avoid
+    /// false positives within these special text regions.
+    fn get_skippable_ranges(&self, text: &str) -> Vec<SkippableRange> {
+        // Pre-allocate with estimated capacity based on text length (rough estimate: 1 range per 200 characters)
+        let estimated_ranges = (text.len() / 200).max(1);
+        let mut skippable_ranges = Vec::with_capacity(estimated_ranges);
+
+        for mat in QUOTES_REGEX.find_iter(text) {
+            skippable_ranges.push(SkippableRange::new(
+                mat.start(),
+                mat.end(),
+                SkippableRangeType::Quote,
+            ));
+        }
+
+        for mat in PARENS_REGEX.find_iter(text) {
+            skippable_ranges.push(SkippableRange::new(
+                mat.start(),
+                mat.end(),
+                SkippableRangeType::Parentheses,
+            ));
+        }
+
+        for mat in EMAIL_REGEX.find_iter(text) {
+            skippable_ranges.push(SkippableRange::new(
+                mat.start(),
+                mat.end(),
+                SkippableRangeType::Email,
+            ));
+        }
+
+        // Sort ranges by start position for more efficient lookups
+        skippable_ranges.sort_unstable_by_key(|r| r.start);
+        skippable_ranges
+    }
+}
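
The new language.rs moves the whole segmentation pipeline into default methods on the Language trait, so per-language files only override the pieces they need (abbreviation lists, last-word extraction, and so on). As a rough orientation, a caller would go through language_factory (see the src/lib.rs hunk further down) and the trait's segment method; the import path below is an assumed re-export, not something this diff shows:

// Hedged usage sketch; `sentencex::language_factory` is a hypothetical
// re-export path. Only its signature is visible in the src/lib.rs hunk below.
use sentencex::language_factory;

fn main() {
    let lang = language_factory("en");
    // segment() wraps get_sentence_boundaries() and keeps only the text slices.
    let sentences = lang.segment("Dr. Smith arrived at 3 p.m. He left soon after.");
    for s in &sentences {
        println!("{:?}", s);
    }
}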

{sentencex-1.0.4 → sentencex-1.0.5}/src/languages/ta.rs
@@ -35,10 +35,6 @@ impl Language for Tamil {
     fn get_abbreviations(&self) -> &[String] {
         &TAMIL_ABBREVIATIONS
     }
-
-    fn is_punctuation_between_quotes(&self) -> bool {
-        true
-    }
 }
 
 #[cfg(test)]

{sentencex-1.0.4 → sentencex-1.0.5}/src/lib.rs
@@ -84,21 +84,6 @@ pub fn language_factory(language_code: &str) -> Box<dyn Language> {
     }
 }
 
-/// Find the nearest valid UTF-8 character boundary at or before the given byte index
-fn find_char_boundary(text: &str, mut byte_index: usize) -> usize {
-    // If we're already at or past the end, return text length
-    if byte_index >= text.len() {
-        return text.len();
-    }
-
-    // Walk forwards until we find a valid character boundary
-    while byte_index < text.len() && !text.is_char_boundary(byte_index) {
-        byte_index += 1;
-    }
-
-    byte_index
-}
-
 fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
     if chunk_size == 0 || text.len() <= chunk_size {
         return vec![text];
@@ -150,7 +135,7 @@ fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
 
         if potential_size > chunk_size {
             // Finalize current chunk
-            let safe_end = find_char_boundary(text, current_end);
+            let safe_end = text.ceil_char_boundary(current_end);
            chunks.push(&text[current_start..safe_end]);
 
             // Start new chunk with current paragraph
@@ -166,7 +151,7 @@ fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
 
     // Add the final chunk if there's remaining content
    if current_start < text.len() {
-        let safe_end = find_char_boundary(text, current_end);
+        let safe_end = text.ceil_char_boundary(current_end);
        chunks.push(&text[current_start..safe_end]);
    }
 
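The removed find_char_boundary helper walked forward to the next valid UTF-8 boundary; std's str::ceil_char_boundary performs the same rounding (depending on the toolchain, that method may still be feature-gated). A minimal sketch of the semantics chunk_text relies on, restated with the stable is_char_boundary API:

// Forward-rounding to a UTF-8 char boundary, equivalent to the removed helper.
fn ceil_boundary(text: &str, mut i: usize) -> usize {
    if i >= text.len() {
        return text.len(); // clamp past-the-end indices
    }
    while !text.is_char_boundary(i) {
        i += 1; // step past continuation bytes of a multibyte char
    }
    i
}

fn main() {
    let s = "héllo"; // 'é' occupies bytes 1..3
    assert_eq!(ceil_boundary(s, 2), 3); // byte 2 is mid-'é': round up
    assert_eq!(ceil_boundary(s, 1), 1); // byte 1 already starts a char
    assert_eq!(ceil_boundary(s, 99), s.len()); // out of range: clamp
}

This guarantees that the `&text[current_start..safe_end]` slices in chunk_text never split a multibyte character, which would panic at runtime.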

{sentencex-1.0.4 → sentencex-1.0.5}/tests/en.txt
@@ -167,7 +167,8 @@ She turned to him, "This is great." she said.
 ===
 She turned to him, "This is great." She held the book out to show him.
 ---
-She turned to him, "This is great." She held the book out to show him.
+She turned to him, "This is great."
+She held the book out to show him.
 ===
 Hello!! Long time no see.
 ---
@@ -313,3 +314,15 @@ Unblanaced bracket with period (. in it will break.
 ---
 Unblanaced bracket with period (.
 in it will break.
+===
+Dr. Smith, who arrived at 3:00 p.m., said, "The patient's condition is stable; however, further tests (e.g., MRI, CT scan) are required." Meanwhile, the nurse—who had been on duty since 7:00 a.m.—noted that the patient's vitals (BP: 120/80, HR: 75) were within normal limits. "Let's proceed with the tests ASAP," she added.
+---
+Dr. Smith, who arrived at 3:00 p.m., said, "The patient's condition is stable; however, further tests (e.g., MRI, CT scan) are required."
+Meanwhile, the nurse—who had been on duty since 7:00 a.m.—noted that the patient's vitals (BP: 120/80, HR: 75) were within normal limits.
+"Let's proceed with the tests ASAP," she added.
+===
+Dr. Smith, who arrived at 3:00 p.m., said, "The patient's condition is stable; however, further tests (e.g., MRI, CT scan) are required.". Meanwhile, the nurse—who had been on duty since 7:00 a.m.—noted that the patient's vitals (BP: 120/80, HR: 75) were within normal limits. "Let's proceed with the tests ASAP," she added.
+---
+Dr. Smith, who arrived at 3:00 p.m., said, "The patient's condition is stable; however, further tests (e.g., MRI, CT scan) are required.".
+Meanwhile, the nurse—who had been on duty since 7:00 a.m.—noted that the patient's vitals (BP: 120/80, HR: 75) were within normal limits.
+"Let's proceed with the tests ASAP," she added.
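
These fixtures follow the repository's test-file convention: input text, a --- separator, then the expected sentences one per line, with === between cases. A hedged sketch of a checker for one such case (hypothetical; the crate's actual test runner is not part of this diff, and trimming is an assumption about how trailing whitespace is compared):

// Hypothetical checker for one ===-delimited case from tests/en.txt.
// Assumes `lang` implements the Language trait shown earlier.
fn check_case(lang: &dyn Language, case: &str) {
    let (input, expected) = case
        .split_once("\n---\n")
        .expect("case must contain a --- separator");
    let got: Vec<&str> = lang.segment(input).iter().map(|s| s.trim()).collect();
    let want: Vec<&str> = expected.lines().map(str::trim).collect();
    assert_eq!(got, want);
}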