sentencex 1.0.2__tar.gz → 1.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentencex might be problematic. Click here for more details.
- {sentencex-1.0.2 → sentencex-1.0.5}/Cargo.lock +4 -4
- {sentencex-1.0.2 → sentencex-1.0.5}/PKG-INFO +1 -1
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/Cargo.toml +1 -1
- sentencex-1.0.5/bindings/python/publish.sh +6 -0
- sentencex-1.0.5/paris.txt +59 -0
- sentencex-1.0.5/patient.txt +4 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/pyproject.toml +1 -1
- {sentencex-1.0.2 → sentencex-1.0.5}/src/constants.rs +1 -1
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/de.rs +0 -4
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/it.rs +0 -4
- sentencex-1.0.5/src/languages/language.rs +415 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/ta.rs +0 -4
- {sentencex-1.0.2 → sentencex-1.0.5}/src/lib.rs +3 -18
- {sentencex-1.0.2 → sentencex-1.0.5}/src/main.rs +1 -1
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/en.txt +19 -1
- sentencex-1.0.2/src/languages/language.rs +0 -289
- {sentencex-1.0.2 → sentencex-1.0.5}/.github/workflows/node.yaml +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/.github/workflows/python.yaml +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/.github/workflows/rust.yml +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/.github/workflows/wasm.yaml +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/.gitignore +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/100-0.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/11-0.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/1661-0.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/LICENSE +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/README.md +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/TODO.md +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/benches/segment_benchmark.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/.gitignore +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/.python-version +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/Cargo.lock +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/README.md +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/example.py +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/src/lib.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/tests/__init__.py +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/tests/test_sentencex.py +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/bindings/python/uv.lock +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/demo/index.html +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/examples/rust_example.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/oxygen.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/am.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/ar.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/bg.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/bn.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/da.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/de.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/el.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/en.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/es.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/fi.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/fr.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/gu.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/hi.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/it.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/kk.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/kn.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/ml.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/nl.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/pa.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/pl.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/pt.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/ru.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/sk.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/ta.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/abbrev/te.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/am.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/ar.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/bg.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/bn.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/ca.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/da.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/el.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/en.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/es.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/fallbacks.yaml +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/fi.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/fr.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/gu.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/hi.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/hy.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/ja.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/kk.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/kn.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/ml.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/mod.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/mr.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/my.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/nl.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/pa.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/pl.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/pt.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/ru.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/sk.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/src/languages/te.rs +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/am.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/ar.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/bg.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/bn.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/ca.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/da.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/de.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/el.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/es.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/fi.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/fr.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/gu.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/hi.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/hy.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/it.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/ja.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/kk.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/kn.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/ml.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/mr.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/my.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/nl.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/pa.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/pl.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/pt.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/ru.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/sk.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/ta.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/te.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/ur.txt +0 -0
- {sentencex-1.0.2 → sentencex-1.0.5}/tests/zh.txt +0 -0
|
@@ -653,7 +653,7 @@ checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73"
|
|
|
653
653
|
|
|
654
654
|
[[package]]
|
|
655
655
|
name = "sentencex"
|
|
656
|
-
version = "0.1.
|
|
656
|
+
version = "0.1.5"
|
|
657
657
|
dependencies = [
|
|
658
658
|
"clap",
|
|
659
659
|
"criterion",
|
|
@@ -666,7 +666,7 @@ dependencies = [
|
|
|
666
666
|
|
|
667
667
|
[[package]]
|
|
668
668
|
name = "sentencex-js"
|
|
669
|
-
version = "1.0.
|
|
669
|
+
version = "1.0.5"
|
|
670
670
|
dependencies = [
|
|
671
671
|
"neon",
|
|
672
672
|
"neon-build",
|
|
@@ -675,7 +675,7 @@ dependencies = [
|
|
|
675
675
|
|
|
676
676
|
[[package]]
|
|
677
677
|
name = "sentencex-py"
|
|
678
|
-
version = "0.1.
|
|
678
|
+
version = "0.1.5"
|
|
679
679
|
dependencies = [
|
|
680
680
|
"pyo3",
|
|
681
681
|
"sentencex",
|
|
@@ -683,7 +683,7 @@ dependencies = [
|
|
|
683
683
|
|
|
684
684
|
[[package]]
|
|
685
685
|
name = "sentencex-wasm"
|
|
686
|
-
version = "0.1.
|
|
686
|
+
version = "0.1.5"
|
|
687
687
|
dependencies = [
|
|
688
688
|
"sentencex",
|
|
689
689
|
"serde",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "sentencex-py"
|
|
3
|
-
version = "0.1.
|
|
3
|
+
version = "0.1.5"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
description = "Sentence segmentation library with wide language support optimized for speed and utility."
|
|
6
6
|
authors = ["Santhosh Thottingal <santhosh.thottingal@gmail.com>"]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
|
|
2
|
+
Paris (
|
|
3
|
+
French pronunciation:
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
[paʁi]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
(
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
listen
|
|
21
|
+
|
|
22
|
+
)) Another sentence
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
ends here.
|
|
28
|
+
[
|
|
29
|
+
'\n' +
|
|
30
|
+
'\tParis (\n' +
|
|
31
|
+
'\tFrench pronunciation:\n' +
|
|
32
|
+
'\t\n' +
|
|
33
|
+
'\t\n' +
|
|
34
|
+
'\t\t[paʁi]\n' +
|
|
35
|
+
'\t\n' +
|
|
36
|
+
'\t\n' +
|
|
37
|
+
'\t\t (\n' +
|
|
38
|
+
'\t\t\n' +
|
|
39
|
+
'\t\t\t\n' +
|
|
40
|
+
'\t\t\t\t\n' +
|
|
41
|
+
'\t\t\t\t\t\n' +
|
|
42
|
+
'\t\t\t\t\t\t\n' +
|
|
43
|
+
'\t\t\t\t\t\t\t\n' +
|
|
44
|
+
'\t\t\t\t\t\t\n' +
|
|
45
|
+
'\t\t\t\t\t\n' +
|
|
46
|
+
'\t\t\t\t\t \n' +
|
|
47
|
+
'\t\t\t\t\n' +
|
|
48
|
+
'\t\t\t\tlisten\n' +
|
|
49
|
+
'\t\t\t\n' +
|
|
50
|
+
'\t\t)) Another sentence\n' +
|
|
51
|
+
'\t\n' +
|
|
52
|
+
'\t\t\n' +
|
|
53
|
+
'\t\t\t\n' +
|
|
54
|
+
'\t\t\n' +
|
|
55
|
+
'\tends here.'
|
|
56
|
+
]
|
|
57
|
+
<p id="mwEA"><span class="cx-segment" data-segmentid="0"><b id="mwEQ">Paris</b> (<small about="#mwt16" data-mw="{"parts":[{"template":{"target":{"wt":"IPA-fr","href":"./Template:IPA-fr"},"params":{"1":{"wt":"paʁi"},"3":{"wt":"Paris1.ogg"}},"i":0}}]}" id="mwEg" typeof="mw:Transclusion">French pronunciation:</small><span about="#mwt16" class="IPA" title="Representation in the International Phonetic Alphabet (IPA)"><a class="cx-link" data-linkid="1" href="./Help:IPA/French" rel="mw:WikiLink" title="Help:IPA/French">[paʁi]</a></span><small about="#mwt16" class="nowrap" id="mwEw"><span typeof="mw:Entity"> </span>(<span class="unicode haudio"><span class="fn"><span style="white-space:nowrap"><span data-mw="{"caption":"About this sound"}" typeof="mw:Image"><a href="./File:Paris1.ogg"><img data-file-height="20" data-file-type="drawing" data-file-width="20" height="11" resource="./File:Loudspeaker.svg" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/11px-Loudspeaker.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/22px-Loudspeaker.svg.png 2x, //upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/17px-Loudspeaker.svg.png 1.5x" width="11"></img></a></span><span typeof="mw:Entity"> </span></span><a href="//upload.wikimedia.org/wikipedia/commons/2/2c/Paris1.ogg" rel="mw:MediaLink" title="Paris1.ogg">listen</a></span></span>)</small>) Another sentence<span data-mw="{"caption":"A different caption"}" typeof="mw:Image"><a href="./File:Paris1232.ogg"><img data-file-height="20" data-file-type="drawing" data-file-width="20" height="11" resource="./File:Loudspeaker.svg" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/11px-Loudspeaker.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/22px-Loudspeaker.svg.png 2x, //upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/17px-Loudspeaker.svg.png 1.5x" width="11"></img></a></span>ends 
here.</span></p>
|
|
58
|
+
==Categories==
|
|
59
|
+
[]
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
Dr. Smith, who arrived at 3:00 p.m., said, "The patient's condition is stable; however, further tests (e.g., MRI, CT scan) are required.". Meanwhile, the nurse—who had been on duty since 7:00 a.m.—noted that the patient's vitals (BP: 120/80, HR: 75) were within normal limits. "Let's proceed with the tests ASAP," she added.
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
Hello!?..!..!
|
|
@@ -26,7 +26,7 @@ pub fn get_quote_pairs() -> HashMap<&'static str, &'static str> {
|
|
|
26
26
|
lazy_static::lazy_static! {
|
|
27
27
|
pub static ref PARENS_REGEX: Regex = Regex::new(r"[\((<{\[](?:[^\)\]}>)]|\\[\)\]}>)])*[\)\]}>)]").unwrap();
|
|
28
28
|
pub static ref EMAIL_REGEX: Regex = Regex::new(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}").unwrap();
|
|
29
|
-
pub static ref NUMBERED_REFERENCE_REGEX: Regex = Regex::new(r"^
|
|
29
|
+
pub static ref NUMBERED_REFERENCE_REGEX: Regex = Regex::new(r"^(\s*\[\d+])+").unwrap();
|
|
30
30
|
pub static ref SPACE_AFTER_SEPARATOR: Regex = Regex::new(r"^\s+").unwrap();
|
|
31
31
|
pub static ref QUOTES_REGEX: Regex = {
|
|
32
32
|
let quote_pairs = get_quote_pairs();
|
|
@@ -17,10 +17,6 @@ impl Language for Italian {
|
|
|
17
17
|
&ITALIAN_ABBREVIATIONS
|
|
18
18
|
}
|
|
19
19
|
|
|
20
|
-
fn is_punctuation_between_quotes(&self) -> bool {
|
|
21
|
-
false
|
|
22
|
-
}
|
|
23
|
-
|
|
24
20
|
fn get_last_word<'a>(&self, text: &'a str) -> &'a str {
|
|
25
21
|
let words: Vec<&str> = text
|
|
26
22
|
.split(|c: char| c.is_whitespace() || c == '.')
|
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
use regex::Regex;
|
|
2
|
+
use std::collections::HashMap;
|
|
3
|
+
use std::sync::Mutex;
|
|
4
|
+
|
|
5
|
+
use crate::SentenceBoundary;
|
|
6
|
+
use crate::constants::EMAIL_REGEX;
|
|
7
|
+
use crate::constants::EXCLAMATION_WORDS;
|
|
8
|
+
use crate::constants::GLOBAL_SENTENCE_TERMINATORS;
|
|
9
|
+
use crate::constants::PARENS_REGEX;
|
|
10
|
+
use crate::constants::QUOTES_REGEX;
|
|
11
|
+
|
|
12
|
+
lazy_static::lazy_static! {
|
|
13
|
+
static ref SENTENCE_BREAK_REGEX_CACHE: Mutex<HashMap<String, Regex>> = Mutex::new(HashMap::new());
|
|
14
|
+
static ref CONTINUE_REGEX: Regex = Regex::new(r"^[0-9a-z]").unwrap();
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
#[derive(Debug, Clone, Copy, PartialEq)]
|
|
18
|
+
pub enum SkippableRangeType {
|
|
19
|
+
Quote,
|
|
20
|
+
Parentheses,
|
|
21
|
+
Email,
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
#[derive(Debug, Clone, Copy)]
|
|
25
|
+
pub struct SkippableRange {
|
|
26
|
+
pub start: usize,
|
|
27
|
+
pub end: usize,
|
|
28
|
+
pub range_type: SkippableRangeType,
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
impl SkippableRange {
|
|
32
|
+
pub fn new(start: usize, end: usize, range_type: SkippableRangeType) -> Self {
|
|
33
|
+
Self {
|
|
34
|
+
start,
|
|
35
|
+
end,
|
|
36
|
+
range_type,
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
pub fn contains(&self, position: usize) -> bool {
|
|
41
|
+
position > self.start && position < self.end
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
pub fn is_quote(&self) -> bool {
|
|
45
|
+
self.range_type == SkippableRangeType::Quote
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
pub trait Language {
|
|
50
|
+
/// Returns a compiled regex pattern that matches sentence terminating punctuation.
|
|
51
|
+
/// This regex is used to identify potential sentence boundaries in text.
|
|
52
|
+
/// The pattern is cached for performance and includes all global sentence terminators
|
|
53
|
+
/// like periods, exclamation marks, and question marks.
|
|
54
|
+
fn get_sentence_break_regex(&self) -> Regex {
|
|
55
|
+
let pattern = format!("[{}]+", GLOBAL_SENTENCE_TERMINATORS.join(""));
|
|
56
|
+
|
|
57
|
+
// Try to get from cache first
|
|
58
|
+
{
|
|
59
|
+
let cache = SENTENCE_BREAK_REGEX_CACHE.lock().unwrap();
|
|
60
|
+
if let Some(regex) = cache.get(&pattern) {
|
|
61
|
+
return regex.clone();
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
// Create new regex and cache it
|
|
66
|
+
let regex = Regex::new(&pattern).unwrap();
|
|
67
|
+
{
|
|
68
|
+
let mut cache = SENTENCE_BREAK_REGEX_CACHE.lock().unwrap();
|
|
69
|
+
cache.insert(pattern, regex.clone());
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
regex
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/// Analyzes the input text and returns a vector of sentence boundaries.
|
|
76
|
+
/// This is the main method for sentence segmentation that:
|
|
77
|
+
/// 1. Splits text into paragraphs at double newlines
|
|
78
|
+
/// 2. Identifies potential sentence breaks using regex patterns
|
|
79
|
+
/// 3. Filters out false positives (abbreviations, quotes, etc.)
|
|
80
|
+
/// 4. Returns structured boundary information including start/end positions and boundary symbols
|
|
81
|
+
/// Each boundary contains the sentence text, position indices, and metadata about the boundary type.
|
|
82
|
+
fn get_sentence_boundaries<'a>(&self, text: &'a str) -> Vec<SentenceBoundary<'a>> {
|
|
83
|
+
// Pre-allocate boundaries with estimated capacity (rough estimate: 1 sentence per 50 characters)
|
|
84
|
+
let estimated_sentences = (text.len() / 50).max(1);
|
|
85
|
+
let mut boundaries = Vec::with_capacity(estimated_sentences);
|
|
86
|
+
|
|
87
|
+
// Split by paragraph breaks (one or more newlines with optional whitespace)
|
|
88
|
+
let para_split_re = Regex::new(r"\n[\r]*\n").unwrap();
|
|
89
|
+
let paragraphs: Vec<&str> = para_split_re.split(text).collect();
|
|
90
|
+
|
|
91
|
+
// Pre-calculate all paragraph offsets in one pass
|
|
92
|
+
let mut paragraph_offsets = Vec::with_capacity(paragraphs.len());
|
|
93
|
+
let mut current_offset = 0;
|
|
94
|
+
for (i, paragraph) in paragraphs.iter().enumerate() {
|
|
95
|
+
paragraph_offsets.push(current_offset);
|
|
96
|
+
current_offset += paragraph.len();
|
|
97
|
+
if i < paragraphs.len() - 1 {
|
|
98
|
+
current_offset += 2; // for "\n\n"
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
// Pre-allocate sentence_boundaries once and reuse for all paragraphs
|
|
103
|
+
let estimated_paragraph_sentences = 10; // reasonable default for typical paragraphs
|
|
104
|
+
let mut sentence_boundaries = Vec::with_capacity(estimated_paragraph_sentences);
|
|
105
|
+
|
|
106
|
+
for (pindex, paragraph) in paragraphs.iter().enumerate() {
|
|
107
|
+
if pindex > 0 {
|
|
108
|
+
let paragraph_start = paragraph_offsets[pindex];
|
|
109
|
+
boundaries.push(SentenceBoundary {
|
|
110
|
+
start_index: paragraph_start,
|
|
111
|
+
end_index: paragraph_start + 2,
|
|
112
|
+
text: "\n\n",
|
|
113
|
+
boundary_symbol: None,
|
|
114
|
+
is_paragraph_break: true,
|
|
115
|
+
});
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
let paragraph_start_offset = if pindex == 0 {
|
|
119
|
+
0
|
|
120
|
+
} else {
|
|
121
|
+
paragraph_offsets[pindex] + 2
|
|
122
|
+
};
|
|
123
|
+
sentence_boundaries.clear();
|
|
124
|
+
sentence_boundaries.push(0);
|
|
125
|
+
|
|
126
|
+
let matches: Vec<(usize, usize)> = self
|
|
127
|
+
.get_sentence_break_regex()
|
|
128
|
+
.find_iter(paragraph)
|
|
129
|
+
.map(|m| (m.start(), m.end()))
|
|
130
|
+
.collect();
|
|
131
|
+
let skippable_ranges = self.get_skippable_ranges(paragraph);
|
|
132
|
+
|
|
133
|
+
for (start, end) in matches {
|
|
134
|
+
let mut boundary = self
|
|
135
|
+
.find_boundary(paragraph, start, end)
|
|
136
|
+
.unwrap_or(usize::MAX);
|
|
137
|
+
|
|
138
|
+
if boundary == usize::MAX {
|
|
139
|
+
continue;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
let mut in_range = false;
|
|
143
|
+
|
|
144
|
+
for range in &skippable_ranges {
|
|
145
|
+
if range.contains(boundary) {
|
|
146
|
+
let next_word = self.get_next_word_approx(text, range.end);
|
|
147
|
+
let boundary_extend = self.get_boundary_extend(next_word);
|
|
148
|
+
if range.is_quote() && boundary + 1 == range.end && boundary_extend >= 0 {
|
|
149
|
+
boundary = range.end + boundary_extend as usize;
|
|
150
|
+
in_range = false;
|
|
151
|
+
} else {
|
|
152
|
+
in_range = true;
|
|
153
|
+
}
|
|
154
|
+
break;
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
if in_range {
|
|
159
|
+
continue;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
sentence_boundaries.push(boundary);
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
if *sentence_boundaries.last().unwrap() != paragraph.len() {
|
|
166
|
+
sentence_boundaries.push(paragraph.len());
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
for i in 0..sentence_boundaries.len() - 1 {
|
|
170
|
+
let start = sentence_boundaries[i];
|
|
171
|
+
let end = sentence_boundaries[i + 1];
|
|
172
|
+
|
|
173
|
+
if start >= paragraph.len() || end > paragraph.len() || start > end {
|
|
174
|
+
continue;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
let sentence_text = &paragraph[start..end];
|
|
178
|
+
let boundary_symbol = if end > 0 && end <= paragraph.len() {
|
|
179
|
+
// Use char_indices for more efficient character iteration
|
|
180
|
+
paragraph[..end]
|
|
181
|
+
.char_indices()
|
|
182
|
+
.next_back()
|
|
183
|
+
.and_then(|(idx, _)| {
|
|
184
|
+
let char_str = &paragraph[idx..end];
|
|
185
|
+
if GLOBAL_SENTENCE_TERMINATORS.contains(&char_str) {
|
|
186
|
+
Some(char_str.to_string())
|
|
187
|
+
} else {
|
|
188
|
+
None
|
|
189
|
+
}
|
|
190
|
+
})
|
|
191
|
+
} else {
|
|
192
|
+
None
|
|
193
|
+
};
|
|
194
|
+
|
|
195
|
+
boundaries.push(SentenceBoundary {
|
|
196
|
+
start_index: paragraph_start_offset + start,
|
|
197
|
+
end_index: paragraph_start_offset + end,
|
|
198
|
+
text: sentence_text,
|
|
199
|
+
boundary_symbol,
|
|
200
|
+
is_paragraph_break: false,
|
|
201
|
+
});
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
boundaries
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
/// Segments the input text into individual sentences and returns them as string slices.
|
|
209
|
+
/// This is a convenience method that builds on get_sentence_boundaries() but returns
|
|
210
|
+
/// only the sentence text content without the additional boundary metadata.
|
|
211
|
+
/// Used when you only need the segmented sentences and not their position information.
|
|
212
|
+
fn segment<'a>(&self, text: &'a str) -> Vec<&'a str> {
|
|
213
|
+
// Pre-allocate with estimated capacity based on text length
|
|
214
|
+
let estimated_sentences = (text.len() / 50).max(1);
|
|
215
|
+
let mut sentences = Vec::with_capacity(estimated_sentences);
|
|
216
|
+
|
|
217
|
+
let boundaries = self.get_sentence_boundaries(text);
|
|
218
|
+
for boundary in boundaries {
|
|
219
|
+
if !boundary.text.is_empty() {
|
|
220
|
+
sentences.push(boundary.text);
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
sentences
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
/// Returns the character used to mark abbreviations in this language.
|
|
228
|
+
/// By default returns "." (period), but should be overridden by specific languages
|
|
229
|
+
/// that use different abbreviation markers. Used by the abbreviation detection logic
|
|
230
|
+
/// to determine if a potential sentence boundary is actually an abbreviation.
|
|
231
|
+
fn get_abbreviation_char(&self) -> &str {
|
|
232
|
+
"."
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
/// Returns a list of known abbreviations for this language.
|
|
236
|
+
/// These are used to prevent false sentence breaks at abbreviation periods.
|
|
237
|
+
/// For example, "Dr." or "etc." should not trigger a sentence boundary.
|
|
238
|
+
/// Languages should override this to provide their specific abbreviation lists.
|
|
239
|
+
/// Returns an empty slice by default.
|
|
240
|
+
fn get_abbreviations(&self) -> &[String] {
|
|
241
|
+
&[]
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/// Determines how many characters to extend a boundary when continuing into the next word.
|
|
245
|
+
/// Returns -1 if the word indicates the boundary should not be created (continuation case).
|
|
246
|
+
/// Returns 0 or positive number indicating how many whitespace/punctuation characters
|
|
247
|
+
/// to skip when positioning the boundary. Used to handle cases like quoted sentences
|
|
248
|
+
/// where the boundary should include trailing punctuation and whitespace.
|
|
249
|
+
fn get_boundary_extend(&self, word: &str) -> i8 {
|
|
250
|
+
if self.continue_in_next_word(word.trim()) {
|
|
251
|
+
// not a boundary.
|
|
252
|
+
return -1;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
let mut count = 0i8;
|
|
256
|
+
for ch in word.chars() {
|
|
257
|
+
if ch.is_whitespace() || GLOBAL_SENTENCE_TERMINATORS.contains(&ch.to_string().as_str())
|
|
258
|
+
{
|
|
259
|
+
count += 1;
|
|
260
|
+
if count == i8::MAX {
|
|
261
|
+
break; // Prevent overflow
|
|
262
|
+
}
|
|
263
|
+
} else {
|
|
264
|
+
break;
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
word.ceil_char_boundary(count as usize) as i8
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
/// Checks if a potential sentence boundary is actually part of an abbreviation.
|
|
272
|
+
/// Examines the text before the separator to see if it ends with a known abbreviation.
|
|
273
|
+
/// Returns true if this appears to be an abbreviation (and thus not a sentence boundary),
|
|
274
|
+
/// false if it's likely a genuine sentence end. Used to prevent breaking sentences
|
|
275
|
+
/// at abbreviations like "Dr. Smith" or "etc."
|
|
276
|
+
fn is_abbreviation(&self, head: &str, _tail: &str, separator: &str) -> bool {
|
|
277
|
+
if self.get_abbreviation_char() != separator {
|
|
278
|
+
return false;
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
let last_word = self.get_last_word(head);
|
|
282
|
+
|
|
283
|
+
if last_word.is_empty() {
|
|
284
|
+
return false;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
let abbreviations = self.get_abbreviations();
|
|
288
|
+
let is_abbrev = abbreviations.contains(&last_word.to_string());
|
|
289
|
+
let is_abbrev_lower = abbreviations.contains(&last_word.to_lowercase());
|
|
290
|
+
let is_abbrev_upper = abbreviations.contains(&last_word.to_uppercase());
|
|
291
|
+
|
|
292
|
+
is_abbrev || is_abbrev_lower || is_abbrev_upper
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
/// Extracts the last word from the given text by splitting on whitespace and periods.
|
|
296
|
+
/// Used primarily by abbreviation detection to check if the word before a potential
|
|
297
|
+
/// sentence boundary is a known abbreviation. Returns an empty string if no words
|
|
298
|
+
/// are found. This is a performance-optimized version that avoids collecting all words.
|
|
299
|
+
fn get_last_word<'a>(&self, text: &'a str) -> &'a str {
|
|
300
|
+
// Find the last word without collecting all words
|
|
301
|
+
text.split(|c: char| c.is_whitespace() || c == '.')
|
|
302
|
+
.last()
|
|
303
|
+
.unwrap_or("")
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
/// Checks if a potential sentence boundary is actually an exclamation word that shouldn't
|
|
307
|
+
/// trigger a sentence break. Examines the last word before the boundary and checks if
|
|
308
|
+
/// it's in the list of known exclamation words (like "Hey!" or "Wow!").
|
|
309
|
+
/// Returns true if this is an exclamation that should not break the sentence.
|
|
310
|
+
fn is_exclamation(&self, head: &str, _tail: &str) -> bool {
|
|
311
|
+
let last_word = self.get_last_word(head);
|
|
312
|
+
let exclamation_word = format!("{}!", last_word);
|
|
313
|
+
EXCLAMATION_WORDS.contains(&exclamation_word.as_str())
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/// Returns an approximate substring of the next word(s) starting from the given position.
|
|
317
|
+
/// Limited to a maximum of 30 characters for performance. Used to analyze context
|
|
318
|
+
/// after a potential sentence boundary to determine if the boundary should be created.
|
|
319
|
+
/// Handles UTF-8 character boundaries safely to avoid panics on non-ASCII text.
|
|
320
|
+
fn get_next_word_approx<'a>(&self, text: &'a str, start: usize) -> &'a str {
|
|
321
|
+
if start >= text.len() {
|
|
322
|
+
return "";
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
let max_chars = 30;
|
|
326
|
+
let safe_start = text.floor_char_boundary(start);
|
|
327
|
+
let end_pos = (start + max_chars).min(text.len());
|
|
328
|
+
&text[safe_start..text.floor_char_boundary(end_pos)]
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
/// Analyzes a potential sentence boundary and determines the exact position where
|
|
332
|
+
/// the sentence should end, or returns None if this shouldn't be a boundary.
|
|
333
|
+
/// Considers abbreviations, exclamations, numbered references, and continuation patterns.
|
|
334
|
+
/// This is the core logic that distinguishes true sentence boundaries from false positives
|
|
335
|
+
/// like abbreviations or mid-sentence punctuation.
|
|
336
|
+
fn find_boundary(&self, text: &str, start: usize, end: usize) -> Option<usize> {
|
|
337
|
+
let head = &text[..start];
|
|
338
|
+
let next_index = text.ceil_char_boundary(start + 1);
|
|
339
|
+
|
|
340
|
+
let next_word_approx = self.get_next_word_approx(text, next_index);
|
|
341
|
+
|
|
342
|
+
if let Some(number_ref_match) =
|
|
343
|
+
crate::constants::NUMBERED_REFERENCE_REGEX.find(next_word_approx)
|
|
344
|
+
{
|
|
345
|
+
return Some(next_index + number_ref_match.end());
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
if self.continue_in_next_word(next_word_approx) {
|
|
349
|
+
return None;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
if self.is_abbreviation(head, next_word_approx, &text[start..end]) {
|
|
353
|
+
return None;
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
if self.is_exclamation(head, next_word_approx) {
|
|
357
|
+
return None;
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
if let Some(space_after_sep_match) =
|
|
361
|
+
crate::constants::SPACE_AFTER_SEPARATOR.find(next_word_approx)
|
|
362
|
+
{
|
|
363
|
+
return Some(next_index + space_after_sep_match.end());
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
Some(end)
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
/// Determines if the text after a potential boundary indicates the sentence should continue.
|
|
370
|
+
/// Returns true if the next word starts with a lowercase letter or number, suggesting
|
|
371
|
+
/// the sentence is continuing rather than starting a new one. This helps avoid breaking
|
|
372
|
+
/// sentences at abbreviations or in the middle of compound sentences.
|
|
373
|
+
fn continue_in_next_word(&self, text_after_boundary: &str) -> bool {
|
|
374
|
+
CONTINUE_REGEX.is_match(text_after_boundary)
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/// Identifies ranges of text that should be skipped during sentence boundary detection.
|
|
378
|
+
/// This includes quoted text, parenthetical expressions, and email addresses where
|
|
379
|
+
/// internal punctuation should not trigger sentence breaks. Returns a sorted vector
|
|
380
|
+
/// of ranges that can be efficiently checked during boundary detection to avoid
|
|
381
|
+
/// false positives within these special text regions.
|
|
382
|
+
fn get_skippable_ranges(&self, text: &str) -> Vec<SkippableRange> {
|
|
383
|
+
// Pre-allocate with estimated capacity based on text length (rough estimate: 1 range per 200 characters)
|
|
384
|
+
let estimated_ranges = (text.len() / 200).max(1);
|
|
385
|
+
let mut skippable_ranges = Vec::with_capacity(estimated_ranges);
|
|
386
|
+
|
|
387
|
+
for mat in QUOTES_REGEX.find_iter(text) {
|
|
388
|
+
skippable_ranges.push(SkippableRange::new(
|
|
389
|
+
mat.start(),
|
|
390
|
+
mat.end(),
|
|
391
|
+
SkippableRangeType::Quote,
|
|
392
|
+
));
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
for mat in PARENS_REGEX.find_iter(text) {
|
|
396
|
+
skippable_ranges.push(SkippableRange::new(
|
|
397
|
+
mat.start(),
|
|
398
|
+
mat.end(),
|
|
399
|
+
SkippableRangeType::Parentheses,
|
|
400
|
+
));
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
for mat in EMAIL_REGEX.find_iter(text) {
|
|
404
|
+
skippable_ranges.push(SkippableRange::new(
|
|
405
|
+
mat.start(),
|
|
406
|
+
mat.end(),
|
|
407
|
+
SkippableRangeType::Email,
|
|
408
|
+
));
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
// Sort ranges by start position for more efficient lookups
|
|
412
|
+
skippable_ranges.sort_unstable_by_key(|r| r.start);
|
|
413
|
+
skippable_ranges
|
|
414
|
+
}
|
|
415
|
+
}
|
|
@@ -84,21 +84,6 @@ pub fn language_factory(language_code: &str) -> Box<dyn Language> {
|
|
|
84
84
|
}
|
|
85
85
|
}
|
|
86
86
|
|
|
87
|
-
/// Find the nearest valid UTF-8 character boundary at or before the given byte index
|
|
88
|
-
fn find_char_boundary(text: &str, mut byte_index: usize) -> usize {
|
|
89
|
-
// If we're already at or past the end, return text length
|
|
90
|
-
if byte_index >= text.len() {
|
|
91
|
-
return text.len();
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
// Walk forwards until we find a valid character boundary
|
|
95
|
-
while byte_index < text.len() && !text.is_char_boundary(byte_index) {
|
|
96
|
-
byte_index += 1;
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
byte_index
|
|
100
|
-
}
|
|
101
|
-
|
|
102
87
|
fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
|
|
103
88
|
if chunk_size == 0 || text.len() <= chunk_size {
|
|
104
89
|
return vec![text];
|
|
@@ -107,7 +92,7 @@ fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
|
|
|
107
92
|
let mut chunks = Vec::new();
|
|
108
93
|
|
|
109
94
|
// Split by paragraph breaks (one or more newlines with optional whitespace)
|
|
110
|
-
let re = Regex::new(r"\n[\r
|
|
95
|
+
let re = Regex::new(r"\n[\r]*\n").unwrap();
|
|
111
96
|
|
|
112
97
|
// Get paragraph parts and their positions
|
|
113
98
|
let mut paragraphs = Vec::new();
|
|
@@ -150,7 +135,7 @@ fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
|
|
|
150
135
|
|
|
151
136
|
if potential_size > chunk_size {
|
|
152
137
|
// Finalize current chunk
|
|
153
|
-
let safe_end =
|
|
138
|
+
let safe_end = text.ceil_char_boundary(current_end);
|
|
154
139
|
chunks.push(&text[current_start..safe_end]);
|
|
155
140
|
|
|
156
141
|
// Start new chunk with current paragraph
|
|
@@ -166,7 +151,7 @@ fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
|
|
|
166
151
|
|
|
167
152
|
// Add the final chunk if there's remaining content
|
|
168
153
|
if current_start < text.len() {
|
|
169
|
-
let safe_end =
|
|
154
|
+
let safe_end = text.ceil_char_boundary(current_end);
|
|
170
155
|
chunks.push(&text[current_start..safe_end]);
|
|
171
156
|
}
|
|
172
157
|
|
|
@@ -59,7 +59,7 @@ fn main() {
|
|
|
59
59
|
let sentences = segment(&cli.language, &text);
|
|
60
60
|
let elapsed = start_time.elapsed();
|
|
61
61
|
for sentence in sentences.iter() {
|
|
62
|
-
println!("{}", sentence);
|
|
62
|
+
println!("* {}", sentence);
|
|
63
63
|
}
|
|
64
64
|
|
|
65
65
|
eprintln!("Time taken for segment(): {:?}", elapsed);
|