pysbd-plus 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. pysbd_plus-0.3.5/LICENSE +21 -0
  2. pysbd_plus-0.3.5/MANIFEST.in +1 -0
  3. pysbd_plus-0.3.5/PKG-INFO +139 -0
  4. pysbd_plus-0.3.5/README.md +98 -0
  5. pysbd_plus-0.3.5/benchmarks/__init__.py +0 -0
  6. pysbd_plus-0.3.5/benchmarks/benchmark_sbd_tools.py +84 -0
  7. pysbd_plus-0.3.5/benchmarks/bigtext_speed_benchmark.py +75 -0
  8. pysbd_plus-0.3.5/benchmarks/english_golden_rules.py +210 -0
  9. pysbd_plus-0.3.5/benchmarks/genia_benchmark.py +100 -0
  10. pysbd_plus-0.3.5/benchmarks/indonesian_golden_rules.py +247 -0
  11. pysbd_plus-0.3.5/benchmarks/latency_en_vs_id.py +70 -0
  12. pysbd_plus-0.3.5/pysbd/__init__.py +2 -0
  13. pysbd_plus-0.3.5/pysbd/abbreviation_replacer.py +112 -0
  14. pysbd_plus-0.3.5/pysbd/about.py +10 -0
  15. pysbd_plus-0.3.5/pysbd/between_punctuation.py +94 -0
  16. pysbd_plus-0.3.5/pysbd/clean/__init__.py +0 -0
  17. pysbd_plus-0.3.5/pysbd/clean/rules.py +80 -0
  18. pysbd_plus-0.3.5/pysbd/cleaner.py +111 -0
  19. pysbd_plus-0.3.5/pysbd/exclamation_words.py +17 -0
  20. pysbd_plus-0.3.5/pysbd/lang/__init__.py +0 -0
  21. pysbd_plus-0.3.5/pysbd/lang/amharic.py +13 -0
  22. pysbd_plus-0.3.5/pysbd/lang/arabic.py +35 -0
  23. pysbd_plus-0.3.5/pysbd/lang/armenian.py +13 -0
  24. pysbd_plus-0.3.5/pysbd/lang/bulgarian.py +24 -0
  25. pysbd_plus-0.3.5/pysbd/lang/burmese.py +13 -0
  26. pysbd_plus-0.3.5/pysbd/lang/chinese.py +36 -0
  27. pysbd_plus-0.3.5/pysbd/lang/common/__init__.py +2 -0
  28. pysbd_plus-0.3.5/pysbd/lang/common/common.py +91 -0
  29. pysbd_plus-0.3.5/pysbd/lang/common/standard.py +113 -0
  30. pysbd_plus-0.3.5/pysbd/lang/danish.py +40 -0
  31. pysbd_plus-0.3.5/pysbd/lang/deutsch.py +97 -0
  32. pysbd_plus-0.3.5/pysbd/lang/dutch.py +12 -0
  33. pysbd_plus-0.3.5/pysbd/lang/english.py +11 -0
  34. pysbd_plus-0.3.5/pysbd/lang/french.py +15 -0
  35. pysbd_plus-0.3.5/pysbd/lang/greek.py +13 -0
  36. pysbd_plus-0.3.5/pysbd/lang/hindi.py +13 -0
  37. pysbd_plus-0.3.5/pysbd/lang/indonesian.py +60 -0
  38. pysbd_plus-0.3.5/pysbd/lang/italian.py +15 -0
  39. pysbd_plus-0.3.5/pysbd/lang/japanese.py +51 -0
  40. pysbd_plus-0.3.5/pysbd/lang/kazakh.py +50 -0
  41. pysbd_plus-0.3.5/pysbd/lang/marathi.py +14 -0
  42. pysbd_plus-0.3.5/pysbd/lang/persian.py +30 -0
  43. pysbd_plus-0.3.5/pysbd/lang/polish.py +15 -0
  44. pysbd_plus-0.3.5/pysbd/lang/russian.py +27 -0
  45. pysbd_plus-0.3.5/pysbd/lang/slovak.py +111 -0
  46. pysbd_plus-0.3.5/pysbd/lang/spanish.py +15 -0
  47. pysbd_plus-0.3.5/pysbd/lang/urdu.py +13 -0
  48. pysbd_plus-0.3.5/pysbd/languages.py +66 -0
  49. pysbd_plus-0.3.5/pysbd/lists_item_replacer.py +240 -0
  50. pysbd_plus-0.3.5/pysbd/processor.py +204 -0
  51. pysbd_plus-0.3.5/pysbd/punctuation_replacer.py +44 -0
  52. pysbd_plus-0.3.5/pysbd/segmenter.py +96 -0
  53. pysbd_plus-0.3.5/pysbd/utils.py +81 -0
  54. pysbd_plus-0.3.5/pysbd_plus.egg-info/PKG-INFO +139 -0
  55. pysbd_plus-0.3.5/pysbd_plus.egg-info/SOURCES.txt +61 -0
  56. pysbd_plus-0.3.5/pysbd_plus.egg-info/dependency_links.txt +1 -0
  57. pysbd_plus-0.3.5/pysbd_plus.egg-info/entry_points.txt +2 -0
  58. pysbd_plus-0.3.5/pysbd_plus.egg-info/top_level.txt +2 -0
  59. pysbd_plus-0.3.5/setup.cfg +4 -0
  60. pysbd_plus-0.3.5/setup.py +110 -0
  61. pysbd_plus-0.3.5/tests/test_cleaner.py +27 -0
  62. pysbd_plus-0.3.5/tests/test_languages.py +17 -0
  63. pysbd_plus-0.3.5/tests/test_segmenter.py +116 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2019 Nipun Sadvilkar
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ include README.md LICENSE
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: pysbd-plus
3
+ Version: 0.3.5
4
+ Summary: pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages.
5
+ Home-page: http://nipunsadvilkar.github.io/
6
+ Author: Nipun Sadvilkar
7
+ Author-email: nipunsadvilkar@gmail.com
8
+ License: MIT
9
+ Keywords: natural-language-processing nlp
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
+ Classifier: Topic :: Text Processing
16
+ Classifier: Topic :: Text Processing :: Linguistic
17
+ Classifier: Topic :: Software Development
18
+ Classifier: Topic :: Software Development :: Libraries
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: License :: OSI Approved :: MIT License
26
+ Requires-Python: >=3.9
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Dynamic: author
30
+ Dynamic: author-email
31
+ Dynamic: classifier
32
+ Dynamic: description
33
+ Dynamic: description-content-type
34
+ Dynamic: home-page
35
+ Dynamic: keywords
36
+ Dynamic: license
37
+ Dynamic: license-file
38
+ Dynamic: requires-python
39
+ Dynamic: summary
40
+
41
+
42
+ ![PySBD logo](artifacts/pysbd_logo.png?raw=true "pysbd logo")
43
+ # pySBD: Python Sentence Boundary Disambiguation (SBD)
44
+
45
+ ![Python package](https://github.com/nipunsadvilkar/pySBD/workflows/Python%20package/badge.svg) [![codecov](https://codecov.io/gh/nipunsadvilkar/pySBD/branch/master/graph/badge.svg)](https://codecov.io/gh/nipunsadvilkar/pySBD) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/nipunsadvilkar/pySBD/blob/master/LICENSE) [![PyPi](https://img.shields.io/pypi/v/pysbd?color=blue&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/pysbd) [![GitHub](https://img.shields.io/github/v/release/nipunsadvilkar/pySBD.svg?include_prereleases&logo=github&style=flat)](https://github.com/nipunsadvilkar/pySBD)
46
+
47
+ pySBD - python Sentence Boundary Disambiguation (SBD) - is a rule-based sentence boundary detection module that works out-of-the-box.
48
+
49
+ This project is a direct port of the Ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter), which provides rule-based sentence boundary detection.
50
+
51
+ ![pysbd_code](artifacts/pysbd_code.png?raw=true "pysbd_code")
52
+
53
+ ## Highlights
54
+ **'PySBD: Pragmatic Sentence Boundary Disambiguation'**, a short research paper, was accepted into the 2nd Workshop for Natural Language Processing Open Source Software (NLP-OSS) at EMNLP 2020. </br>
55
+
56
+ **Research Paper:**</br>
57
+
58
+ https://arxiv.org/abs/2010.09657</br>
59
+
60
+ **[Recorded Talk:](https://slideslive.com/38939754)**</br>
61
+
62
+ [![pysbd_talk](artifacts/pysbd_talk.png)](https://slideslive.com/38939754)</br>
63
+
64
+ **Poster:**</br>
65
+
66
+ [![name](artifacts/pysbd_poster.png)](artifacts/pysbd_poster.png)
67
+
68
+ ## Install
69
+
70
+ **Python**
71
+
72
+ pip install pysbd-plus
73
+
74
+ ## Usage
75
+
76
+ - Currently pySBD supports 22 languages.
77
+
78
+ ```python
79
+ import pysbd
80
+ text = "My name is Jonas E. Smith. Please turn to p. 55."
81
+ seg = pysbd.Segmenter(language="en", clean=False)
82
+ print(seg.segment(text))
83
+ # ['My name is Jonas E. Smith.', 'Please turn to p. 55.']
84
+ ```
85
+
86
+ - Use `pysbd` as a [spaCy](https://spacy.io/usage/processing-pipelines) pipeline component. (recommended)</br>Please refer to example [pysbd\_as\_spacy\_component.py](https://github.com/nipunsadvilkar/pySBD/blob/master/examples/pysbd_as_spacy_component.py)
87
+ - Use pysbd through [entrypoints](https://spacy.io/usage/saving-loading#entry-points-components)
88
+
89
+ ```python
90
+ import spacy
91
+ from pysbd.utils import PySBDFactory
92
+
93
+ nlp = spacy.blank('en')
94
+
95
+ # explicitly adding component to pipeline
96
+ # (recommended - makes it more readable to tell what's going on)
97
+ nlp.add_pipe(PySBDFactory(nlp))
98
+
99
+ # or you can use it implicitly with keyword
100
+ # pysbd = nlp.create_pipe('pysbd')
101
+ # nlp.add_pipe(pysbd)
102
+
103
+ doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')
104
+ print(list(doc.sents))
105
+ # [My name is Jonas E. Smith., Please turn to p. 55.]
106
+
107
+ ```
108
+
109
+ ## Contributing
110
+
111
+ If you want to contribute a new feature or language support, or have found text that pySBD segments incorrectly, please head to [CONTRIBUTING.md](https://github.com/nipunsadvilkar/pySBD/blob/master/CONTRIBUTING.md) to learn more, and follow these steps.
112
+
113
+ 1. Fork it ( https://github.com/nipunsadvilkar/pySBD/fork )
114
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
115
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
116
+ 4. Push to the branch (`git push origin my-new-feature`)
117
+ 5. Create a new Pull Request
118
+
119
+ ## Citation
120
+ If you use the `pysbd` package in your projects or research, please cite [PySBD: Pragmatic Sentence Boundary Disambiguation](https://www.aclweb.org/anthology/2020.nlposs-1.15).
121
+ ```
122
+ @inproceedings{sadvilkar-neumann-2020-pysbd,
123
+ title = "{P}y{SBD}: Pragmatic Sentence Boundary Disambiguation",
124
+ author = "Sadvilkar, Nipun and
125
+ Neumann, Mark",
126
+ booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)",
127
+ month = nov,
128
+ year = "2020",
129
+ address = "Online",
130
+ publisher = "Association for Computational Linguistics",
131
+ url = "https://www.aclweb.org/anthology/2020.nlposs-1.15",
132
+ pages = "110--114",
133
+ abstract = "We present a rule-based sentence boundary disambiguation Python package that works out-of-the-box for 22 languages. We aim to provide a realistic segmenter which can provide logical sentences even when the format and domain of the input text is unknown. In our work, we adapt the Golden Rules Set (a language specific set of sentence boundary exemplars) originally implemented as a ruby gem pragmatic segmenter which we ported to Python with additional improvements and functionality. PySBD passes 97.92{\%} of the Golden Rule Set examplars for English, an improvement of 25{\%} over the next best open source Python tool.",
134
+ }
135
+ ```
136
+
137
+ ## Credit
138
+
139
+ This project wouldn't be possible without the great work done by the [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter) team.
@@ -0,0 +1,98 @@
1
+ ![PySBD logo](artifacts/pysbd_logo.png?raw=true "pysbd logo")
2
+ # pySBD: Python Sentence Boundary Disambiguation (SBD)
3
+
4
+ ![Python package](https://github.com/nipunsadvilkar/pySBD/workflows/Python%20package/badge.svg) [![codecov](https://codecov.io/gh/nipunsadvilkar/pySBD/branch/master/graph/badge.svg)](https://codecov.io/gh/nipunsadvilkar/pySBD) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/nipunsadvilkar/pySBD/blob/master/LICENSE) [![PyPi](https://img.shields.io/pypi/v/pysbd?color=blue&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/pysbd) [![GitHub](https://img.shields.io/github/v/release/nipunsadvilkar/pySBD.svg?include_prereleases&logo=github&style=flat)](https://github.com/nipunsadvilkar/pySBD)
5
+
6
+ pySBD - python Sentence Boundary Disambiguation (SBD) - is a rule-based sentence boundary detection module that works out-of-the-box.
7
+
8
+ This project is a direct port of the Ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter), which provides rule-based sentence boundary detection.
9
+
10
+ ![pysbd_code](artifacts/pysbd_code.png?raw=true "pysbd_code")
11
+
12
+ ## Highlights
13
+ **'PySBD: Pragmatic Sentence Boundary Disambiguation'**, a short research paper, was accepted into the 2nd Workshop for Natural Language Processing Open Source Software (NLP-OSS) at EMNLP 2020. </br>
14
+
15
+ **Research Paper:**</br>
16
+
17
+ https://arxiv.org/abs/2010.09657</br>
18
+
19
+ **[Recorded Talk:](https://slideslive.com/38939754)**</br>
20
+
21
+ [![pysbd_talk](artifacts/pysbd_talk.png)](https://slideslive.com/38939754)</br>
22
+
23
+ **Poster:**</br>
24
+
25
+ [![name](artifacts/pysbd_poster.png)](artifacts/pysbd_poster.png)
26
+
27
+ ## Install
28
+
29
+ **Python**
30
+
31
+ pip install pysbd-plus
32
+
33
+ ## Usage
34
+
35
+ - Currently pySBD supports 22 languages.
36
+
37
+ ```python
38
+ import pysbd
39
+ text = "My name is Jonas E. Smith. Please turn to p. 55."
40
+ seg = pysbd.Segmenter(language="en", clean=False)
41
+ print(seg.segment(text))
42
+ # ['My name is Jonas E. Smith.', 'Please turn to p. 55.']
43
+ ```
44
+
45
+ - Use `pysbd` as a [spaCy](https://spacy.io/usage/processing-pipelines) pipeline component. (recommended)</br>Please refer to example [pysbd\_as\_spacy\_component.py](https://github.com/nipunsadvilkar/pySBD/blob/master/examples/pysbd_as_spacy_component.py)
46
+ - Use pysbd through [entrypoints](https://spacy.io/usage/saving-loading#entry-points-components)
47
+
48
+ ```python
49
+ import spacy
50
+ from pysbd.utils import PySBDFactory
51
+
52
+ nlp = spacy.blank('en')
53
+
54
+ # explicitly adding component to pipeline
55
+ # (recommended - makes it more readable to tell what's going on)
56
+ nlp.add_pipe(PySBDFactory(nlp))
57
+
58
+ # or you can use it implicitly with keyword
59
+ # pysbd = nlp.create_pipe('pysbd')
60
+ # nlp.add_pipe(pysbd)
61
+
62
+ doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')
63
+ print(list(doc.sents))
64
+ # [My name is Jonas E. Smith., Please turn to p. 55.]
65
+
66
+ ```
67
+
68
+ ## Contributing
69
+
70
+ If you want to contribute a new feature or language support, or have found text that pySBD segments incorrectly, please head to [CONTRIBUTING.md](https://github.com/nipunsadvilkar/pySBD/blob/master/CONTRIBUTING.md) to learn more, and follow these steps.
71
+
72
+ 1. Fork it ( https://github.com/nipunsadvilkar/pySBD/fork )
73
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
74
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
75
+ 4. Push to the branch (`git push origin my-new-feature`)
76
+ 5. Create a new Pull Request
77
+
78
+ ## Citation
79
+ If you use the `pysbd` package in your projects or research, please cite [PySBD: Pragmatic Sentence Boundary Disambiguation](https://www.aclweb.org/anthology/2020.nlposs-1.15).
80
+ ```
81
+ @inproceedings{sadvilkar-neumann-2020-pysbd,
82
+ title = "{P}y{SBD}: Pragmatic Sentence Boundary Disambiguation",
83
+ author = "Sadvilkar, Nipun and
84
+ Neumann, Mark",
85
+ booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)",
86
+ month = nov,
87
+ year = "2020",
88
+ address = "Online",
89
+ publisher = "Association for Computational Linguistics",
90
+ url = "https://www.aclweb.org/anthology/2020.nlposs-1.15",
91
+ pages = "110--114",
92
+ abstract = "We present a rule-based sentence boundary disambiguation Python package that works out-of-the-box for 22 languages. We aim to provide a realistic segmenter which can provide logical sentences even when the format and domain of the input text is unknown. In our work, we adapt the Golden Rules Set (a language specific set of sentence boundary exemplars) originally implemented as a ruby gem pragmatic segmenter which we ported to Python with additional improvements and functionality. PySBD passes 97.92{\%} of the Golden Rule Set examplars for English, an improvement of 25{\%} over the next best open source Python tool.",
93
+ }
94
+ ```
95
+
96
+ ## Credit
97
+
98
+ This project wouldn't be possible without the great work done by the [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter) team.
File without changes
@@ -0,0 +1,84 @@
1
+ import blingfire
2
+ import nltk
3
+ import pysbd
4
+ import spacy
5
+ import stanza
6
+
7
+ from syntok.tokenizer import Tokenizer
8
+ import syntok.segmenter as syntok_segmenter
9
+
10
+ from english_golden_rules import GOLDEN_EN_RULES
11
+
12
# pySBD segmenter under test (English, clean=False, char_span=False).
pysbd_segmenter = pysbd.Segmenter(
    language="en",
    clean=False,
    char_span=False,
)

# Blank English spaCy pipeline whose only added component is "sentencizer".
nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))

# spaCy "en_core_web_sm" pipeline loaded with the "ner" component disabled.
nlp_dep = spacy.load("en_core_web_sm", disable=["ner"])

# Stanza English pipeline limited to the "tokenize" processor.
# NOTE(review): the call below presumably fetches the English resources on
# first use; it is kept commented out, as in the original script.
# stanza.download('en')
stanza_nlp = stanza.Pipeline(lang="en", processors="tokenize")

# Shared syntok tokenizer, reused by syntok_tokenize().
syntok_tokenizer = Tokenizer()
21
+
22
def blingfire_tokenize(text):
    """Segment ``text`` with BlingFire.

    Calls ``blingfire.text_to_sentences`` and splits its result on newline
    characters, returning the resulting list of strings.
    """
    joined_sentences = blingfire.text_to_sentences(text)
    return joined_sentences.split('\n')
24
+
25
def nltk_tokenize(text):
    """Segment ``text`` with NLTK and return the result of ``nltk.sent_tokenize``."""
    sentences = nltk.sent_tokenize(text)
    return sentences
27
+
28
def pysbd_tokenize(text):
    """Segment ``text`` with the module-level ``pysbd_segmenter``.

    Each segment is stripped of surrounding whitespace before being returned
    in a list.
    """
    return [sentence.strip() for sentence in pysbd_segmenter.segment(text)]
31
+
32
def spacy_tokenize(text):
    """Segment ``text`` with the module-level ``nlp`` pipeline.

    Returns a list holding the ``.text`` of every sentence in ``doc.sents``.
    """
    doc = nlp(text)
    return [span.text for span in doc.sents]
34
+
35
def spacy_dep_tokenize(text):
    """Segment ``text`` with the module-level ``nlp_dep`` pipeline.

    Returns a list holding the ``.text`` of every sentence in ``doc.sents``.
    """
    doc = nlp_dep(text)
    return [span.text for span in doc.sents]
37
+
38
def stanza_tokenize(text):
    """Segment ``text`` with the module-level ``stanza_nlp`` pipeline.

    Returns a list holding the ``.text`` of every entry in the processed
    document's ``sentences``.
    """
    document = stanza_nlp(text)
    return [sentence.text for sentence in document.sentences]
40
+
41
def make_sentences(segmented_tokens):
    """Lazily rebuild sentence strings from groups of tokens.

    For every group in ``segmented_tokens`` the tokens are converted with
    ``str``, concatenated without a separator, and the result is stripped of
    surrounding whitespace before being yielded.
    """
    for token_group in segmented_tokens:
        pieces = [str(tok) for tok in token_group]
        yield "".join(pieces).strip()
44
+
45
def syntok_tokenize(text):
    """Segment ``text`` with syntok.

    The text is tokenized by the module-level ``syntok_tokenizer``, the token
    iterator is passed to ``syntok_segmenter.split``, and each resulting
    token group is turned back into a string by ``make_sentences``.
    """
    token_iter = iter(syntok_tokenizer.split(text))
    grouped = syntok_segmenter.split(token_iter)
    return list(make_sentences(grouped))
50
+
51
+
52
+ total_rules = len(GOLDEN_EN_RULES)
53
+
54
def benchmark(golden_rules, tokenize_func):
    """Return the percentage of golden rules that ``tokenize_func`` reproduces exactly.

    Args:
        golden_rules: Iterable of ``(text, expected_segments)`` pairs.
        tokenize_func: Callable that takes the text and returns its segments;
            the result is compared to ``expected_segments`` with ``==``.

    Returns:
        The share of matching rules as a float percentage, or ``0.0`` when
        ``golden_rules`` is empty.
    """
    # Materialise once so the rules can be both iterated and counted, even
    # when a generator is passed in.
    rules = list(golden_rules)
    if not rules:
        return 0.0

    passed = sum(1 for text, expected in rules if tokenize_func(text) == expected)

    # Normalise by the rule set that was actually evaluated rather than by
    # the module-level English total, so other rule sets score correctly.
    return (passed / len(rules)) * 100.0
64
+
65
if __name__ == "__main__":
    # Score every library on the English golden rules and report the mean
    # wall-clock time of one full pass over the rule set.
    import time

    runs = 100  # number of full passes timed per library
    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize,
    )
    for tokenize_func in libraries:
        # perf_counter() is the monotonic, highest-resolution clock intended
        # for measuring short durations; time.time() can jump if the system
        # clock is adjusted mid-run.
        start = time.perf_counter()
        for _ in range(runs):
            percent_score = benchmark(GOLDEN_EN_RULES, tokenize_func)
        time_taken = time.perf_counter() - start

        print()
        print(tokenize_func.__name__)
        print('GRS score: {:0.2f}%'.format(percent_score))
        print('Speed(Avg over {} runs): {:>10.2f} ms'.format(runs, time_taken*1000/runs))
@@ -0,0 +1,75 @@
1
+ import blingfire
2
+ import nltk
3
+ import pysbd
4
+ import spacy
5
+ import stanza
6
+
7
+ from syntok.tokenizer import Tokenizer
8
+ import syntok.segmenter as syntok_segmenter
9
+
10
# pySBD segmenter under test (English, clean=False, char_span=False).
pysbd_segmenter = pysbd.Segmenter(
    language="en",
    clean=False,
    char_span=False,
)

# Blank English spaCy pipeline whose only added component is "sentencizer".
nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))

# spaCy "en_core_web_sm" pipeline loaded with the "ner" component disabled.
nlp_dep = spacy.load("en_core_web_sm", disable=["ner"])

# Stanza English pipeline limited to the "tokenize" processor.
# NOTE(review): the call below presumably fetches the English resources on
# first use; it is kept commented out, as in the original script.
# stanza.download('en')
stanza_nlp = stanza.Pipeline(lang="en", processors="tokenize")

# Shared syntok tokenizer, reused by syntok_tokenize().
syntok_tokenizer = Tokenizer()
19
+
20
def blingfire_tokenize(text):
    """Segment ``text`` with BlingFire.

    Calls ``blingfire.text_to_sentences`` and splits its result on newline
    characters, returning the resulting list of strings.
    """
    joined_sentences = blingfire.text_to_sentences(text)
    return joined_sentences.split('\n')
22
+
23
def nltk_tokenize(text):
    """Segment ``text`` with NLTK and return the result of ``nltk.sent_tokenize``."""
    sentences = nltk.sent_tokenize(text)
    return sentences
25
+
26
def pysbd_tokenize(text):
    """Segment ``text`` with the module-level ``pysbd_segmenter``.

    Each segment is stripped of surrounding whitespace before being returned
    in a list.
    """
    return [sentence.strip() for sentence in pysbd_segmenter.segment(text)]
30
+
31
def spacy_tokenize(text):
    """Segment ``text`` with the module-level ``nlp`` pipeline.

    Returns the ``.text`` of every sentence in ``doc.sents`` with leading and
    trailing newline characters removed.
    """
    doc = nlp(text)
    return [span.text.strip("\n") for span in doc.sents]
33
+
34
def spacy_dep_tokenize(text):
    """Segment ``text`` with the module-level ``nlp_dep`` pipeline.

    Returns the ``.text`` of every sentence in ``doc.sents`` with leading and
    trailing newline characters removed.
    """
    doc = nlp_dep(text)
    return [span.text.strip("\n") for span in doc.sents]
36
+
37
def stanza_tokenize(text):
    """Segment ``text`` with the module-level ``stanza_nlp`` pipeline.

    Returns a list holding the ``.text`` of every entry in the processed
    document's ``sentences``.
    """
    document = stanza_nlp(text)
    return [sentence.text for sentence in document.sentences]
39
+
40
def make_sentences(segmented_tokens):
    """Lazily rebuild sentence strings from groups of tokens.

    For every group in ``segmented_tokens`` the tokens are converted with
    ``str``, concatenated without a separator, and the result is stripped of
    surrounding whitespace before being yielded.
    """
    for token_group in segmented_tokens:
        pieces = [str(tok) for tok in token_group]
        yield "".join(pieces).strip()
43
+
44
def syntok_tokenize(text):
    """Segment ``text`` with syntok.

    The text is tokenized by the module-level ``syntok_tokenizer``, the token
    iterator is passed to ``syntok_segmenter.split``, and each resulting
    token group is turned back into a string by ``make_sentences``.
    """
    token_iter = iter(syntok_tokenizer.split(text))
    grouped = syntok_segmenter.split(token_iter)
    return list(make_sentences(grouped))
49
+
50
def speed_benchmark(big_text, tokenize_func):
    """Run ``tokenize_func`` on ``big_text`` and return whatever it returns."""
    return tokenize_func(big_text)
53
+
54
if __name__ == "__main__":
    # Time a single segmentation pass of each library over one large text.
    import time

    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize,
    )

    # wget http://www.gutenberg.org/files/1661/1661-0.txt -P benchmarks/
    # Load the corpus once, outside the timed region, so disk I/O and
    # decoding are not attributed to any tokenizer.
    # NOTE(review): the file is presumably UTF-8 (Gutenberg "-0" naming);
    # the explicit encoding avoids depending on the platform default.
    with open('benchmarks/1661-0.txt', encoding='utf-8') as bigfile:
        big_text = bigfile.read()

    for tokenize_func in libraries:
        # perf_counter() is the monotonic, high-resolution clock intended
        # for measuring elapsed time.
        start = time.perf_counter()
        speed_benchmark(big_text, tokenize_func)
        time_taken = time.perf_counter() - start

        print()
        print(tokenize_func.__name__)
        print('Speed : {:>20.2f} ms'.format(time_taken * 1000))
@@ -0,0 +1,210 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ GOLDEN_EN_RULES = [
4
+ # 1) Simple period to end sentence
5
+ ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
6
+ # 2) Question mark to end sentence
7
+ ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),
8
+ # 3) Exclamation point to end sentence
9
+ ("There it is! I found it.", ["There it is!", "I found it."]),
10
+ # 4) One letter upper case abbreviations
11
+ ("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]),
12
+ # 5) One letter lower case abbreviations
13
+ ("Please turn to p. 55.", ["Please turn to p. 55."]),
14
+ # 6) Two letter lower case abbreviations in the middle of a sentence
15
+ ("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]),
16
+ # 7) Two letter upper case abbreviations in the middle of a sentence
17
+ ("They closed the deal with Pitt, Briggs & Co. at noon.",
18
+ ["They closed the deal with Pitt, Briggs & Co. at noon."]),
19
+ # 8) Two letter lower case abbreviations at the end of a sentence
20
+ (
21
+ "Let's ask Jane and co. They should know.",
22
+ ["Let's ask Jane and co.", "They should know."]),
23
+ # 9) Two letter upper case abbreviations at the end of a sentence
24
+ (
25
+ "They closed the deal with Pitt, Briggs & Co. It closed yesterday.", [
26
+ "They closed the deal with Pitt, Briggs & Co.",
27
+ "It closed yesterday."
28
+ ],
29
+ ),
30
+ # 10) Two letter (prepositive) abbreviations
31
+ ("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]),
32
+ # 11) Two letter (prepositive & postpositive) abbreviations
33
+ (
34
+ "St. Michael's Church is on 5th st. near the light.",
35
+ ["St. Michael's Church is on 5th st. near the light."],
36
+ ),
37
+ # 12) Possessive two letter abbreviations
38
+ ("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]),
39
+ # 13) Multi-period abbreviations in the middle of a sentence
40
+ ("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]),
41
+ # 14) Multi-period abbreviations at the end of a sentence
42
+ (
43
+ "I live in the E.U. How about you?",
44
+ ["I live in the E.U.", "How about you?"],
45
+ ),
46
+ # 15) U.S. as sentence boundary
47
+ (
48
+ "I live in the U.S. How about you?",
49
+ ["I live in the U.S.", "How about you?"],
50
+ ),
51
+ # 16) U.S. as non sentence boundary with next word capitalized
52
+ ("I work for the U.S. Government in Virginia.",
53
+ ["I work for the U.S. Government in Virginia."]),
54
+ # 17) U.S. as non sentence boundary
55
+ ("I have lived in the U.S. for 20 years.",
56
+ ["I have lived in the U.S. for 20 years."]),
57
+ # Most difficult sentence to crack
58
+ # 18) A.M. / P.M. as non sentence boundary and sentence boundary
59
+ (
60
+ "At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.",
61
+ [
62
+ "At 5 a.m. Mr. Smith went to the bank.",
63
+ "He left the bank at 6 P.M.", "Mr. Smith then went to the store."
64
+ ]
65
+ ),
66
+ # 19) Number as non sentence boundary
67
+ ("She has $100.00 in her bag.", ["She has $100.00 in her bag."]),
68
+ # 20) Number as sentence boundary
69
+ ("She has $100.00. It is in her bag.", ["She has $100.00.", "It is in her bag."]),
70
+ # 21) Parenthetical inside sentence
71
+ ("He teaches science (He previously worked for 5 years as an engineer.) at the local University.",
72
+ ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]),
73
+ # 22) Email addresses
74
+ ("Her email is Jane.Doe@example.com. I sent her an email.",
75
+ ["Her email is Jane.Doe@example.com.", "I sent her an email."]),
76
+ # 23) Web addresses
77
+ ("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.",
78
+ ["The site is: https://www.example.50.com/new-site/awesome_content.html.",
79
+ "Please check it out."]),
80
+ # 24) Single quotations inside sentence
81
+ (
82
+ "She turned to him, 'This is great.' she said.",
83
+ ["She turned to him, 'This is great.' she said."],
84
+ ),
85
+ # 25) Double quotations inside sentence
86
+ (
87
+ 'She turned to him, "This is great." she said.',
88
+ ['She turned to him, "This is great." she said.'],
89
+ ),
90
+ # 26) Double quotations at the end of a sentence
91
+ (
92
+ 'She turned to him, "This is great." She held the book out to show him.',
93
+ [
94
+ 'She turned to him, "This is great."',
95
+ "She held the book out to show him."
96
+ ],
97
+ ),
98
+ # 27) Double punctuation (exclamation point)
99
+ ("Hello!! Long time no see.", ["Hello!!", "Long time no see."]),
100
+ # 28) Double punctuation (question mark)
101
+ ("Hello?? Who is there?", ["Hello??", "Who is there?"]),
102
+ # 29) Double punctuation (exclamation point / question mark)
103
+ ("Hello!? Is that you?", ["Hello!?", "Is that you?"]),
104
+ # 30) Double punctuation (question mark / exclamation point)
105
+ ("Hello?! Is that you?", ["Hello?!", "Is that you?"]),
106
+ # 31) List (period followed by parens and no period to end item)
107
+ (
108
+ "1.) The first item 2.) The second item",
109
+ ["1.) The first item", "2.) The second item"],
110
+ ),
111
+ # 32) List (period followed by parens and period to end item)
112
+ (
113
+ "1.) The first item. 2.) The second item.",
114
+ ["1.) The first item.", "2.) The second item."],
115
+ ),
116
+ # 33) List (parens and no period to end item)
117
+ (
118
+ "1) The first item 2) The second item",
119
+ ["1) The first item", "2) The second item"],
120
+ ),
121
+ # 34) List (parens and period to end item)
122
+ ("1) The first item. 2) The second item.",
123
+ ["1) The first item.", "2) The second item."]),
124
+ # 35) List (period to mark list and no period to end item)
125
+ (
126
+ "1. The first item 2. The second item",
127
+ ["1. The first item", "2. The second item"],
128
+ ),
129
+ # 36) List (period to mark list and period to end item)
130
+ (
131
+ "1. The first item. 2. The second item.",
132
+ ["1. The first item.", "2. The second item."],
133
+ ),
134
+ # 37) List with bullet
135
+ (
136
+ "• 9. The first item • 10. The second item",
137
+ ["• 9. The first item", "• 10. The second item"],
138
+ ),
139
+ # 38) List with hyphen
140
+ (
141
+ "⁃9. The first item ⁃10. The second item",
142
+ ["⁃9. The first item", "⁃10. The second item"],
143
+ ),
144
+ # 39) Alphabetical list
145
+ (
146
+ "a. The first item b. The second item c. The third list item",
147
+ ["a. The first item", "b. The second item", "c. The third list item"],
148
+ ),
149
+ # 40) Geo Coordinates
150
+ (
151
+ "You can find it at N°. 1026.253.553. That is where the treasure is.",
152
+ [
153
+ "You can find it at N°. 1026.253.553.",
154
+ "That is where the treasure is."
155
+ ],
156
+ ),
157
+ # 41) Named entities with an exclamation point
158
+ (
159
+ "She works at Yahoo! in the accounting department.",
160
+ ["She works at Yahoo! in the accounting department."],
161
+ ),
162
+ # 42) I as a sentence boundary and I as an abbreviation
163
+ (
164
+ "We make a good team, you and I. Did you see Albert I. Jones yesterday?",
165
+ [
166
+ "We make a good team, you and I.",
167
+ "Did you see Albert I. Jones yesterday?"
168
+ ],
169
+ ),
170
+ # 43) Ellipsis at end of quotation
171
+ (
172
+ "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”",
173
+ [
174
+ "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"
175
+ ],
176
+ ),
177
+ # 44) Ellipsis with square brackets
178
+ (
179
+ """"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""",
180
+ [
181
+ '"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).'
182
+ ],
183
+ ),
184
+ # 45) Ellipsis as sentence boundary (standard ellipsis rules)
185
+ ("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.",
186
+ [
187
+ "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .",
188
+ "Next sentence."
189
+ ]),
190
+ # 46) Ellipsis as sentence boundary (non-standard ellipsis rules)
191
+ (
192
+ "I never meant that.... She left the store.",
193
+ ["I never meant that....", "She left the store."],
194
+ ),
195
+ # 47) Ellipsis as non sentence boundary
196
+ (
197
+ "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.",
198
+ [
199
+ "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."
200
+ ],
201
+ ),
202
+ # 48) 4-dot ellipsis
203
+ (
204
+ "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .",
205
+ [
206
+ "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.",
207
+ ". . . The practice was not abandoned. . . ."
208
+ ],
209
+ )
210
+ ]