fast-sentence-segment 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_sentence_segment/cli.py +43 -14
- fast_sentence_segment/svc/perform_sentence_segmentation.py +37 -1
- {fast_sentence_segment-1.4.0.dist-info → fast_sentence_segment-1.4.2.dist-info}/METADATA +3 -2
- {fast_sentence_segment-1.4.0.dist-info → fast_sentence_segment-1.4.2.dist-info}/RECORD +7 -7
- {fast_sentence_segment-1.4.0.dist-info → fast_sentence_segment-1.4.2.dist-info}/LICENSE +0 -0
- {fast_sentence_segment-1.4.0.dist-info → fast_sentence_segment-1.4.2.dist-info}/WHEEL +0 -0
- {fast_sentence_segment-1.4.0.dist-info → fast_sentence_segment-1.4.2.dist-info}/entry_points.txt +0 -0
fast_sentence_segment/cli.py
CHANGED
|
@@ -2,9 +2,11 @@
|
|
|
2
2
|
"""CLI for fast-sentence-segment."""
|
|
3
3
|
|
|
4
4
|
import argparse
|
|
5
|
+
import itertools
|
|
5
6
|
import logging
|
|
6
7
|
import os
|
|
7
8
|
import sys
|
|
9
|
+
import threading
|
|
8
10
|
import time
|
|
9
11
|
|
|
10
12
|
from fast_sentence_segment import segment_text
|
|
@@ -21,6 +23,34 @@ YELLOW = "\033[33m"
|
|
|
21
23
|
RESET = "\033[0m"
|
|
22
24
|
|
|
23
25
|
|
|
26
|
+
class Spinner:
    """Animated terminal spinner for long-running operations.

    Intended to be used as a context manager::

        with Spinner("Segmenting text..."):
            do_work()

    While the ``with`` body runs, a background thread redraws a single line
    on stdout roughly every 80 ms. The animation is only drawn when stdout is
    an interactive terminal; when output is piped or redirected nothing is
    written, so captured output stays free of carriage returns and ANSI
    escape codes. Exceptions raised inside the ``with`` body propagate
    unchanged.

    Attributes:
        message: Text shown to the right of the spinner glyph.
        running: True between ``__enter__`` and ``__exit__``.
        thread: The animation thread, or None when nothing is being drawn.
        frames: Endless iterator over the spinner glyphs.
    """

    def __init__(self, message: str):
        """Create a stopped spinner that will display *message*."""
        self.message = message
        self.running = False
        self.thread = None
        self.frames = itertools.cycle(["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"])
        # Set by __exit__ so the animation thread wakes up immediately
        # instead of finishing its current frame delay.
        self._stop = threading.Event()

    def _spin(self):
        """Redraw the spinner line until the spinner is stopped."""
        while self.running:
            frame = next(self.frames)
            print(f"\r {YELLOW}{frame}{RESET} {self.message}", end="", flush=True)
            # Waiting on the event paces the animation (one frame per 80 ms)
            # and returns early as soon as __exit__ sets it.
            self._stop.wait(0.08)

    def __enter__(self):
        """Start the animation if stdout is a terminal, then return self."""
        self.running = True
        self._stop.clear()
        stream = sys.stdout
        if stream is not None and stream.isatty():
            # daemon=True: the animation thread must never be the thing that
            # keeps the interpreter alive.
            self.thread = threading.Thread(target=self._spin, daemon=True)
            self.thread.start()
        return self

    def __exit__(self, *args):
        """Stop the animation and erase the spinner line if one was drawn."""
        self.running = False
        self._stop.set()
        if self.thread:
            self.thread.join()
            self.thread = None
            # Overwrite the spinner line with blanks and return the cursor to
            # column 0 so the next print starts on a clean line.
            print(f"\r {' ' * (len(self.message) + 4)}\r", end="", flush=True)
|
|
52
|
+
|
|
53
|
+
|
|
24
54
|
def _header(title: str):
|
|
25
55
|
print(f"\n{BOLD}{CYAN}{title}{RESET}")
|
|
26
56
|
print(f"{DIM}{'─' * 40}{RESET}")
|
|
@@ -117,29 +147,28 @@ def file_main():
|
|
|
117
147
|
_param("Input", args.input_file)
|
|
118
148
|
_param("Output", args.output_file)
|
|
119
149
|
_param("Size", _file_size(args.input_file))
|
|
120
|
-
if args.unwrap
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
print(f"\n {YELLOW}Segmenting...{RESET}", end="", flush=True)
|
|
150
|
+
_param("Unwrap", "enabled" if args.unwrap else "disabled")
|
|
151
|
+
_param("Normalize quotes", "disabled" if args.no_normalize_quotes else "enabled")
|
|
124
152
|
|
|
125
153
|
with open(args.input_file, "r", encoding="utf-8") as f:
|
|
126
154
|
text = f.read()
|
|
127
155
|
|
|
128
156
|
start = time.perf_counter()
|
|
129
157
|
normalize = not args.no_normalize_quotes
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
158
|
+
with Spinner("Segmenting text..."):
|
|
159
|
+
sentences = segment_text(
|
|
160
|
+
text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
|
|
161
|
+
)
|
|
133
162
|
elapsed = time.perf_counter() - start
|
|
134
163
|
|
|
135
|
-
with
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
164
|
+
with Spinner("Writing output..."):
|
|
165
|
+
with open(args.output_file, "w", encoding="utf-8") as f:
|
|
166
|
+
if args.unwrap:
|
|
167
|
+
f.write(format_grouped_sentences(sentences) + "\n")
|
|
168
|
+
else:
|
|
169
|
+
for sentence in sentences:
|
|
170
|
+
f.write(sentence + "\n")
|
|
141
171
|
|
|
142
|
-
print(f"\r {' ' * 20}\r", end="")
|
|
143
172
|
_done(f"{len(sentences):,} sentences in {elapsed:.2f}s")
|
|
144
173
|
_done(f"Written to {args.output_file}")
|
|
145
174
|
print()
|
|
@@ -3,10 +3,46 @@
|
|
|
3
3
|
""" Sentence Segmentation """
|
|
4
4
|
|
|
5
5
|
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
|
|
6
9
|
import spacy
|
|
7
10
|
|
|
8
11
|
from fast_sentence_segment.core import BaseObject
|
|
9
12
|
|
|
13
|
+
|
|
14
|
+
def _load_spacy_model(model_name: str = "en_core_web_sm"):
    """Load a spaCy model, downloading it first if it is not installed.

    All progress and status output, including the output of the download
    subprocess, is written to stderr so that stdout carries nothing from
    this function.

    Args:
        model_name: Name of the spaCy pipeline package to load.

    Returns:
        The object returned by ``spacy.load(model_name)``.

    Raises:
        subprocess.CalledProcessError: If ``python -m spacy download`` exits
            with a non-zero status. A hint for installing the model manually
            is printed to stderr before the exception is re-raised.
        Exception: Whatever ``spacy.load`` raises if the model still cannot
            be loaded after a successful download.
    """
    # ANSI color codes
    bold = "\033[1m"
    cyan = "\033[36m"
    green = "\033[32m"
    yellow = "\033[33m"
    reset = "\033[0m"

    try:
        return spacy.load(model_name)
    except OSError:
        # The OSError from spacy.load is treated as "model not installed".
        print(f"\n{bold}{cyan}fast-sentence-segment{reset}", file=sys.stderr)
        print(f"{'─' * 40}", file=sys.stderr)
        print(
            f" {yellow}⚠{reset} spaCy model '{model_name}' not found.",
            file=sys.stderr,
        )
        print(f" {yellow}⏳{reset} Downloading model (one-time setup)...", file=sys.stderr)
        print(file=sys.stderr)

        try:
            # Send the downloader's stdout to our stderr so it lands next to
            # the status messages above instead of on stdout.
            child_stdout = sys.stderr.fileno()
        except (AttributeError, OSError, ValueError):
            # stderr has been replaced by an object without an OS-level file
            # descriptor; let the child inherit the default streams.
            child_stdout = None

        try:
            # Run the downloader under the current interpreter so the model
            # is installed into the environment that is importing spaCy.
            subprocess.check_call(
                [sys.executable, "-m", "spacy", "download", model_name],
                stdout=child_stdout,
            )
        except subprocess.CalledProcessError:
            print(
                f" {yellow}⚠{reset} Download failed. Install the model manually with: "
                f"python -m spacy download {model_name}",
                file=sys.stderr,
            )
            raise

        print(file=sys.stderr)
        print(f" {green}✓{reset} Model '{model_name}' installed successfully.", file=sys.stderr)
        print(f" {green}✓{reset} You won't see this message again.", file=sys.stderr)
        print(file=sys.stderr)

        return spacy.load(model_name)
|
|
45
|
+
|
|
10
46
|
from fast_sentence_segment.dmo import AbbreviationMerger
|
|
11
47
|
from fast_sentence_segment.dmo import AbbreviationSplitter
|
|
12
48
|
from fast_sentence_segment.dmo import TitleNameMerger
|
|
@@ -45,7 +81,7 @@ class PerformSentenceSegmentation(BaseObject):
|
|
|
45
81
|
"""
|
|
46
82
|
BaseObject.__init__(self, __name__)
|
|
47
83
|
if not self.__nlp:
|
|
48
|
-
self.__nlp =
|
|
84
|
+
self.__nlp = _load_spacy_model("en_core_web_sm")
|
|
49
85
|
|
|
50
86
|
self._dehyphenate = Dehyphenator.process
|
|
51
87
|
self._newlines_to_periods = NewlinesToPeriods.process
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: fast-sentence-segment
|
|
3
|
-
Version: 1.4.0
|
|
3
|
+
Version: 1.4.2
|
|
4
4
|
Summary: Fast and Efficient Sentence Segmentation
|
|
5
5
|
Home-page: https://github.com/craigtrim/fast-sentence-segment
|
|
6
6
|
License: MIT
|
|
@@ -9,7 +9,7 @@ Author: Craig Trim
|
|
|
9
9
|
Author-email: craigtrim@gmail.com
|
|
10
10
|
Maintainer: Craig Trim
|
|
11
11
|
Maintainer-email: craigtrim@gmail.com
|
|
12
|
-
Requires-Python: >=3.9,<
|
|
12
|
+
Requires-Python: >=3.9,<3.13
|
|
13
13
|
Classifier: Development Status :: 5 - Production/Stable
|
|
14
14
|
Classifier: Intended Audience :: Developers
|
|
15
15
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -34,6 +34,7 @@ Description-Content-Type: text/markdown
|
|
|
34
34
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
35
35
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
36
36
|
[](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
|
|
37
|
+
[](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
|
|
37
38
|
[](https://opensource.org/licenses/MIT)
|
|
38
39
|
[](https://github.com/astral-sh/ruff)
|
|
39
40
|
[](https://pepy.tech/project/fast-sentence-segment)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
fast_sentence_segment/__init__.py,sha256=jeb4yCy89ivyqbo-4ldJLquPAG_XR_33Q7nrDjqPxvE,1465
|
|
2
2
|
fast_sentence_segment/bp/__init__.py,sha256=j2-WfQ9WwVuXeGSjvV6XLVwEdvau8sdAQe4Pa4DrYi8,33
|
|
3
3
|
fast_sentence_segment/bp/segmenter.py,sha256=UW6DguPgA56h-pPYRsfJhjIzBe40j6NdjkwYxamASyA,1928
|
|
4
|
-
fast_sentence_segment/cli.py,sha256=
|
|
4
|
+
fast_sentence_segment/cli.py,sha256=I5tLOnojPJLc-S3VHwQdSFON9DcuTjilwFRfwEpVKag,4866
|
|
5
5
|
fast_sentence_segment/core/__init__.py,sha256=uoBersYyVStJ5a8zJpQz1GDGaloEdAv2jGHw1292hRM,108
|
|
6
6
|
fast_sentence_segment/core/base_object.py,sha256=AYr7yzusIwawjbKdvcv4yTEnhmx6M583kDZzhzPOmq4,635
|
|
7
7
|
fast_sentence_segment/core/stopwatch.py,sha256=hE6hMz2q6rduaKi58KZmiAL-lRtyh_wWCANhl4KLkRQ,879
|
|
@@ -24,9 +24,9 @@ fast_sentence_segment/dmo/title_name_merger.py,sha256=zbG04_VjwM8TtT8LhavvmZqIZL
|
|
|
24
24
|
fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWvv6rb2mny8pnVd428oVZL0n5I,2457
|
|
25
25
|
fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
|
|
26
26
|
fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
|
|
27
|
-
fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=
|
|
28
|
-
fast_sentence_segment-1.4.
|
|
29
|
-
fast_sentence_segment-1.4.
|
|
30
|
-
fast_sentence_segment-1.4.
|
|
31
|
-
fast_sentence_segment-1.4.
|
|
32
|
-
fast_sentence_segment-1.4.
|
|
27
|
+
fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
|
|
28
|
+
fast_sentence_segment-1.4.2.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
|
|
29
|
+
fast_sentence_segment-1.4.2.dist-info/METADATA,sha256=mUMWyQu_ec1Ugni5zFIHjUbMm4FYCypA1ws_NFhoZhM,7987
|
|
30
|
+
fast_sentence_segment-1.4.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
31
|
+
fast_sentence_segment-1.4.2.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
|
|
32
|
+
fast_sentence_segment-1.4.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{fast_sentence_segment-1.4.0.dist-info → fast_sentence_segment-1.4.2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|