fast-sentence-segment 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_sentence_segment/cli.py +53 -8
- fast_sentence_segment/svc/perform_sentence_segmentation.py +37 -1
- {fast_sentence_segment-1.4.1.dist-info → fast_sentence_segment-1.4.3.dist-info}/METADATA +2 -2
- {fast_sentence_segment-1.4.1.dist-info → fast_sentence_segment-1.4.3.dist-info}/RECORD +7 -7
- {fast_sentence_segment-1.4.1.dist-info → fast_sentence_segment-1.4.3.dist-info}/LICENSE +0 -0
- {fast_sentence_segment-1.4.1.dist-info → fast_sentence_segment-1.4.3.dist-info}/WHEEL +0 -0
- {fast_sentence_segment-1.4.1.dist-info → fast_sentence_segment-1.4.3.dist-info}/entry_points.txt +0 -0
fast_sentence_segment/cli.py
CHANGED
|
@@ -2,9 +2,11 @@
|
|
|
2
2
|
"""CLI for fast-sentence-segment."""
|
|
3
3
|
|
|
4
4
|
import argparse
|
|
5
|
+
import itertools
|
|
5
6
|
import logging
|
|
6
7
|
import os
|
|
7
8
|
import sys
|
|
9
|
+
import threading
|
|
8
10
|
import time
|
|
9
11
|
|
|
10
12
|
from fast_sentence_segment import segment_text
|
|
@@ -21,6 +23,34 @@ YELLOW = "\033[33m"
|
|
|
21
23
|
RESET = "\033[0m"
|
|
22
24
|
|
|
23
25
|
|
|
26
|
+
class Spinner:
|
|
27
|
+
"""Animated spinner for long-running operations."""
|
|
28
|
+
|
|
29
|
+
def __init__(self, message: str):
|
|
30
|
+
self.message = message
|
|
31
|
+
self.running = False
|
|
32
|
+
self.thread = None
|
|
33
|
+
self.frames = itertools.cycle(["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"])
|
|
34
|
+
|
|
35
|
+
def _spin(self):
|
|
36
|
+
while self.running:
|
|
37
|
+
frame = next(self.frames)
|
|
38
|
+
print(f"\r {YELLOW}{frame}{RESET} {self.message}", end="", flush=True)
|
|
39
|
+
time.sleep(0.08)
|
|
40
|
+
|
|
41
|
+
def __enter__(self):
|
|
42
|
+
self.running = True
|
|
43
|
+
self.thread = threading.Thread(target=self._spin)
|
|
44
|
+
self.thread.start()
|
|
45
|
+
return self
|
|
46
|
+
|
|
47
|
+
def __exit__(self, *args):
|
|
48
|
+
self.running = False
|
|
49
|
+
if self.thread:
|
|
50
|
+
self.thread.join()
|
|
51
|
+
print(f"\r {' ' * (len(self.message) + 4)}\r", end="", flush=True)
|
|
52
|
+
|
|
53
|
+
|
|
24
54
|
def _header(title: str):
|
|
25
55
|
print(f"\n{BOLD}{CYAN}{title}{RESET}")
|
|
26
56
|
print(f"{DIM}{'─' * 40}{RESET}")
|
|
@@ -113,35 +143,50 @@ def file_main():
|
|
|
113
143
|
)
|
|
114
144
|
args = parser.parse_args()
|
|
115
145
|
|
|
146
|
+
# Echo command immediately
|
|
116
147
|
_header("segment-file")
|
|
148
|
+
print(f" {DIM}Segmenting text file into sentences{RESET}")
|
|
149
|
+
print()
|
|
150
|
+
|
|
151
|
+
# Show configuration
|
|
117
152
|
_param("Input", args.input_file)
|
|
118
153
|
_param("Output", args.output_file)
|
|
119
154
|
_param("Size", _file_size(args.input_file))
|
|
120
|
-
if args.unwrap
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
print(f"\n {YELLOW}Segmenting...{RESET}", end="", flush=True)
|
|
155
|
+
_param("Unwrap", "enabled" if args.unwrap else "disabled")
|
|
156
|
+
_param("Normalize quotes", "disabled" if args.no_normalize_quotes else "enabled")
|
|
157
|
+
print()
|
|
124
158
|
|
|
159
|
+
# Step 1: Read file
|
|
160
|
+
print(f" {YELLOW}→{RESET} Reading input file...")
|
|
125
161
|
with open(args.input_file, "r", encoding="utf-8") as f:
|
|
126
162
|
text = f.read()
|
|
163
|
+
print(f" {GREEN}✓{RESET} Read {len(text):,} characters")
|
|
127
164
|
|
|
165
|
+
# Step 2: Segment text
|
|
166
|
+
print(f" {YELLOW}→{RESET} Segmenting text...", end="", flush=True)
|
|
128
167
|
start = time.perf_counter()
|
|
129
168
|
normalize = not args.no_normalize_quotes
|
|
130
169
|
sentences = segment_text(
|
|
131
170
|
text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
|
|
132
171
|
)
|
|
133
172
|
elapsed = time.perf_counter() - start
|
|
173
|
+
print(f"\r {GREEN}✓{RESET} Segmented into {len(sentences):,} sentences ({elapsed:.2f}s)")
|
|
134
174
|
|
|
175
|
+
# Step 3: Write output
|
|
176
|
+
total = len(sentences)
|
|
135
177
|
with open(args.output_file, "w", encoding="utf-8") as f:
|
|
136
178
|
if args.unwrap:
|
|
137
179
|
f.write(format_grouped_sentences(sentences) + "\n")
|
|
180
|
+
print(f" {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file}")
|
|
138
181
|
else:
|
|
139
|
-
for sentence in sentences:
|
|
182
|
+
for i, sentence in enumerate(sentences, 1):
|
|
140
183
|
f.write(sentence + "\n")
|
|
184
|
+
if i % 500 == 0 or i == total:
|
|
185
|
+
pct = (i / total) * 100
|
|
186
|
+
print(f"\r {YELLOW}→{RESET} Writing... {pct:.0f}% ({i:,}/{total:,})", end="", flush=True)
|
|
187
|
+
print(f"\r {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file} ")
|
|
141
188
|
|
|
142
|
-
print(f"\
|
|
143
|
-
_done(f"{len(sentences):,} sentences in {elapsed:.2f}s")
|
|
144
|
-
_done(f"Written to {args.output_file}")
|
|
189
|
+
print(f"\n {GREEN}Done!{RESET}")
|
|
145
190
|
print()
|
|
146
191
|
|
|
147
192
|
|
|
@@ -3,10 +3,46 @@
|
|
|
3
3
|
""" Sentence Segmentation """
|
|
4
4
|
|
|
5
5
|
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
|
|
6
9
|
import spacy
|
|
7
10
|
|
|
8
11
|
from fast_sentence_segment.core import BaseObject
|
|
9
12
|
|
|
13
|
+
|
|
14
|
+
def _load_spacy_model(model_name: str = "en_core_web_sm"):
|
|
15
|
+
"""Load spaCy model, auto-downloading if not found."""
|
|
16
|
+
# ANSI color codes
|
|
17
|
+
bold = "\033[1m"
|
|
18
|
+
cyan = "\033[36m"
|
|
19
|
+
green = "\033[32m"
|
|
20
|
+
yellow = "\033[33m"
|
|
21
|
+
reset = "\033[0m"
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
return spacy.load(model_name)
|
|
25
|
+
except OSError:
|
|
26
|
+
print(f"\n{bold}{cyan}fast-sentence-segment{reset}", file=sys.stderr)
|
|
27
|
+
print(f"{'─' * 40}", file=sys.stderr)
|
|
28
|
+
print(
|
|
29
|
+
f" {yellow}⚠{reset} spaCy model '{model_name}' not found.",
|
|
30
|
+
file=sys.stderr,
|
|
31
|
+
)
|
|
32
|
+
print(f" {yellow}⏳{reset} Downloading model (one-time setup)...", file=sys.stderr)
|
|
33
|
+
print(file=sys.stderr)
|
|
34
|
+
|
|
35
|
+
subprocess.check_call(
|
|
36
|
+
[sys.executable, "-m", "spacy", "download", model_name],
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
print(file=sys.stderr)
|
|
40
|
+
print(f" {green}✓{reset} Model '{model_name}' installed successfully.", file=sys.stderr)
|
|
41
|
+
print(f" {green}✓{reset} You won't see this message again.", file=sys.stderr)
|
|
42
|
+
print(file=sys.stderr)
|
|
43
|
+
|
|
44
|
+
return spacy.load(model_name)
|
|
45
|
+
|
|
10
46
|
from fast_sentence_segment.dmo import AbbreviationMerger
|
|
11
47
|
from fast_sentence_segment.dmo import AbbreviationSplitter
|
|
12
48
|
from fast_sentence_segment.dmo import TitleNameMerger
|
|
@@ -45,7 +81,7 @@ class PerformSentenceSegmentation(BaseObject):
|
|
|
45
81
|
"""
|
|
46
82
|
BaseObject.__init__(self, __name__)
|
|
47
83
|
if not self.__nlp:
|
|
48
|
-
self.__nlp = spacy.load("en_core_web_sm")
|
|
84
|
+
self.__nlp = _load_spacy_model("en_core_web_sm")
|
|
49
85
|
|
|
50
86
|
self._dehyphenate = Dehyphenator.process
|
|
51
87
|
self._newlines_to_periods = NewlinesToPeriods.process
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: fast-sentence-segment
|
|
3
|
-
Version: 1.4.1
|
|
3
|
+
Version: 1.4.3
|
|
4
4
|
Summary: Fast and Efficient Sentence Segmentation
|
|
5
5
|
Home-page: https://github.com/craigtrim/fast-sentence-segment
|
|
6
6
|
License: MIT
|
|
@@ -33,7 +33,7 @@ Description-Content-Type: text/markdown
|
|
|
33
33
|
|
|
34
34
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
35
35
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
36
|
-
[](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
|
|
37
37
|
[](https://opensource.org/licenses/MIT)
|
|
38
38
|
[](https://github.com/astral-sh/ruff)
|
|
39
39
|
[](https://pepy.tech/project/fast-sentence-segment)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
fast_sentence_segment/__init__.py,sha256=jeb4yCy89ivyqbo-4ldJLquPAG_XR_33Q7nrDjqPxvE,1465
|
|
2
2
|
fast_sentence_segment/bp/__init__.py,sha256=j2-WfQ9WwVuXeGSjvV6XLVwEdvau8sdAQe4Pa4DrYi8,33
|
|
3
3
|
fast_sentence_segment/bp/segmenter.py,sha256=UW6DguPgA56h-pPYRsfJhjIzBe40j6NdjkwYxamASyA,1928
|
|
4
|
-
fast_sentence_segment/cli.py,sha256=
|
|
4
|
+
fast_sentence_segment/cli.py,sha256=vr1Gh-pq4bIPcnhUF6c7ckGdEfoyrI_r0XcrJrIfjEA,5640
|
|
5
5
|
fast_sentence_segment/core/__init__.py,sha256=uoBersYyVStJ5a8zJpQz1GDGaloEdAv2jGHw1292hRM,108
|
|
6
6
|
fast_sentence_segment/core/base_object.py,sha256=AYr7yzusIwawjbKdvcv4yTEnhmx6M583kDZzhzPOmq4,635
|
|
7
7
|
fast_sentence_segment/core/stopwatch.py,sha256=hE6hMz2q6rduaKi58KZmiAL-lRtyh_wWCANhl4KLkRQ,879
|
|
@@ -24,9 +24,9 @@ fast_sentence_segment/dmo/title_name_merger.py,sha256=zbG04_VjwM8TtT8LhavvmZqIZL
|
|
|
24
24
|
fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWvv6rb2mny8pnVd428oVZL0n5I,2457
|
|
25
25
|
fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
|
|
26
26
|
fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
|
|
27
|
-
fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=
|
|
28
|
-
fast_sentence_segment-1.4.
|
|
29
|
-
fast_sentence_segment-1.4.
|
|
30
|
-
fast_sentence_segment-1.4.
|
|
31
|
-
fast_sentence_segment-1.4.
|
|
32
|
-
fast_sentence_segment-1.4.
|
|
27
|
+
fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
|
|
28
|
+
fast_sentence_segment-1.4.3.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
|
|
29
|
+
fast_sentence_segment-1.4.3.dist-info/METADATA,sha256=5LGK9z9ip2AtOr2FgaIgkrR2mLvIQaeeuh8gVi3GBaA,7785
|
|
30
|
+
fast_sentence_segment-1.4.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
31
|
+
fast_sentence_segment-1.4.3.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
|
|
32
|
+
fast_sentence_segment-1.4.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{fast_sentence_segment-1.4.1.dist-info → fast_sentence_segment-1.4.3.dist-info}/entry_points.txt
RENAMED
|
File without changes
|