fast-sentence-segment 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,11 @@
2
2
  """CLI for fast-sentence-segment."""
3
3
 
4
4
  import argparse
5
+ import itertools
5
6
  import logging
6
7
  import os
7
8
  import sys
9
+ import threading
8
10
  import time
9
11
 
10
12
  from fast_sentence_segment import segment_text
@@ -21,6 +23,34 @@ YELLOW = "\033[33m"
21
23
  RESET = "\033[0m"
22
24
 
23
25
 
26
class Spinner:
    """Animated spinner shown while a long-running operation executes.

    Intended to be used as a context manager::

        with Spinner("Segmenting text..."):
            do_work()

    While the ``with`` body runs, a background thread redraws a single
    status line on standard output. On exit the thread is stopped and the
    line is blanked out. When standard output is not an interactive terminal
    (redirected to a file or a pipe), nothing is drawn at all, so captured
    output is not polluted with carriage returns and ANSI escape codes.
    """

    def __init__(self, message: str):
        """Prepare a spinner that will display *message* next to the frames."""
        self.message = message
        self.running = False
        self.thread = None
        self.frames = itertools.cycle(["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"])
        # Signalled by __exit__ so the worker wakes up immediately instead of
        # finishing a full frame delay before noticing it should stop.
        self._stop = threading.Event()

    def _spin(self):
        """Worker loop: redraw the spinner line until asked to stop."""
        while self.running:
            frame = next(self.frames)
            print(f"\r {YELLOW}{frame}{RESET} {self.message}", end="", flush=True)
            # Doubles as the inter-frame delay; returns True as soon as the
            # stop event is set, which ends the animation without lag.
            if self._stop.wait(0.08):
                break

    def __enter__(self):
        """Start the animation (interactive terminals only) and return self."""
        self.running = True
        self.thread = None
        self._stop.clear()
        # sys.stdout may be None or a stream without isatty(); treat both as
        # non-interactive.
        if getattr(sys.stdout, "isatty", lambda: False)():
            # Daemon thread: a stray spinner must never keep the process alive.
            self.thread = threading.Thread(target=self._spin, daemon=True)
            self.thread.start()
        return self

    def __exit__(self, *args):
        """Stop the animation and erase the spinner line if one was drawn."""
        self.running = False
        self._stop.set()
        if self.thread:
            self.thread.join()
            # Overwrite the drawn line with spaces, then return the cursor to
            # column 0 so the next print starts on a clean line.
            print(f"\r {' ' * (len(self.message) + 4)}\r", end="", flush=True)
52
+
53
+
24
54
  def _header(title: str):
25
55
  print(f"\n{BOLD}{CYAN}{title}{RESET}")
26
56
  print(f"{DIM}{'─' * 40}{RESET}")
@@ -117,29 +147,28 @@ def file_main():
117
147
  _param("Input", args.input_file)
118
148
  _param("Output", args.output_file)
119
149
  _param("Size", _file_size(args.input_file))
120
- if args.unwrap:
121
- _param("Unwrap", "enabled")
122
-
123
- print(f"\n {YELLOW}Segmenting...{RESET}", end="", flush=True)
150
+ _param("Unwrap", "enabled" if args.unwrap else "disabled")
151
+ _param("Normalize quotes", "disabled" if args.no_normalize_quotes else "enabled")
124
152
 
125
153
  with open(args.input_file, "r", encoding="utf-8") as f:
126
154
  text = f.read()
127
155
 
128
156
  start = time.perf_counter()
129
157
  normalize = not args.no_normalize_quotes
130
- sentences = segment_text(
131
- text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
132
- )
158
+ with Spinner("Segmenting text..."):
159
+ sentences = segment_text(
160
+ text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
161
+ )
133
162
  elapsed = time.perf_counter() - start
134
163
 
135
- with open(args.output_file, "w", encoding="utf-8") as f:
136
- if args.unwrap:
137
- f.write(format_grouped_sentences(sentences) + "\n")
138
- else:
139
- for sentence in sentences:
140
- f.write(sentence + "\n")
164
+ with Spinner("Writing output..."):
165
+ with open(args.output_file, "w", encoding="utf-8") as f:
166
+ if args.unwrap:
167
+ f.write(format_grouped_sentences(sentences) + "\n")
168
+ else:
169
+ for sentence in sentences:
170
+ f.write(sentence + "\n")
141
171
 
142
- print(f"\r {' ' * 20}\r", end="")
143
172
  _done(f"{len(sentences):,} sentences in {elapsed:.2f}s")
144
173
  _done(f"Written to {args.output_file}")
145
174
  print()
@@ -3,10 +3,46 @@
3
3
  """ Sentence Segmentation """
4
4
 
5
5
 
6
+ import subprocess
7
+ import sys
8
+
6
9
  import spacy
7
10
 
8
11
  from fast_sentence_segment.core import BaseObject
9
12
 
13
+
14
def _load_spacy_model(model_name: str = "en_core_web_sm"):
    """Return the loaded spaCy pipeline, downloading the model if loading fails.

    ``spacy.load`` is attempted first. If it raises ``OSError``, a notice is
    written to stderr, ``<python> -m spacy download <model_name>`` is run with
    the current interpreter, a confirmation is written to stderr, and the
    load is attempted a second time.

    Args:
        model_name: Name of the spaCy model to load.

    Returns:
        The object returned by ``spacy.load(model_name)``.

    Raises:
        subprocess.CalledProcessError: If the download command exits with a
            non-zero status.
    """
    # ANSI escape sequences used to style the notice.
    bold, cyan, green, yellow, reset = (
        "\033[1m",
        "\033[36m",
        "\033[32m",
        "\033[33m",
        "\033[0m",
    )

    def emit(line: str = "") -> None:
        # All user-facing notices go to stderr.
        print(line, file=sys.stderr)

    try:
        return spacy.load(model_name)
    except OSError:
        before_download = (
            f"\n{bold}{cyan}fast-sentence-segment{reset}",
            "─" * 40,
            f" {yellow}⚠{reset} spaCy model '{model_name}' not found.",
            f" {yellow}⏳{reset} Downloading model (one-time setup)...",
            "",
        )
        for line in before_download:
            emit(line)

        # Use the running interpreter so the model lands in this environment.
        download_cmd = [sys.executable, "-m", "spacy", "download", model_name]
        subprocess.check_call(download_cmd)

        after_download = (
            "",
            f" {green}✓{reset} Model '{model_name}' installed successfully.",
            f" {green}✓{reset} You won't see this message again.",
            "",
        )
        for line in after_download:
            emit(line)

        return spacy.load(model_name)
45
+
10
46
  from fast_sentence_segment.dmo import AbbreviationMerger
11
47
  from fast_sentence_segment.dmo import AbbreviationSplitter
12
48
  from fast_sentence_segment.dmo import TitleNameMerger
@@ -45,7 +81,7 @@ class PerformSentenceSegmentation(BaseObject):
45
81
  """
46
82
  BaseObject.__init__(self, __name__)
47
83
  if not self.__nlp:
48
- self.__nlp = spacy.load("en_core_web_sm")
84
+ self.__nlp = _load_spacy_model("en_core_web_sm")
49
85
 
50
86
  self._dehyphenate = Dehyphenator.process
51
87
  self._newlines_to_periods = NewlinesToPeriods.process
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fast-sentence-segment
3
- Version: 1.4.0
3
+ Version: 1.4.2
4
4
  Summary: Fast and Efficient Sentence Segmentation
5
5
  Home-page: https://github.com/craigtrim/fast-sentence-segment
6
6
  License: MIT
@@ -9,7 +9,7 @@ Author: Craig Trim
9
9
  Author-email: craigtrim@gmail.com
10
10
  Maintainer: Craig Trim
11
11
  Maintainer-email: craigtrim@gmail.com
12
- Requires-Python: >=3.9,<4.0
12
+ Requires-Python: >=3.9,<3.13
13
13
  Classifier: Development Status :: 5 - Production/Stable
14
14
  Classifier: Intended Audience :: Developers
15
15
  Classifier: Intended Audience :: Science/Research
@@ -34,6 +34,7 @@ Description-Content-Type: text/markdown
34
34
  [![PyPI version](https://img.shields.io/pypi/v/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
35
35
  [![Python versions](https://img.shields.io/pypi/pyversions/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
36
36
  [![CI](https://img.shields.io/github/actions/workflow/status/craigtrim/fast-sentence-segment/ci.yml?branch=master&label=CI)](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
37
+ [![Tests](https://img.shields.io/badge/tests-664-brightgreen)](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
37
38
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
38
39
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
39
40
  [![Downloads](https://static.pepy.tech/badge/fast-sentence-segment)](https://pepy.tech/project/fast-sentence-segment)
@@ -1,7 +1,7 @@
1
1
  fast_sentence_segment/__init__.py,sha256=jeb4yCy89ivyqbo-4ldJLquPAG_XR_33Q7nrDjqPxvE,1465
2
2
  fast_sentence_segment/bp/__init__.py,sha256=j2-WfQ9WwVuXeGSjvV6XLVwEdvau8sdAQe4Pa4DrYi8,33
3
3
  fast_sentence_segment/bp/segmenter.py,sha256=UW6DguPgA56h-pPYRsfJhjIzBe40j6NdjkwYxamASyA,1928
4
- fast_sentence_segment/cli.py,sha256=Y89BH-xbJ0vykg301D2543MtGP4kYLnA6i3UQ7Hg5YA,3869
4
+ fast_sentence_segment/cli.py,sha256=I5tLOnojPJLc-S3VHwQdSFON9DcuTjilwFRfwEpVKag,4866
5
5
  fast_sentence_segment/core/__init__.py,sha256=uoBersYyVStJ5a8zJpQz1GDGaloEdAv2jGHw1292hRM,108
6
6
  fast_sentence_segment/core/base_object.py,sha256=AYr7yzusIwawjbKdvcv4yTEnhmx6M583kDZzhzPOmq4,635
7
7
  fast_sentence_segment/core/stopwatch.py,sha256=hE6hMz2q6rduaKi58KZmiAL-lRtyh_wWCANhl4KLkRQ,879
@@ -24,9 +24,9 @@ fast_sentence_segment/dmo/title_name_merger.py,sha256=zbG04_VjwM8TtT8LhavvmZqIZL
24
24
  fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWvv6rb2mny8pnVd428oVZL0n5I,2457
25
25
  fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
26
26
  fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
27
- fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=Qaj7oxVHfUd6pIlwCD1O8P14LaGUdolFGuykmrF6gw8,6276
28
- fast_sentence_segment-1.4.0.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
29
- fast_sentence_segment-1.4.0.dist-info/METADATA,sha256=XernLjKJbfSBxiqhQm-SZC8JFtggSoPe1hUD2X6D9N8,7853
30
- fast_sentence_segment-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
- fast_sentence_segment-1.4.0.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
32
- fast_sentence_segment-1.4.0.dist-info/RECORD,,
27
+ fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
28
+ fast_sentence_segment-1.4.2.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
29
+ fast_sentence_segment-1.4.2.dist-info/METADATA,sha256=mUMWyQu_ec1Ugni5zFIHjUbMm4FYCypA1ws_NFhoZhM,7987
30
+ fast_sentence_segment-1.4.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
+ fast_sentence_segment-1.4.2.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
32
+ fast_sentence_segment-1.4.2.dist-info/RECORD,,