fast-sentence-segment 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,11 @@
2
2
  """CLI for fast-sentence-segment."""
3
3
 
4
4
  import argparse
5
+ import itertools
5
6
  import logging
6
7
  import os
7
8
  import sys
9
+ import threading
8
10
  import time
9
11
 
10
12
  from fast_sentence_segment import segment_text
@@ -21,6 +23,34 @@ YELLOW = "\033[33m"
21
23
  RESET = "\033[0m"
22
24
 
23
25
 
26
class Spinner:
    """Animated terminal spinner shown while a long-running operation runs.

    Use as a context manager: entering starts a background thread that
    repeatedly redraws the next animation frame followed by ``message`` on
    the current line; exiting stops that thread and blanks the line.

    Attributes:
        message: Text printed to the right of the spinner frame.
        running: True between ``__enter__`` and ``__exit__``.
        thread: The background drawing thread, or None before first entry.
        frames: Endless iterator over the animation frames.
    """

    def __init__(self, message: str):
        self.message = message
        self.running = False
        self.thread = None
        self.frames = itertools.cycle(["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"])
        # Stop signal for the worker. An Event (rather than only the
        # ``running`` flag plus sleep) lets __exit__ wake the worker
        # immediately instead of waiting out the current frame delay.
        self._stop_event = threading.Event()

    def _spin(self):
        """Draw frames until the spinner is stopped (runs in the worker thread)."""
        while self.running:
            frame = next(self.frames)
            print(f"\r {YELLOW}{frame}{RESET} {self.message}", end="", flush=True)
            # wait() serves as the inter-frame delay and returns True as soon
            # as a stop has been requested.
            if self._stop_event.wait(0.08):
                break

    def __enter__(self):
        """Start the animation thread and return the spinner itself."""
        self._stop_event.clear()
        self.running = True
        # daemon=True: the animation must never keep the interpreter alive
        # if the process is torn down without __exit__ being reached.
        self.thread = threading.Thread(target=self._spin, daemon=True)
        self.thread.start()
        return self

    def __exit__(self, *args):
        """Stop the animation thread and erase the spinner line.

        Returns None, so an exception raised inside the ``with`` body is
        not suppressed.
        """
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join()
        # Overwrite the spinner text with spaces, then return the cursor to
        # the start of the line.
        print(f"\r {' ' * (len(self.message) + 4)}\r", end="", flush=True)
52
+
53
+
24
54
  def _header(title: str):
25
55
  print(f"\n{BOLD}{CYAN}{title}{RESET}")
26
56
  print(f"{DIM}{'─' * 40}{RESET}")
@@ -113,35 +143,50 @@ def file_main():
113
143
  )
114
144
  args = parser.parse_args()
115
145
 
146
+ # Echo command immediately
116
147
  _header("segment-file")
148
+ print(f" {DIM}Segmenting text file into sentences{RESET}")
149
+ print()
150
+
151
+ # Show configuration
117
152
  _param("Input", args.input_file)
118
153
  _param("Output", args.output_file)
119
154
  _param("Size", _file_size(args.input_file))
120
- if args.unwrap:
121
- _param("Unwrap", "enabled")
122
-
123
- print(f"\n {YELLOW}Segmenting...{RESET}", end="", flush=True)
155
+ _param("Unwrap", "enabled" if args.unwrap else "disabled")
156
+ _param("Normalize quotes", "disabled" if args.no_normalize_quotes else "enabled")
157
+ print()
124
158
 
159
+ # Step 1: Read file
160
+ print(f" {YELLOW}→{RESET} Reading input file...")
125
161
  with open(args.input_file, "r", encoding="utf-8") as f:
126
162
  text = f.read()
163
+ print(f" {GREEN}✓{RESET} Read {len(text):,} characters")
127
164
 
165
+ # Step 2: Segment text
166
+ print(f" {YELLOW}→{RESET} Segmenting text...", end="", flush=True)
128
167
  start = time.perf_counter()
129
168
  normalize = not args.no_normalize_quotes
130
169
  sentences = segment_text(
131
170
  text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
132
171
  )
133
172
  elapsed = time.perf_counter() - start
173
+ print(f"\r {GREEN}✓{RESET} Segmented into {len(sentences):,} sentences ({elapsed:.2f}s)")
134
174
 
175
+ # Step 3: Write output
176
+ total = len(sentences)
135
177
  with open(args.output_file, "w", encoding="utf-8") as f:
136
178
  if args.unwrap:
137
179
  f.write(format_grouped_sentences(sentences) + "\n")
180
+ print(f" {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file}")
138
181
  else:
139
- for sentence in sentences:
182
+ for i, sentence in enumerate(sentences, 1):
140
183
  f.write(sentence + "\n")
184
+ if i % 500 == 0 or i == total:
185
+ pct = (i / total) * 100
186
+ print(f"\r {YELLOW}→{RESET} Writing... {pct:.0f}% ({i:,}/{total:,})", end="", flush=True)
187
+ print(f"\r {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file} ")
141
188
 
142
- print(f"\r {' ' * 20}\r", end="")
143
- _done(f"{len(sentences):,} sentences in {elapsed:.2f}s")
144
- _done(f"Written to {args.output_file}")
189
+ print(f"\n {GREEN}Done!{RESET}")
145
190
  print()
146
191
 
147
192
 
@@ -3,10 +3,46 @@
3
3
  """ Sentence Segmentation """
4
4
 
5
5
 
6
+ import subprocess
7
+ import sys
8
+
6
9
  import spacy
7
10
 
8
11
  from fast_sentence_segment.core import BaseObject
9
12
 
13
+
14
def _load_spacy_model(model_name: str = "en_core_web_sm"):
    """Load a spaCy model, downloading it first if it is not installed.

    Args:
        model_name: Name of the spaCy model package to load.

    Returns:
        Whatever ``spacy.load(model_name)`` returns.

    Raises:
        subprocess.CalledProcessError: If the ``spacy download`` subprocess
            exits with a non-zero status.
        OSError: If ``spacy.load`` still cannot find the model after the
            download step.
    """
    # ANSI color codes
    bold = "\033[1m"
    cyan = "\033[36m"
    green = "\033[32m"
    yellow = "\033[33m"
    reset = "\033[0m"

    try:
        return spacy.load(model_name)
    except OSError:
        # Function-scope import: only needed on the one-time install path.
        import importlib

        print(f"\n{bold}{cyan}fast-sentence-segment{reset}", file=sys.stderr)
        print(f"{'─' * 40}", file=sys.stderr)
        print(
            f" {yellow}⚠{reset} spaCy model '{model_name}' not found.",
            file=sys.stderr,
        )
        print(f" {yellow}⏳{reset} Downloading model (one-time setup)...", file=sys.stderr)
        print(file=sys.stderr)

        # All status text here goes to stderr; send the downloader's own
        # output there too so stdout stays clean when it is being piped.
        # Fall back to plain inheritance when stderr has no real file
        # descriptor (e.g. it has been replaced by an in-memory stream).
        try:
            sys.stderr.fileno()
        except (AttributeError, OSError, ValueError):
            download_stdout = None
        else:
            download_stdout = sys.stderr

        subprocess.check_call(
            [sys.executable, "-m", "spacy", "download", model_name],
            stdout=download_stdout,
        )

        # The model package was installed while this interpreter was already
        # running; refresh the import system's finder caches so the retry
        # below can see it.
        importlib.invalidate_caches()

        print(file=sys.stderr)
        print(f" {green}✓{reset} Model '{model_name}' installed successfully.", file=sys.stderr)
        print(f" {green}✓{reset} You won't see this message again.", file=sys.stderr)
        print(file=sys.stderr)

        return spacy.load(model_name)
45
+
10
46
  from fast_sentence_segment.dmo import AbbreviationMerger
11
47
  from fast_sentence_segment.dmo import AbbreviationSplitter
12
48
  from fast_sentence_segment.dmo import TitleNameMerger
@@ -45,7 +81,7 @@ class PerformSentenceSegmentation(BaseObject):
45
81
  """
46
82
  BaseObject.__init__(self, __name__)
47
83
  if not self.__nlp:
48
- self.__nlp = spacy.load("en_core_web_sm")
84
+ self.__nlp = _load_spacy_model("en_core_web_sm")
49
85
 
50
86
  self._dehyphenate = Dehyphenator.process
51
87
  self._newlines_to_periods = NewlinesToPeriods.process
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fast-sentence-segment
3
- Version: 1.4.1
3
+ Version: 1.4.3
4
4
  Summary: Fast and Efficient Sentence Segmentation
5
5
  Home-page: https://github.com/craigtrim/fast-sentence-segment
6
6
  License: MIT
@@ -33,7 +33,7 @@ Description-Content-Type: text/markdown
33
33
 
34
34
  [![PyPI version](https://img.shields.io/pypi/v/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
35
35
  [![Python versions](https://img.shields.io/pypi/pyversions/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
36
- [![CI](https://img.shields.io/github/actions/workflow/status/craigtrim/fast-sentence-segment/ci.yml?branch=master&label=CI)](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
36
+ [![Tests](https://img.shields.io/badge/tests-664-brightgreen)](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
37
37
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
38
38
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
39
39
  [![Downloads](https://static.pepy.tech/badge/fast-sentence-segment)](https://pepy.tech/project/fast-sentence-segment)
@@ -1,7 +1,7 @@
1
1
  fast_sentence_segment/__init__.py,sha256=jeb4yCy89ivyqbo-4ldJLquPAG_XR_33Q7nrDjqPxvE,1465
2
2
  fast_sentence_segment/bp/__init__.py,sha256=j2-WfQ9WwVuXeGSjvV6XLVwEdvau8sdAQe4Pa4DrYi8,33
3
3
  fast_sentence_segment/bp/segmenter.py,sha256=UW6DguPgA56h-pPYRsfJhjIzBe40j6NdjkwYxamASyA,1928
4
- fast_sentence_segment/cli.py,sha256=Y89BH-xbJ0vykg301D2543MtGP4kYLnA6i3UQ7Hg5YA,3869
4
+ fast_sentence_segment/cli.py,sha256=vr1Gh-pq4bIPcnhUF6c7ckGdEfoyrI_r0XcrJrIfjEA,5640
5
5
  fast_sentence_segment/core/__init__.py,sha256=uoBersYyVStJ5a8zJpQz1GDGaloEdAv2jGHw1292hRM,108
6
6
  fast_sentence_segment/core/base_object.py,sha256=AYr7yzusIwawjbKdvcv4yTEnhmx6M583kDZzhzPOmq4,635
7
7
  fast_sentence_segment/core/stopwatch.py,sha256=hE6hMz2q6rduaKi58KZmiAL-lRtyh_wWCANhl4KLkRQ,879
@@ -24,9 +24,9 @@ fast_sentence_segment/dmo/title_name_merger.py,sha256=zbG04_VjwM8TtT8LhavvmZqIZL
24
24
  fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWvv6rb2mny8pnVd428oVZL0n5I,2457
25
25
  fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
26
26
  fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
27
- fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=Qaj7oxVHfUd6pIlwCD1O8P14LaGUdolFGuykmrF6gw8,6276
28
- fast_sentence_segment-1.4.1.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
29
- fast_sentence_segment-1.4.1.dist-info/METADATA,sha256=1movcAzZoI75xuEZwRZD0MauE-yW0vJzpfVQr_PSoW0,7854
30
- fast_sentence_segment-1.4.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
- fast_sentence_segment-1.4.1.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
32
- fast_sentence_segment-1.4.1.dist-info/RECORD,,
27
+ fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
28
+ fast_sentence_segment-1.4.3.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
29
+ fast_sentence_segment-1.4.3.dist-info/METADATA,sha256=5LGK9z9ip2AtOr2FgaIgkrR2mLvIQaeeuh8gVi3GBaA,7785
30
+ fast_sentence_segment-1.4.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
+ fast_sentence_segment-1.4.3.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
32
+ fast_sentence_segment-1.4.3.dist-info/RECORD,,