fast-sentence-segment 1.4.2-py3-none-any.whl → 1.4.4-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -143,34 +143,50 @@ def file_main():
     )
     args = parser.parse_args()
 
+    # Echo command immediately
     _header("segment-file")
+    print(f" {DIM}Segmenting text file into sentences{RESET}")
+    print()
+
+    # Show configuration
     _param("Input", args.input_file)
     _param("Output", args.output_file)
     _param("Size", _file_size(args.input_file))
     _param("Unwrap", "enabled" if args.unwrap else "disabled")
     _param("Normalize quotes", "disabled" if args.no_normalize_quotes else "enabled")
+    print()
 
+    # Step 1: Read file
+    print(f" {YELLOW}→{RESET} Reading input file...")
     with open(args.input_file, "r", encoding="utf-8") as f:
         text = f.read()
+    print(f" {GREEN}✓{RESET} Read {len(text):,} characters")
 
+    # Step 2: Segment text
+    print(f" {YELLOW}→{RESET} Segmenting text...", end="", flush=True)
     start = time.perf_counter()
     normalize = not args.no_normalize_quotes
-    with Spinner("Segmenting text..."):
-        sentences = segment_text(
-            text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
-        )
+    sentences = segment_text(
+        text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
+    )
     elapsed = time.perf_counter() - start
-
-    with Spinner("Writing output..."):
-        with open(args.output_file, "w", encoding="utf-8") as f:
-            if args.unwrap:
-                f.write(format_grouped_sentences(sentences) + "\n")
-            else:
-                for sentence in sentences:
-                    f.write(sentence + "\n")
-
-    _done(f"{len(sentences):,} sentences in {elapsed:.2f}s")
-    _done(f"Written to {args.output_file}")
+    print(f"\r {GREEN}✓{RESET} Segmented into {len(sentences):,} sentences ({elapsed:.2f}s)")
+
+    # Step 3: Write output
+    total = len(sentences)
+    with open(args.output_file, "w", encoding="utf-8") as f:
+        if args.unwrap:
+            f.write(format_grouped_sentences(sentences) + "\n")
+            print(f" {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file}")
+        else:
+            for i, sentence in enumerate(sentences, 1):
+                f.write(sentence + "\n")
+                if i % 500 == 0 or i == total:
+                    pct = (i / total) * 100
+                    print(f"\r {YELLOW}→{RESET} Writing... {pct:.0f}% ({i:,}/{total:,})", end="", flush=True)
+            print(f"\r {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file} ")
+
+    print(f"\n {GREEN}Done!{RESET}")
     print()
 
 
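
Below is a minimal sketch of what the reworked file_main() body boils down to once the Spinner wrappers are gone: read, segment, write. The call to segment_text and its keyword arguments are taken from the diff above; the top-level import path and the sample file names are assumptions for illustration only.

```python
# Sketch only: mirrors the read -> segment -> write flow of the new file_main().
# Assumptions: segment_text is importable from the package root, and
# "input.txt" / "output.txt" are stand-in file names.
from fast_sentence_segment import segment_text  # assumed import path

with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Same call the updated CLI makes, no longer wrapped in Spinner(...)
sentences = segment_text(text.strip(), flatten=True, unwrap=False, normalize=True)

with open("output.txt", "w", encoding="utf-8") as f:
    for sentence in sentences:
        f.write(sentence + "\n")

print(f"Segmented into {len(sentences):,} sentences")
```
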
@@ -65,6 +65,54 @@ class SpacyDocSegmenter(BaseObject):
                 return False
         return True
 
+    @staticmethod
+    def _merge_orphaned_quotes(sentences: list) -> list:
+        """Merge orphaned opening quotes with the following sentence.
+
+        spaCy sometimes splits on opening quotes, producing sentences like:
+            ["'", "Oh, the funeral..."]
+        This merges them into:
+            ["'Oh, the funeral..."]
+
+        Also handles trailing orphaned quotes that should belong to next sentence:
+            ["He said. '", "Hello!'"]
+        Becomes:
+            ["He said.", "'Hello!'"]
+        """
+        if not sentences:
+            return sentences
+
+        result = []
+        i = 0
+        while i < len(sentences):
+            sent = sentences[i]
+            # Check if this sentence is just an opening quote
+            if sent.strip() in ("'", '"', "'.", '".'):
+                # Merge with the next sentence if available
+                if i + 1 < len(sentences):
+                    quote_char = sent.strip().rstrip('.')
+                    result.append(quote_char + sentences[i + 1])
+                    i += 2
+                    continue
+            result.append(sent)
+            i += 1
+
+        # Second pass: handle trailing orphaned quotes
+        # Pattern: sentence ends with `. '` or `. "` - move quote to next sentence
+        fixed = []
+        for i, sent in enumerate(result):
+            # Check for trailing orphaned quote (`. '` or `? '` or `! '`)
+            if len(sent) >= 3 and sent[-2:] in (" '", ' "') and sent[-3] in '.?!':
+                # Strip the trailing quote
+                trailing_quote = sent[-1]
+                sent = sent[:-2]
+                # Prepend to next sentence if available
+                if i + 1 < len(result) and not result[i + 1].startswith(('"', "'")):
+                    result[i + 1] = trailing_quote + result[i + 1]
+            fixed.append(sent)
+
+        return fixed
+
     @staticmethod
     def _cleanse(sentences: list) -> str:
         sentences = [sent for sent in sentences
@@ -103,6 +151,9 @@ class SpacyDocSegmenter(BaseObject):
         sentences = [sent for sent in sentences if
                      sent and len(sent) and sent != 'None']
 
+        # Merge orphaned opening quotes with following sentence
+        sentences = self._merge_orphaned_quotes(sentences)
+
         sentences = [self._append_period(sent)
                      for sent in sentences]
 
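
To make the new behavior concrete, here is a small sketch that exercises _merge_orphaned_quotes directly on the two fragment lists from its docstring. The module path follows the RECORD entry for dmo/spacy_doc_segmenter.py; that the module imports cleanly on its own (it likely pulls in spaCy) is an assumption.

```python
# Sketch only: calls the new static helper on the docstring examples.
# Assumed import path, per the RECORD listing for dmo/spacy_doc_segmenter.py.
from fast_sentence_segment.dmo.spacy_doc_segmenter import SpacyDocSegmenter

# Pass 1: a "sentence" that is only an opening quote merges into the next one
print(SpacyDocSegmenter._merge_orphaned_quotes(["'", "Oh, the funeral..."]))
# -> ["'Oh, the funeral..."]

# Pass 2: a trailing orphaned quote after . ? or ! moves to the next sentence
print(SpacyDocSegmenter._merge_orphaned_quotes(["He said. '", "Hello!'"]))
# -> ['He said.', "'Hello!'"]
```
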
@@ -1,9 +1,9 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: fast-sentence-segment
-Version: 1.4.2
+Version: 1.4.4
 Summary: Fast and Efficient Sentence Segmentation
-Home-page: https://github.com/craigtrim/fast-sentence-segment
 License: MIT
+License-File: LICENSE
 Keywords: nlp,text,preprocess,segment
 Author: Craig Trim
 Author-email: craigtrim@gmail.com
@@ -1,7 +1,7 @@
 fast_sentence_segment/__init__.py,sha256=jeb4yCy89ivyqbo-4ldJLquPAG_XR_33Q7nrDjqPxvE,1465
 fast_sentence_segment/bp/__init__.py,sha256=j2-WfQ9WwVuXeGSjvV6XLVwEdvau8sdAQe4Pa4DrYi8,33
 fast_sentence_segment/bp/segmenter.py,sha256=UW6DguPgA56h-pPYRsfJhjIzBe40j6NdjkwYxamASyA,1928
-fast_sentence_segment/cli.py,sha256=I5tLOnojPJLc-S3VHwQdSFON9DcuTjilwFRfwEpVKag,4866
+fast_sentence_segment/cli.py,sha256=vr1Gh-pq4bIPcnhUF6c7ckGdEfoyrI_r0XcrJrIfjEA,5640
 fast_sentence_segment/core/__init__.py,sha256=uoBersYyVStJ5a8zJpQz1GDGaloEdAv2jGHw1292hRM,108
 fast_sentence_segment/core/base_object.py,sha256=AYr7yzusIwawjbKdvcv4yTEnhmx6M583kDZzhzPOmq4,635
 fast_sentence_segment/core/stopwatch.py,sha256=hE6hMz2q6rduaKi58KZmiAL-lRtyh_wWCANhl4KLkRQ,879
@@ -18,15 +18,15 @@ fast_sentence_segment/dmo/normalize_quotes.py,sha256=mr53qo_tj_I9XzElOKjUQvCtDQh
 fast_sentence_segment/dmo/numbered_list_normalizer.py,sha256=q0sOCW8Jkn2vTXlUcVhmDvYES3yvJx1oUVl_8y7eL4E,1672
 fast_sentence_segment/dmo/post_process_sentences.py,sha256=5jxG3TmFjxIExMPLhnCB5JT1lXQvFU9r4qQGoATGrWk,916
 fast_sentence_segment/dmo/question_exclamation_splitter.py,sha256=cRsWRu8zb6wOWG-BjMahHfz4YGutKiV9lW7dE-q3tgc,2006
-fast_sentence_segment/dmo/spacy_doc_segmenter.py,sha256=_oTsrIL2rjysjt_8bPJVNTn230pUtL-geCC8g174iC4,3163
+fast_sentence_segment/dmo/spacy_doc_segmenter.py,sha256=Kb65TYMhrbpTYEey5vb7TyhCjUHVxmugHYIeKkntCwk,5147
 fast_sentence_segment/dmo/strip_trailing_period_after_quote.py,sha256=wYkoLy5XJKZIblJXBvDAB8-a81UTQOhOf2u91wjJWUw,2259
 fast_sentence_segment/dmo/title_name_merger.py,sha256=zbG04_VjwM8TtT8LhavvmZqIZL_2xgT2OTxWkK_Zt1s,5133
 fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWvv6rb2mny8pnVd428oVZL0n5I,2457
 fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
 fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
 fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
-fast_sentence_segment-1.4.2.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
-fast_sentence_segment-1.4.2.dist-info/METADATA,sha256=mUMWyQu_ec1Ugni5zFIHjUbMm4FYCypA1ws_NFhoZhM,7987
-fast_sentence_segment-1.4.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-fast_sentence_segment-1.4.2.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
-fast_sentence_segment-1.4.2.dist-info/RECORD,,
+fast_sentence_segment-1.4.4.dist-info/METADATA,sha256=8EZrIvdE8CWTW64_HqkMH8eF6CzXs6UDdaFjcce7LTA,7947
+fast_sentence_segment-1.4.4.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
+fast_sentence_segment-1.4.4.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
+fast_sentence_segment-1.4.4.dist-info/licenses/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
+fast_sentence_segment-1.4.4.dist-info/RECORD,,
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.9.0
+Generator: poetry-core 2.3.1
 Root-Is-Purelib: true
 Tag: py3-none-any