PyPI - fast-sentence-segment - Versions diffs - 1.4.2__py3-none-any.whl → 1.4.4__py3-none-any.whl - Mend

fast-sentence-segment 1.4.2__py3-none-any.whl → 1.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

fast_sentence_segment/cli.py CHANGED Viewed

@@ -143,34 +143,50 @@ def file_main():
     )
     args = parser.parse_args()
+    # Echo command immediately
     _header("segment-file")
+    print(f"  {DIM}Segmenting text file into sentences{RESET}")
+    print()
+    # Show configuration
     _param("Input", args.input_file)
     _param("Output", args.output_file)
     _param("Size", _file_size(args.input_file))
     _param("Unwrap", "enabled" if args.unwrap else "disabled")
     _param("Normalize quotes", "disabled" if args.no_normalize_quotes else "enabled")
+    print()
+    # Step 1: Read file
+    print(f"  {YELLOW}→{RESET} Reading input file...")
     with open(args.input_file, "r", encoding="utf-8") as f:
         text = f.read()
+    print(f"  {GREEN}✓{RESET} Read {len(text):,} characters")
+    # Step 2: Segment text
+    print(f"  {YELLOW}→{RESET} Segmenting text...", end="", flush=True)
     start = time.perf_counter()
     normalize = not args.no_normalize_quotes
-    with Spinner("Segmenting text..."):
-        sentences = segment_text(
-            text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
-        )
+    sentences = segment_text(
+        text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
+    )
     elapsed = time.perf_counter() - start
-    with Spinner("Writing output..."):
-        with open(args.output_file, "w", encoding="utf-8") as f:
-            if args.unwrap:
-                f.write(format_grouped_sentences(sentences) + "\n")
-            else:
-                for sentence in sentences:
-                    f.write(sentence + "\n")
-    _done(f"{len(sentences):,} sentences in {elapsed:.2f}s")
-    _done(f"Written to {args.output_file}")
+    print(f"\r  {GREEN}✓{RESET} Segmented into {len(sentences):,} sentences ({elapsed:.2f}s)")
+    # Step 3: Write output
+    total = len(sentences)
+    with open(args.output_file, "w", encoding="utf-8") as f:
+        if args.unwrap:
+            f.write(format_grouped_sentences(sentences) + "\n")
+            print(f"  {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file}")
+        else:
+            for i, sentence in enumerate(sentences, 1):
+                f.write(sentence + "\n")
+                if i % 500 == 0 or i == total:
+                    pct = (i / total) * 100
+                    print(f"\r  {YELLOW}→{RESET} Writing... {pct:.0f}% ({i:,}/{total:,})", end="", flush=True)
+            print(f"\r  {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file}       ")
+    print(f"\n  {GREEN}Done!{RESET}")
     print()

fast_sentence_segment/dmo/spacy_doc_segmenter.py CHANGED Viewed

@@ -65,6 +65,54 @@ class SpacyDocSegmenter(BaseObject):
             return False
         return True
+    @staticmethod
+    def _merge_orphaned_quotes(sentences: list) -> list:
+        """Merge orphaned opening quotes with the following sentence.
+        spaCy sometimes splits on opening quotes, producing sentences like:
+            ["'", "Oh, the funeral..."]
+        This merges them into:
+            ["'Oh, the funeral..."]
+        Also handles trailing orphaned quotes that should belong to next sentence:
+            ["He said. '", "Hello!'"]
+        Becomes:
+            ["He said.", "'Hello!'"]
+        """
+        if not sentences:
+            return sentences
+        result = []
+        i = 0
+        while i < len(sentences):
+            sent = sentences[i]
+            # Check if this sentence is just an opening quote
+            if sent.strip() in ("'", '"', "'.", '".'):
+                # Merge with the next sentence if available
+                if i + 1 < len(sentences):
+                    quote_char = sent.strip().rstrip('.')
+                    result.append(quote_char + sentences[i + 1])
+                    i += 2
+                    continue
+            result.append(sent)
+            i += 1
+        # Second pass: handle trailing orphaned quotes
+        # Pattern: sentence ends with `. '` or `. "` - move quote to next sentence
+        fixed = []
+        for i, sent in enumerate(result):
+            # Check for trailing orphaned quote (`. '` or `? '` or `! '`)
+            if len(sent) >= 3 and sent[-2:] in (" '", ' "') and sent[-3] in '.?!':
+                # Strip the trailing quote
+                trailing_quote = sent[-1]
+                sent = sent[:-2]
+                # Prepend to next sentence if available
+                if i + 1 < len(result) and not result[i + 1].startswith(('"', "'")):
+                    result[i + 1] = trailing_quote + result[i + 1]
+            fixed.append(sent)
+        return fixed
     @staticmethod
     def _cleanse(sentences: list) -> str:
         sentences = [sent for sent in sentences
@@ -103,6 +151,9 @@ class SpacyDocSegmenter(BaseObject):
         sentences = [sent for sent in sentences if
                      sent and len(sent) and sent != 'None']
+        # Merge orphaned opening quotes with following sentence
+        sentences = self._merge_orphaned_quotes(sentences)
         sentences = [self._append_period(sent)
                      for sent in sentences]

{fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/METADATA RENAMED Viewed

@@ -1,9 +1,9 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: fast-sentence-segment
-Version: 1.4.2
+Version: 1.4.4
 Summary: Fast and Efficient Sentence Segmentation
-Home-page: https://github.com/craigtrim/fast-sentence-segment
 License: MIT
+License-File: LICENSE
 Keywords: nlp,text,preprocess,segment
 Author: Craig Trim
 Author-email: craigtrim@gmail.com

{fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 fast_sentence_segment/__init__.py,sha256=jeb4yCy89ivyqbo-4ldJLquPAG_XR_33Q7nrDjqPxvE,1465
 fast_sentence_segment/bp/__init__.py,sha256=j2-WfQ9WwVuXeGSjvV6XLVwEdvau8sdAQe4Pa4DrYi8,33
 fast_sentence_segment/bp/segmenter.py,sha256=UW6DguPgA56h-pPYRsfJhjIzBe40j6NdjkwYxamASyA,1928
-fast_sentence_segment/cli.py,sha256=I5tLOnojPJLc-S3VHwQdSFON9DcuTjilwFRfwEpVKag,4866
+fast_sentence_segment/cli.py,sha256=vr1Gh-pq4bIPcnhUF6c7ckGdEfoyrI_r0XcrJrIfjEA,5640
 fast_sentence_segment/core/__init__.py,sha256=uoBersYyVStJ5a8zJpQz1GDGaloEdAv2jGHw1292hRM,108
 fast_sentence_segment/core/base_object.py,sha256=AYr7yzusIwawjbKdvcv4yTEnhmx6M583kDZzhzPOmq4,635
 fast_sentence_segment/core/stopwatch.py,sha256=hE6hMz2q6rduaKi58KZmiAL-lRtyh_wWCANhl4KLkRQ,879
@@ -18,15 +18,15 @@ fast_sentence_segment/dmo/normalize_quotes.py,sha256=mr53qo_tj_I9XzElOKjUQvCtDQh
 fast_sentence_segment/dmo/numbered_list_normalizer.py,sha256=q0sOCW8Jkn2vTXlUcVhmDvYES3yvJx1oUVl_8y7eL4E,1672
 fast_sentence_segment/dmo/post_process_sentences.py,sha256=5jxG3TmFjxIExMPLhnCB5JT1lXQvFU9r4qQGoATGrWk,916
 fast_sentence_segment/dmo/question_exclamation_splitter.py,sha256=cRsWRu8zb6wOWG-BjMahHfz4YGutKiV9lW7dE-q3tgc,2006
-fast_sentence_segment/dmo/spacy_doc_segmenter.py,sha256=_oTsrIL2rjysjt_8bPJVNTn230pUtL-geCC8g174iC4,3163
+fast_sentence_segment/dmo/spacy_doc_segmenter.py,sha256=Kb65TYMhrbpTYEey5vb7TyhCjUHVxmugHYIeKkntCwk,5147
 fast_sentence_segment/dmo/strip_trailing_period_after_quote.py,sha256=wYkoLy5XJKZIblJXBvDAB8-a81UTQOhOf2u91wjJWUw,2259
 fast_sentence_segment/dmo/title_name_merger.py,sha256=zbG04_VjwM8TtT8LhavvmZqIZL_2xgT2OTxWkK_Zt1s,5133
 fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWvv6rb2mny8pnVd428oVZL0n5I,2457
 fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
 fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
 fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
-fast_sentence_segment-1.4.2.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
-fast_sentence_segment-1.4.2.dist-info/METADATA,sha256=mUMWyQu_ec1Ugni5zFIHjUbMm4FYCypA1ws_NFhoZhM,7987
-fast_sentence_segment-1.4.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-fast_sentence_segment-1.4.2.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
-fast_sentence_segment-1.4.2.dist-info/RECORD,,
+fast_sentence_segment-1.4.4.dist-info/METADATA,sha256=8EZrIvdE8CWTW64_HqkMH8eF6CzXs6UDdaFjcce7LTA,7947
+fast_sentence_segment-1.4.4.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
+fast_sentence_segment-1.4.4.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
+fast_sentence_segment-1.4.4.dist-info/licenses/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
+fast_sentence_segment-1.4.4.dist-info/RECORD,,

{fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.9.0
+Generator: poetry-core 2.3.1
 Root-Is-Purelib: true
 Tag: py3-none-any

{fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info/licenses}/LICENSE RENAMED Viewed

File without changes

fast-sentence-segment 1.4.2__py3-none-any.whl → 1.4.4__py3-none-any.whl

fast-sentence-segment 1.4.2__py3-none-any.whl → 1.4.4__py3-none-any.whl