PyPI - fast-sentence-segment - Versions diffs - 1.4.4__tar.gz → 1.5.3__tar.gz - Mend

fast-sentence-segment 1.4.4__tar.gz → 1.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

{fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/PKG-INFO RENAMED Viewed

@@ -1,9 +1,9 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.1
 Name: fast-sentence-segment
-Version: 1.4.4
+Version: 1.5.3
 Summary: Fast and Efficient Sentence Segmentation
+Home-page: https://github.com/craigtrim/fast-sentence-segment
 License: MIT
-License-File: LICENSE
 Keywords: nlp,text,preprocess,segment
 Author: Craig Trim
 Author-email: craigtrim@gmail.com
@@ -33,7 +33,6 @@ Description-Content-Type: text/markdown
 [![PyPI version](https://img.shields.io/pypi/v/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
 [![Python versions](https://img.shields.io/pypi/pyversions/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
-[![CI](https://img.shields.io/github/actions/workflow/status/craigtrim/fast-sentence-segment/ci.yml?branch=master&label=CI)](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
 [![Tests](https://img.shields.io/badge/tests-664-brightgreen)](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
@@ -165,21 +164,24 @@ segment-file --input-file book.txt --output-file sentences.txt
 # Unwrap hard-wrapped e-texts (Project Gutenberg, etc.)
 segment-file --input-file book.txt --output-file sentences.txt --unwrap
+# Dialog-aware formatting (implies --unwrap)
+segment -f book.txt --format
 ```
 ## API Reference
 | Function | Parameters | Returns | Description |
 |----------|------------|---------|-------------|
-| `segment_text()` | `input_text: str`, `flatten: bool = False`, `unwrap: bool = False` | `list` | Main entry point for segmentation |
+| `segment_text()` | `input_text: str`, `flatten: bool = False`, `unwrap: bool = False`, `format: str = None` | `list` or `str` | Main entry point for segmentation. Use `format="dialog"` for dialog-aware output. |
 | `Segmenter.input_text()` | `input_text: str` | `list[list[str]]` | Cached paragraph-aware segmentation |
 ### CLI Commands
 | Command | Description |
 |---------|-------------|
-| `segment [text]` | Segment text from argument, `-f FILE`, or stdin. Use `-n` for numbered output. |
-| `segment-file --input-file IN --output-file OUT [--unwrap]` | Segment a file and write one sentence per line. Use `--unwrap` for hard-wrapped e-texts. |
+| `segment [text]` | Segment text from argument, `-f FILE`, or stdin. Use `-n` for numbered output, `--format` for dialog-aware paragraph grouping. |
+| `segment-file --input-file IN --output-file OUT [--unwrap] [--format]` | Segment a file and write one sentence per line. Use `--unwrap` for hard-wrapped e-texts, `--format` for dialog-aware formatting. |
 ## Why Nested Lists?

{fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/README.md RENAMED Viewed

@@ -2,7 +2,6 @@
 [![PyPI version](https://img.shields.io/pypi/v/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
 [![Python versions](https://img.shields.io/pypi/pyversions/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
-[![CI](https://img.shields.io/github/actions/workflow/status/craigtrim/fast-sentence-segment/ci.yml?branch=master&label=CI)](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
 [![Tests](https://img.shields.io/badge/tests-664-brightgreen)](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
@@ -134,21 +133,24 @@ segment-file --input-file book.txt --output-file sentences.txt
 # Unwrap hard-wrapped e-texts (Project Gutenberg, etc.)
 segment-file --input-file book.txt --output-file sentences.txt --unwrap
+# Dialog-aware formatting (implies --unwrap)
+segment -f book.txt --format
 ```
 ## API Reference
 | Function | Parameters | Returns | Description |
 |----------|------------|---------|-------------|
-| `segment_text()` | `input_text: str`, `flatten: bool = False`, `unwrap: bool = False` | `list` | Main entry point for segmentation |
+| `segment_text()` | `input_text: str`, `flatten: bool = False`, `unwrap: bool = False`, `format: str = None` | `list` or `str` | Main entry point for segmentation. Use `format="dialog"` for dialog-aware output. |
 | `Segmenter.input_text()` | `input_text: str` | `list[list[str]]` | Cached paragraph-aware segmentation |
 ### CLI Commands
 | Command | Description |
 |---------|-------------|
-| `segment [text]` | Segment text from argument, `-f FILE`, or stdin. Use `-n` for numbered output. |
-| `segment-file --input-file IN --output-file OUT [--unwrap]` | Segment a file and write one sentence per line. Use `--unwrap` for hard-wrapped e-texts. |
+| `segment [text]` | Segment text from argument, `-f FILE`, or stdin. Use `-n` for numbered output, `--format` for dialog-aware paragraph grouping. |
+| `segment-file --input-file IN --output-file OUT [--unwrap] [--format]` | Segment a file and write one sentence per line. Use `--unwrap` for hard-wrapped e-texts, `--format` for dialog-aware formatting. |
 ## Why Nested Lists?

{fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/__init__.py RENAMED Viewed

@@ -1,3 +1,5 @@
+from typing import List, Optional, Union
 from .bp import *
 from .svc import *
 from .dmo import *
@@ -5,6 +7,7 @@ from .dmo import *
 from .bp.segmenter import Segmenter
 from .dmo.unwrap_hard_wrapped_text import unwrap_hard_wrapped_text
 from .dmo.normalize_quotes import normalize_quotes
+from .dmo.dialog_formatter import format_dialog
 segment = Segmenter().input_text
@@ -14,7 +17,8 @@ def segment_text(
     flatten: bool = False,
     unwrap: bool = False,
     normalize: bool = True,
-) -> list:
+    format: Optional[str] = None,
+) -> Union[List, str]:
     """Segment text into sentences.
     Args:
@@ -26,14 +30,23 @@ def segment_text(
         normalize: If True (default), normalize unicode quote variants
             to ASCII equivalents before segmenting. Ensures consistent
             quote characters for downstream processing.
+        format: Optional output format. Supported values:
+            - None (default): Return list of sentences/paragraphs
+            - "dialog": Return formatted string with dialog-aware
+              paragraph grouping (keeps multi-sentence quotes together,
+              adds paragraph breaks between speakers)
     Returns:
-        List of sentences (if flatten=True) or list of paragraph
-        groups, each containing a list of sentences.
+        If format is None: List of sentences (if flatten=True) or list
+        of paragraph groups, each containing a list of sentences.
+        If format="dialog": Formatted string with paragraph breaks.
-    Related GitHub Issue:
+    Related GitHub Issues:
         #6 - Review findings from Issue #5
         https://github.com/craigtrim/fast-sentence-segment/issues/6
+        #10 - feat: Add --format flag for dialog-aware paragraph formatting
+        https://github.com/craigtrim/fast-sentence-segment/issues/10
     """
     if unwrap:
         input_text = unwrap_hard_wrapped_text(input_text)
@@ -43,9 +56,15 @@ def segment_text(
     results = segment(input_text)
+    # Flatten to list of sentences
+    flat = []
+    [[flat.append(y) for y in x] for x in results]
+    # Apply formatting if requested
+    if format == "dialog":
+        return format_dialog(flat)
     if flatten:
-        flat = []
-        [[flat.append(y) for y in x] for x in results]
         return flat
     return results

fast_sentence_segment-1.5.3/fast_sentence_segment/cli.py ADDED Viewed

@@ -0,0 +1,306 @@
+# -*- coding: UTF-8 -*-
+"""CLI for fast-sentence-segment."""
+import argparse
+import itertools
+import logging
+import os
+import sys
+import threading
+import time
+from fast_sentence_segment import segment_text
+from fast_sentence_segment.dmo.group_quoted_sentences import format_grouped_sentences
+logging.disable(logging.CRITICAL)
+# ANSI color codes
+BOLD = "\033[1m"
+DIM = "\033[2m"
+CYAN = "\033[36m"
+GREEN = "\033[32m"
+YELLOW = "\033[33m"
+RESET = "\033[0m"
+class Spinner:
+    """Animated spinner for long-running operations."""
+    def __init__(self, message: str):
+        self.message = message
+        self.running = False
+        self.thread = None
+        self.frames = itertools.cycle(["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"])
+    def _spin(self):
+        while self.running:
+            frame = next(self.frames)
+            print(f"\r  {YELLOW}{frame}{RESET} {self.message}", end="", flush=True)
+            time.sleep(0.08)
+    def __enter__(self):
+        self.running = True
+        self.thread = threading.Thread(target=self._spin)
+        self.thread.start()
+        return self
+    def __exit__(self, *args):
+        self.running = False
+        if self.thread:
+            self.thread.join()
+        print(f"\r  {' ' * (len(self.message) + 4)}\r", end="", flush=True)
+def _header(title: str):
+    print(f"\n{BOLD}{CYAN}{title}{RESET}")
+    print(f"{DIM}{'─' * 40}{RESET}")
+def _param(label: str, value: str):
+    print(f"  {DIM}{label}:{RESET} {value}")
+def _done(msg: str):
+    print(f"\n  {GREEN}✓{RESET} {msg}")
+def _file_size(path: str) -> str:
+    size = os.path.getsize(path)
+    if size < 1024:
+        return f"{size} B"
+    elif size < 1024 * 1024:
+        return f"{size / 1024:.1f} KB"
+    return f"{size / (1024 * 1024):.1f} MB"
+def main():
+    parser = argparse.ArgumentParser(
+        prog="segment",
+        description="Segment text into sentences",
+    )
+    parser.add_argument(
+        "text",
+        nargs="?",
+        help="Text to segment (or use stdin)",
+    )
+    parser.add_argument(
+        "-f", "--file",
+        help="Read text from file",
+    )
+    parser.add_argument(
+        "-n", "--numbered",
+        action="store_true",
+        help="Number output lines",
+    )
+    parser.add_argument(
+        "--unwrap",
+        action="store_true",
+        help="Unwrap hard-wrapped lines and dehyphenate split words",
+    )
+    parser.add_argument(
+        "--format",
+        action="store_true",
+        help="Format output with dialog-aware paragraph grouping (implies --unwrap)",
+    )
+    args = parser.parse_args()
+    # --format implies --unwrap
+    unwrap = args.unwrap or args.format
+    # Get input text
+    if args.file:
+        with open(args.file, "r", encoding="utf-8") as f:
+            text = f.read()
+    elif args.text:
+        text = args.text
+    elif not sys.stdin.isatty():
+        text = sys.stdin.read()
+    else:
+        parser.print_help()
+        sys.exit(1)
+    # Segment and output
+    result = segment_text(
+        text.strip(), flatten=True, unwrap=unwrap,
+        format="dialog" if args.format else None
+    )
+    # If format is used, result is a string
+    if args.format:
+        print(result)
+    else:
+        # Result is a list of sentences
+        for i, sentence in enumerate(result, 1):
+            if args.numbered:
+                print(f"{i}. {sentence}")
+            else:
+                print(sentence)
+def _generate_output_path(input_path: str) -> str:
+    """Generate output path by inserting -clean before extension."""
+    base, ext = os.path.splitext(input_path)
+    return f"{base}-clean{ext}"
+def _process_single_file(
+    input_file: str, output_file: str, unwrap: bool, normalize: bool, format: str = None
+):
+    """Process a single file and write output."""
+    # Show configuration
+    _param("Input", input_file)
+    _param("Output", output_file)
+    _param("Size", _file_size(input_file))
+    _param("Unwrap", "enabled" if unwrap else "disabled")
+    _param("Normalize quotes", "disabled" if not normalize else "enabled")
+    _param("Format", format if format else "default (one sentence per line)")
+    print()
+    # Step 1: Read file
+    print(f"  {YELLOW}→{RESET} Reading input file...")
+    with open(input_file, "r", encoding="utf-8") as f:
+        text = f.read()
+    print(f"  {GREEN}✓{RESET} Read {len(text):,} characters")
+    # Step 2: Segment text
+    print(f"  {YELLOW}→{RESET} Segmenting text...", end="", flush=True)
+    start = time.perf_counter()
+    result = segment_text(
+        text.strip(), flatten=True, unwrap=unwrap, normalize=normalize, format=format,
+    )
+    elapsed = time.perf_counter() - start
+    # Step 3: Write output
+    if format:
+        # Format mode returns a string
+        print(f"\r  {GREEN}✓{RESET} Segmented text ({elapsed:.2f}s)")
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(result + "\n")
+        print(f"  {GREEN}✓{RESET} Written formatted output to {output_file}")
+    else:
+        # Default mode returns a list
+        sentences = result
+        print(f"\r  {GREEN}✓{RESET} Segmented into {len(sentences):,} sentences ({elapsed:.2f}s)")
+        total = len(sentences)
+        with open(output_file, "w", encoding="utf-8") as f:
+            if unwrap:
+                f.write(format_grouped_sentences(sentences) + "\n")
+                print(f"  {GREEN}✓{RESET} Written {total:,} sentences to {output_file}")
+            else:
+                for i, sentence in enumerate(sentences, 1):
+                    f.write(sentence + "\n")
+                    if i % 500 == 0 or i == total:
+                        pct = (i / total) * 100
+                        print(f"\r  {YELLOW}→{RESET} Writing... {pct:.0f}% ({i:,}/{total:,})", end="", flush=True)
+                print(f"\r  {GREEN}✓{RESET} Written {total:,} sentences to {output_file}       ")
+def file_main():
+    parser = argparse.ArgumentParser(
+        prog="segment-file",
+        description="Segment a text file into sentences and write to an output file",
+    )
+    parser.add_argument(
+        "--input-file",
+        help="Path to input text file",
+    )
+    parser.add_argument(
+        "--input-dir",
+        help="Path to directory containing text files to process",
+    )
+    parser.add_argument(
+        "--output-file",
+        help="Path to output file (optional, defaults to input-file with -clean suffix)",
+    )
+    parser.add_argument(
+        "--unwrap", action="store_true",
+        help="Unwrap hard-wrapped lines (e.g., Project Gutenberg e-texts)",
+    )
+    parser.add_argument(
+        "--no-normalize-quotes", action="store_true",
+        help="Disable unicode quote normalization to ASCII equivalents",
+    )
+    parser.add_argument(
+        "--format",
+        action="store_true",
+        help="Format output with dialog-aware paragraph grouping (implies --unwrap)",
+    )
+    args = parser.parse_args()
+    # --format implies --unwrap
+    unwrap = args.unwrap or args.format
+    # Validate arguments
+    if not args.input_file and not args.input_dir:
+        print(f"  {YELLOW}Error:{RESET} Either --input-file or --input-dir is required")
+        sys.exit(1)
+    if args.input_file and args.input_dir:
+        print(f"  {YELLOW}Error:{RESET} Cannot specify both --input-file and --input-dir")
+        sys.exit(1)
+    if args.input_dir and args.output_file:
+        print(f"  {YELLOW}Error:{RESET} --output-file cannot be used with --input-dir")
+        sys.exit(1)
+    normalize = not args.no_normalize_quotes
+    # Process directory
+    if args.input_dir:
+        input_dir = os.path.expanduser(args.input_dir)
+        if not os.path.isdir(input_dir):
+            print(f"  {YELLOW}Error:{RESET} Directory not found: {input_dir}")
+            sys.exit(1)
+        # Find all .txt files
+        txt_files = sorted([
+            f for f in os.listdir(input_dir)
+            if f.endswith(".txt") and not f.endswith("-clean.txt")
+        ])
+        if not txt_files:
+            print(f"  {YELLOW}Error:{RESET} No .txt files found in {input_dir}")
+            sys.exit(1)
+        _header("segment-file (batch)")
+        print(f"  {DIM}Processing {len(txt_files)} files in directory{RESET}")
+        print()
+        _param("Directory", input_dir)
+        _param("Files", str(len(txt_files)))
+        _param("Unwrap", "enabled" if unwrap else "disabled")
+        _param("Normalize quotes", "disabled" if not normalize else "enabled")
+        _param("Format", "dialog" if args.format else "default (one sentence per line)")
+        print()
+        format_value = "dialog" if args.format else None
+        for i, filename in enumerate(txt_files, 1):
+            input_path = os.path.join(input_dir, filename)
+            output_path = _generate_output_path(input_path)
+            print(f"  {BOLD}[{i}/{len(txt_files)}]{RESET} {filename}")
+            _process_single_file(input_path, output_path, unwrap, normalize, format_value)
+            print()
+        print(f"  {GREEN}Done! Processed {len(txt_files)} files.{RESET}")
+        print()
+        return
+    # Process single file
+    input_file = os.path.expanduser(args.input_file)
+    if not os.path.isfile(input_file):
+        print(f"  {YELLOW}Error:{RESET} File not found: {input_file}")
+        sys.exit(1)
+    output_file = args.output_file or _generate_output_path(input_file)
+    output_file = os.path.expanduser(output_file)
+    _header("segment-file")
+    print(f"  {DIM}Segmenting text file into sentences{RESET}")
+    print()
+    format_value = "dialog" if args.format else None
+    _process_single_file(input_file, output_file, unwrap, normalize, format_value)
+    print(f"\n  {GREEN}Done!{RESET}")
+    print()
+if __name__ == "__main__":
+    main()

{fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/__init__.py RENAMED Viewed

@@ -13,3 +13,5 @@ from .unwrap_hard_wrapped_text import unwrap_hard_wrapped_text
 from .normalize_quotes import normalize_quotes
 from .group_quoted_sentences import group_quoted_sentences, format_grouped_sentences
 from .strip_trailing_period_after_quote import StripTrailingPeriodAfterQuote
+from .ocr_artifact_fixer import OcrArtifactFixer
+from .dialog_formatter import DialogFormatter, format_dialog

fast-sentence-segment 1.4.4__tar.gz → 1.5.3__tar.gz

fast-sentence-segment 1.4.4__tar.gz → 1.5.3__tar.gz