fast-sentence-segment 1.4.4__tar.gz → 1.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/PKG-INFO +9 -7
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/README.md +6 -4
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/__init__.py +25 -6
- fast_sentence_segment-1.5.3/fast_sentence_segment/cli.py +306 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/__init__.py +2 -0
- fast_sentence_segment-1.5.3/fast_sentence_segment/dmo/dialog_formatter.py +371 -0
- fast_sentence_segment-1.5.3/fast_sentence_segment/dmo/ocr_artifact_fixer.py +70 -0
- fast_sentence_segment-1.5.3/fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py +319 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/svc/perform_sentence_segmentation.py +18 -12
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/pyproject.toml +1 -1
- fast_sentence_segment-1.5.3/setup.py +39 -0
- fast_sentence_segment-1.4.4/fast_sentence_segment/cli.py +0 -194
- fast_sentence_segment-1.4.4/fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py +0 -75
- fast_sentence_segment-1.4.4/setup.py +0 -39
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/LICENSE +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/bp/__init__.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/bp/segmenter.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/core/__init__.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/core/base_object.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/core/stopwatch.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/abbreviation_merger.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/abbreviation_splitter.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/abbreviations.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/bullet_point_cleaner.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/dehyphenator.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/ellipsis_normalizer.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/group_quoted_sentences.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/newlines_to_periods.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/normalize_quotes.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/numbered_list_normalizer.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/post_process_sentences.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/question_exclamation_splitter.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/spacy_doc_segmenter.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/strip_trailing_period_after_quote.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/title_name_merger.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/svc/__init__.py +0 -0
- {fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/svc/perform_paragraph_segmentation.py +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: fast-sentence-segment
|
|
3
|
-
Version: 1.4.4
|
|
3
|
+
Version: 1.5.3
|
|
4
4
|
Summary: Fast and Efficient Sentence Segmentation
|
|
5
|
+
Home-page: https://github.com/craigtrim/fast-sentence-segment
|
|
5
6
|
License: MIT
|
|
6
|
-
License-File: LICENSE
|
|
7
7
|
Keywords: nlp,text,preprocess,segment
|
|
8
8
|
Author: Craig Trim
|
|
9
9
|
Author-email: craigtrim@gmail.com
|
|
@@ -33,7 +33,6 @@ Description-Content-Type: text/markdown
|
|
|
33
33
|
|
|
34
34
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
35
35
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
36
|
-
[](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
|
|
37
36
|
[](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
|
|
38
37
|
[](https://opensource.org/licenses/MIT)
|
|
39
38
|
[](https://github.com/astral-sh/ruff)
|
|
@@ -165,21 +164,24 @@ segment-file --input-file book.txt --output-file sentences.txt
|
|
|
165
164
|
|
|
166
165
|
# Unwrap hard-wrapped e-texts (Project Gutenberg, etc.)
|
|
167
166
|
segment-file --input-file book.txt --output-file sentences.txt --unwrap
|
|
167
|
+
|
|
168
|
+
# Dialog-aware formatting (implies --unwrap)
|
|
169
|
+
segment -f book.txt --format
|
|
168
170
|
```
|
|
169
171
|
|
|
170
172
|
## API Reference
|
|
171
173
|
|
|
172
174
|
| Function | Parameters | Returns | Description |
|
|
173
175
|
|----------|------------|---------|-------------|
|
|
174
|
-
| `segment_text()` | `input_text: str`, `flatten: bool = False`, `unwrap: bool = False` | `list` | Main entry point for segmentation |
|
|
176
|
+
| `segment_text()` | `input_text: str`, `flatten: bool = False`, `unwrap: bool = False`, `format: str = None` | `list` or `str` | Main entry point for segmentation. Use `format="dialog"` for dialog-aware output. |
|
|
175
177
|
| `Segmenter.input_text()` | `input_text: str` | `list[list[str]]` | Cached paragraph-aware segmentation |
|
|
176
178
|
|
|
177
179
|
### CLI Commands
|
|
178
180
|
|
|
179
181
|
| Command | Description |
|
|
180
182
|
|---------|-------------|
|
|
181
|
-
| `segment [text]` | Segment text from argument, `-f FILE`, or stdin. Use `-n` for numbered output. |
|
|
182
|
-
| `segment-file --input-file IN --output-file OUT [--unwrap]` | Segment a file and write one sentence per line. Use `--unwrap` for hard-wrapped e-texts. |
|
|
183
|
+
| `segment [text]` | Segment text from argument, `-f FILE`, or stdin. Use `-n` for numbered output, `--format` for dialog-aware paragraph grouping. |
|
|
184
|
+
| `segment-file --input-file IN --output-file OUT [--unwrap] [--format]` | Segment a file and write one sentence per line. Use `--unwrap` for hard-wrapped e-texts, `--format` for dialog-aware formatting. |
|
|
183
185
|
|
|
184
186
|
## Why Nested Lists?
|
|
185
187
|
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
4
4
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
5
|
-
[](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
|
|
6
5
|
[](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
|
|
7
6
|
[](https://opensource.org/licenses/MIT)
|
|
8
7
|
[](https://github.com/astral-sh/ruff)
|
|
@@ -134,21 +133,24 @@ segment-file --input-file book.txt --output-file sentences.txt
|
|
|
134
133
|
|
|
135
134
|
# Unwrap hard-wrapped e-texts (Project Gutenberg, etc.)
|
|
136
135
|
segment-file --input-file book.txt --output-file sentences.txt --unwrap
|
|
136
|
+
|
|
137
|
+
# Dialog-aware formatting (implies --unwrap)
|
|
138
|
+
segment -f book.txt --format
|
|
137
139
|
```
|
|
138
140
|
|
|
139
141
|
## API Reference
|
|
140
142
|
|
|
141
143
|
| Function | Parameters | Returns | Description |
|
|
142
144
|
|----------|------------|---------|-------------|
|
|
143
|
-
| `segment_text()` | `input_text: str`, `flatten: bool = False`, `unwrap: bool = False` | `list` | Main entry point for segmentation |
|
|
145
|
+
| `segment_text()` | `input_text: str`, `flatten: bool = False`, `unwrap: bool = False`, `format: str = None` | `list` or `str` | Main entry point for segmentation. Use `format="dialog"` for dialog-aware output. |
|
|
144
146
|
| `Segmenter.input_text()` | `input_text: str` | `list[list[str]]` | Cached paragraph-aware segmentation |
|
|
145
147
|
|
|
146
148
|
### CLI Commands
|
|
147
149
|
|
|
148
150
|
| Command | Description |
|
|
149
151
|
|---------|-------------|
|
|
150
|
-
| `segment [text]` | Segment text from argument, `-f FILE`, or stdin. Use `-n` for numbered output. |
|
|
151
|
-
| `segment-file --input-file IN --output-file OUT [--unwrap]` | Segment a file and write one sentence per line. Use `--unwrap` for hard-wrapped e-texts. |
|
|
152
|
+
| `segment [text]` | Segment text from argument, `-f FILE`, or stdin. Use `-n` for numbered output, `--format` for dialog-aware paragraph grouping. |
|
|
153
|
+
| `segment-file --input-file IN --output-file OUT [--unwrap] [--format]` | Segment a file and write one sentence per line. Use `--unwrap` for hard-wrapped e-texts, `--format` for dialog-aware formatting. |
|
|
152
154
|
|
|
153
155
|
## Why Nested Lists?
|
|
154
156
|
|
{fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/__init__.py
RENAMED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from typing import List, Optional, Union
|
|
2
|
+
|
|
1
3
|
from .bp import *
|
|
2
4
|
from .svc import *
|
|
3
5
|
from .dmo import *
|
|
@@ -5,6 +7,7 @@ from .dmo import *
|
|
|
5
7
|
from .bp.segmenter import Segmenter
|
|
6
8
|
from .dmo.unwrap_hard_wrapped_text import unwrap_hard_wrapped_text
|
|
7
9
|
from .dmo.normalize_quotes import normalize_quotes
|
|
10
|
+
from .dmo.dialog_formatter import format_dialog
|
|
8
11
|
|
|
9
12
|
segment = Segmenter().input_text
|
|
10
13
|
|
|
@@ -14,7 +17,8 @@ def segment_text(
|
|
|
14
17
|
flatten: bool = False,
|
|
15
18
|
unwrap: bool = False,
|
|
16
19
|
normalize: bool = True,
|
|
17
|
-
|
|
20
|
+
format: Optional[str] = None,
|
|
21
|
+
) -> Union[List, str]:
|
|
18
22
|
"""Segment text into sentences.
|
|
19
23
|
|
|
20
24
|
Args:
|
|
@@ -26,14 +30,23 @@ def segment_text(
|
|
|
26
30
|
normalize: If True (default), normalize unicode quote variants
|
|
27
31
|
to ASCII equivalents before segmenting. Ensures consistent
|
|
28
32
|
quote characters for downstream processing.
|
|
33
|
+
format: Optional output format. Supported values:
|
|
34
|
+
- None (default): Return list of sentences/paragraphs
|
|
35
|
+
- "dialog": Return formatted string with dialog-aware
|
|
36
|
+
paragraph grouping (keeps multi-sentence quotes together,
|
|
37
|
+
adds paragraph breaks between speakers)
|
|
29
38
|
|
|
30
39
|
Returns:
|
|
31
|
-
List of sentences (if flatten=True) or list of paragraph
|
|
32
|
-
groups, each containing a list of sentences.
|
|
40
|
+
If format is None: List of sentences (if flatten=True) or list
|
|
41
|
+
of paragraph groups, each containing a list of sentences.
|
|
42
|
+
If format="dialog": Formatted string with paragraph breaks.
|
|
33
43
|
|
|
34
|
-
Related GitHub Issue:
|
|
44
|
+
Related GitHub Issues:
|
|
35
45
|
#6 - Review findings from Issue #5
|
|
36
46
|
https://github.com/craigtrim/fast-sentence-segment/issues/6
|
|
47
|
+
|
|
48
|
+
#10 - feat: Add --format flag for dialog-aware paragraph formatting
|
|
49
|
+
https://github.com/craigtrim/fast-sentence-segment/issues/10
|
|
37
50
|
"""
|
|
38
51
|
if unwrap:
|
|
39
52
|
input_text = unwrap_hard_wrapped_text(input_text)
|
|
@@ -43,9 +56,15 @@ def segment_text(
|
|
|
43
56
|
|
|
44
57
|
results = segment(input_text)
|
|
45
58
|
|
|
59
|
+
# Flatten to list of sentences
|
|
60
|
+
flat = []
|
|
61
|
+
[[flat.append(y) for y in x] for x in results]
|
|
62
|
+
|
|
63
|
+
# Apply formatting if requested
|
|
64
|
+
if format == "dialog":
|
|
65
|
+
return format_dialog(flat)
|
|
66
|
+
|
|
46
67
|
if flatten:
|
|
47
|
-
flat = []
|
|
48
|
-
[[flat.append(y) for y in x] for x in results]
|
|
49
68
|
return flat
|
|
50
69
|
|
|
51
70
|
return results
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
|
2
|
+
"""CLI for fast-sentence-segment."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import itertools
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
from fast_sentence_segment import segment_text
|
|
13
|
+
from fast_sentence_segment.dmo.group_quoted_sentences import format_grouped_sentences
|
|
14
|
+
|
|
15
|
+
logging.disable(logging.CRITICAL)
|
|
16
|
+
|
|
17
|
+
# ANSI color codes
|
|
18
|
+
BOLD = "\033[1m"
|
|
19
|
+
DIM = "\033[2m"
|
|
20
|
+
CYAN = "\033[36m"
|
|
21
|
+
GREEN = "\033[32m"
|
|
22
|
+
YELLOW = "\033[33m"
|
|
23
|
+
RESET = "\033[0m"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Spinner:
|
|
27
|
+
"""Animated spinner for long-running operations."""
|
|
28
|
+
|
|
29
|
+
def __init__(self, message: str):
|
|
30
|
+
self.message = message
|
|
31
|
+
self.running = False
|
|
32
|
+
self.thread = None
|
|
33
|
+
self.frames = itertools.cycle(["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"])
|
|
34
|
+
|
|
35
|
+
def _spin(self):
|
|
36
|
+
while self.running:
|
|
37
|
+
frame = next(self.frames)
|
|
38
|
+
print(f"\r {YELLOW}{frame}{RESET} {self.message}", end="", flush=True)
|
|
39
|
+
time.sleep(0.08)
|
|
40
|
+
|
|
41
|
+
def __enter__(self):
|
|
42
|
+
self.running = True
|
|
43
|
+
self.thread = threading.Thread(target=self._spin)
|
|
44
|
+
self.thread.start()
|
|
45
|
+
return self
|
|
46
|
+
|
|
47
|
+
def __exit__(self, *args):
|
|
48
|
+
self.running = False
|
|
49
|
+
if self.thread:
|
|
50
|
+
self.thread.join()
|
|
51
|
+
print(f"\r {' ' * (len(self.message) + 4)}\r", end="", flush=True)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _header(title: str):
|
|
55
|
+
print(f"\n{BOLD}{CYAN}{title}{RESET}")
|
|
56
|
+
print(f"{DIM}{'─' * 40}{RESET}")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _param(label: str, value: str):
|
|
60
|
+
print(f" {DIM}{label}:{RESET} {value}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _done(msg: str):
|
|
64
|
+
print(f"\n {GREEN}✓{RESET} {msg}")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _file_size(path: str) -> str:
|
|
68
|
+
size = os.path.getsize(path)
|
|
69
|
+
if size < 1024:
|
|
70
|
+
return f"{size} B"
|
|
71
|
+
elif size < 1024 * 1024:
|
|
72
|
+
return f"{size / 1024:.1f} KB"
|
|
73
|
+
return f"{size / (1024 * 1024):.1f} MB"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def main():
|
|
77
|
+
parser = argparse.ArgumentParser(
|
|
78
|
+
prog="segment",
|
|
79
|
+
description="Segment text into sentences",
|
|
80
|
+
)
|
|
81
|
+
parser.add_argument(
|
|
82
|
+
"text",
|
|
83
|
+
nargs="?",
|
|
84
|
+
help="Text to segment (or use stdin)",
|
|
85
|
+
)
|
|
86
|
+
parser.add_argument(
|
|
87
|
+
"-f", "--file",
|
|
88
|
+
help="Read text from file",
|
|
89
|
+
)
|
|
90
|
+
parser.add_argument(
|
|
91
|
+
"-n", "--numbered",
|
|
92
|
+
action="store_true",
|
|
93
|
+
help="Number output lines",
|
|
94
|
+
)
|
|
95
|
+
parser.add_argument(
|
|
96
|
+
"--unwrap",
|
|
97
|
+
action="store_true",
|
|
98
|
+
help="Unwrap hard-wrapped lines and dehyphenate split words",
|
|
99
|
+
)
|
|
100
|
+
parser.add_argument(
|
|
101
|
+
"--format",
|
|
102
|
+
action="store_true",
|
|
103
|
+
help="Format output with dialog-aware paragraph grouping (implies --unwrap)",
|
|
104
|
+
)
|
|
105
|
+
args = parser.parse_args()
|
|
106
|
+
|
|
107
|
+
# --format implies --unwrap
|
|
108
|
+
unwrap = args.unwrap or args.format
|
|
109
|
+
|
|
110
|
+
# Get input text
|
|
111
|
+
if args.file:
|
|
112
|
+
with open(args.file, "r", encoding="utf-8") as f:
|
|
113
|
+
text = f.read()
|
|
114
|
+
elif args.text:
|
|
115
|
+
text = args.text
|
|
116
|
+
elif not sys.stdin.isatty():
|
|
117
|
+
text = sys.stdin.read()
|
|
118
|
+
else:
|
|
119
|
+
parser.print_help()
|
|
120
|
+
sys.exit(1)
|
|
121
|
+
|
|
122
|
+
# Segment and output
|
|
123
|
+
result = segment_text(
|
|
124
|
+
text.strip(), flatten=True, unwrap=unwrap,
|
|
125
|
+
format="dialog" if args.format else None
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# If format is used, result is a string
|
|
129
|
+
if args.format:
|
|
130
|
+
print(result)
|
|
131
|
+
else:
|
|
132
|
+
# Result is a list of sentences
|
|
133
|
+
for i, sentence in enumerate(result, 1):
|
|
134
|
+
if args.numbered:
|
|
135
|
+
print(f"{i}. {sentence}")
|
|
136
|
+
else:
|
|
137
|
+
print(sentence)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _generate_output_path(input_path: str) -> str:
|
|
141
|
+
"""Generate output path by inserting -clean before extension."""
|
|
142
|
+
base, ext = os.path.splitext(input_path)
|
|
143
|
+
return f"{base}-clean{ext}"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _process_single_file(
|
|
147
|
+
input_file: str, output_file: str, unwrap: bool, normalize: bool, format: str = None
|
|
148
|
+
):
|
|
149
|
+
"""Process a single file and write output."""
|
|
150
|
+
# Show configuration
|
|
151
|
+
_param("Input", input_file)
|
|
152
|
+
_param("Output", output_file)
|
|
153
|
+
_param("Size", _file_size(input_file))
|
|
154
|
+
_param("Unwrap", "enabled" if unwrap else "disabled")
|
|
155
|
+
_param("Normalize quotes", "disabled" if not normalize else "enabled")
|
|
156
|
+
_param("Format", format if format else "default (one sentence per line)")
|
|
157
|
+
print()
|
|
158
|
+
|
|
159
|
+
# Step 1: Read file
|
|
160
|
+
print(f" {YELLOW}→{RESET} Reading input file...")
|
|
161
|
+
with open(input_file, "r", encoding="utf-8") as f:
|
|
162
|
+
text = f.read()
|
|
163
|
+
print(f" {GREEN}✓{RESET} Read {len(text):,} characters")
|
|
164
|
+
|
|
165
|
+
# Step 2: Segment text
|
|
166
|
+
print(f" {YELLOW}→{RESET} Segmenting text...", end="", flush=True)
|
|
167
|
+
start = time.perf_counter()
|
|
168
|
+
result = segment_text(
|
|
169
|
+
text.strip(), flatten=True, unwrap=unwrap, normalize=normalize, format=format,
|
|
170
|
+
)
|
|
171
|
+
elapsed = time.perf_counter() - start
|
|
172
|
+
|
|
173
|
+
# Step 3: Write output
|
|
174
|
+
if format:
|
|
175
|
+
# Format mode returns a string
|
|
176
|
+
print(f"\r {GREEN}✓{RESET} Segmented text ({elapsed:.2f}s)")
|
|
177
|
+
with open(output_file, "w", encoding="utf-8") as f:
|
|
178
|
+
f.write(result + "\n")
|
|
179
|
+
print(f" {GREEN}✓{RESET} Written formatted output to {output_file}")
|
|
180
|
+
else:
|
|
181
|
+
# Default mode returns a list
|
|
182
|
+
sentences = result
|
|
183
|
+
print(f"\r {GREEN}✓{RESET} Segmented into {len(sentences):,} sentences ({elapsed:.2f}s)")
|
|
184
|
+
total = len(sentences)
|
|
185
|
+
with open(output_file, "w", encoding="utf-8") as f:
|
|
186
|
+
if unwrap:
|
|
187
|
+
f.write(format_grouped_sentences(sentences) + "\n")
|
|
188
|
+
print(f" {GREEN}✓{RESET} Written {total:,} sentences to {output_file}")
|
|
189
|
+
else:
|
|
190
|
+
for i, sentence in enumerate(sentences, 1):
|
|
191
|
+
f.write(sentence + "\n")
|
|
192
|
+
if i % 500 == 0 or i == total:
|
|
193
|
+
pct = (i / total) * 100
|
|
194
|
+
print(f"\r {YELLOW}→{RESET} Writing... {pct:.0f}% ({i:,}/{total:,})", end="", flush=True)
|
|
195
|
+
print(f"\r {GREEN}✓{RESET} Written {total:,} sentences to {output_file} ")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def file_main():
|
|
199
|
+
parser = argparse.ArgumentParser(
|
|
200
|
+
prog="segment-file",
|
|
201
|
+
description="Segment a text file into sentences and write to an output file",
|
|
202
|
+
)
|
|
203
|
+
parser.add_argument(
|
|
204
|
+
"--input-file",
|
|
205
|
+
help="Path to input text file",
|
|
206
|
+
)
|
|
207
|
+
parser.add_argument(
|
|
208
|
+
"--input-dir",
|
|
209
|
+
help="Path to directory containing text files to process",
|
|
210
|
+
)
|
|
211
|
+
parser.add_argument(
|
|
212
|
+
"--output-file",
|
|
213
|
+
help="Path to output file (optional, defaults to input-file with -clean suffix)",
|
|
214
|
+
)
|
|
215
|
+
parser.add_argument(
|
|
216
|
+
"--unwrap", action="store_true",
|
|
217
|
+
help="Unwrap hard-wrapped lines (e.g., Project Gutenberg e-texts)",
|
|
218
|
+
)
|
|
219
|
+
parser.add_argument(
|
|
220
|
+
"--no-normalize-quotes", action="store_true",
|
|
221
|
+
help="Disable unicode quote normalization to ASCII equivalents",
|
|
222
|
+
)
|
|
223
|
+
parser.add_argument(
|
|
224
|
+
"--format",
|
|
225
|
+
action="store_true",
|
|
226
|
+
help="Format output with dialog-aware paragraph grouping (implies --unwrap)",
|
|
227
|
+
)
|
|
228
|
+
args = parser.parse_args()
|
|
229
|
+
|
|
230
|
+
# --format implies --unwrap
|
|
231
|
+
unwrap = args.unwrap or args.format
|
|
232
|
+
|
|
233
|
+
# Validate arguments
|
|
234
|
+
if not args.input_file and not args.input_dir:
|
|
235
|
+
print(f" {YELLOW}Error:{RESET} Either --input-file or --input-dir is required")
|
|
236
|
+
sys.exit(1)
|
|
237
|
+
if args.input_file and args.input_dir:
|
|
238
|
+
print(f" {YELLOW}Error:{RESET} Cannot specify both --input-file and --input-dir")
|
|
239
|
+
sys.exit(1)
|
|
240
|
+
if args.input_dir and args.output_file:
|
|
241
|
+
print(f" {YELLOW}Error:{RESET} --output-file cannot be used with --input-dir")
|
|
242
|
+
sys.exit(1)
|
|
243
|
+
|
|
244
|
+
normalize = not args.no_normalize_quotes
|
|
245
|
+
|
|
246
|
+
# Process directory
|
|
247
|
+
if args.input_dir:
|
|
248
|
+
input_dir = os.path.expanduser(args.input_dir)
|
|
249
|
+
if not os.path.isdir(input_dir):
|
|
250
|
+
print(f" {YELLOW}Error:{RESET} Directory not found: {input_dir}")
|
|
251
|
+
sys.exit(1)
|
|
252
|
+
|
|
253
|
+
# Find all .txt files
|
|
254
|
+
txt_files = sorted([
|
|
255
|
+
f for f in os.listdir(input_dir)
|
|
256
|
+
if f.endswith(".txt") and not f.endswith("-clean.txt")
|
|
257
|
+
])
|
|
258
|
+
|
|
259
|
+
if not txt_files:
|
|
260
|
+
print(f" {YELLOW}Error:{RESET} No .txt files found in {input_dir}")
|
|
261
|
+
sys.exit(1)
|
|
262
|
+
|
|
263
|
+
_header("segment-file (batch)")
|
|
264
|
+
print(f" {DIM}Processing {len(txt_files)} files in directory{RESET}")
|
|
265
|
+
print()
|
|
266
|
+
_param("Directory", input_dir)
|
|
267
|
+
_param("Files", str(len(txt_files)))
|
|
268
|
+
_param("Unwrap", "enabled" if unwrap else "disabled")
|
|
269
|
+
_param("Normalize quotes", "disabled" if not normalize else "enabled")
|
|
270
|
+
_param("Format", "dialog" if args.format else "default (one sentence per line)")
|
|
271
|
+
print()
|
|
272
|
+
|
|
273
|
+
format_value = "dialog" if args.format else None
|
|
274
|
+
for i, filename in enumerate(txt_files, 1):
|
|
275
|
+
input_path = os.path.join(input_dir, filename)
|
|
276
|
+
output_path = _generate_output_path(input_path)
|
|
277
|
+
print(f" {BOLD}[{i}/{len(txt_files)}]{RESET} {filename}")
|
|
278
|
+
_process_single_file(input_path, output_path, unwrap, normalize, format_value)
|
|
279
|
+
print()
|
|
280
|
+
|
|
281
|
+
print(f" {GREEN}Done! Processed {len(txt_files)} files.{RESET}")
|
|
282
|
+
print()
|
|
283
|
+
return
|
|
284
|
+
|
|
285
|
+
# Process single file
|
|
286
|
+
input_file = os.path.expanduser(args.input_file)
|
|
287
|
+
if not os.path.isfile(input_file):
|
|
288
|
+
print(f" {YELLOW}Error:{RESET} File not found: {input_file}")
|
|
289
|
+
sys.exit(1)
|
|
290
|
+
|
|
291
|
+
output_file = args.output_file or _generate_output_path(input_file)
|
|
292
|
+
output_file = os.path.expanduser(output_file)
|
|
293
|
+
|
|
294
|
+
_header("segment-file")
|
|
295
|
+
print(f" {DIM}Segmenting text file into sentences{RESET}")
|
|
296
|
+
print()
|
|
297
|
+
|
|
298
|
+
format_value = "dialog" if args.format else None
|
|
299
|
+
_process_single_file(input_file, output_file, unwrap, normalize, format_value)
|
|
300
|
+
|
|
301
|
+
print(f"\n {GREEN}Done!{RESET}")
|
|
302
|
+
print()
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
if __name__ == "__main__":
|
|
306
|
+
main()
|
{fast_sentence_segment-1.4.4 → fast_sentence_segment-1.5.3}/fast_sentence_segment/dmo/__init__.py
RENAMED
|
@@ -13,3 +13,5 @@ from .unwrap_hard_wrapped_text import unwrap_hard_wrapped_text
|
|
|
13
13
|
from .normalize_quotes import normalize_quotes
|
|
14
14
|
from .group_quoted_sentences import group_quoted_sentences, format_grouped_sentences
|
|
15
15
|
from .strip_trailing_period_after_quote import StripTrailingPeriodAfterQuote
|
|
16
|
+
from .ocr_artifact_fixer import OcrArtifactFixer
|
|
17
|
+
from .dialog_formatter import DialogFormatter, format_dialog
|