fast-sentence-segment 1.4.2__py3-none-any.whl → 1.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_sentence_segment/cli.py +31 -15
- fast_sentence_segment/dmo/spacy_doc_segmenter.py +51 -0
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/METADATA +3 -3
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/RECORD +7 -7
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/WHEEL +1 -1
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/entry_points.txt +0 -0
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info/licenses}/LICENSE +0 -0
fast_sentence_segment/cli.py
CHANGED
|
@@ -143,34 +143,50 @@ def file_main():
|
|
|
143
143
|
)
|
|
144
144
|
args = parser.parse_args()
|
|
145
145
|
|
|
146
|
+
# Echo command immediately
|
|
146
147
|
_header("segment-file")
|
|
148
|
+
print(f" {DIM}Segmenting text file into sentences{RESET}")
|
|
149
|
+
print()
|
|
150
|
+
|
|
151
|
+
# Show configuration
|
|
147
152
|
_param("Input", args.input_file)
|
|
148
153
|
_param("Output", args.output_file)
|
|
149
154
|
_param("Size", _file_size(args.input_file))
|
|
150
155
|
_param("Unwrap", "enabled" if args.unwrap else "disabled")
|
|
151
156
|
_param("Normalize quotes", "disabled" if args.no_normalize_quotes else "enabled")
|
|
157
|
+
print()
|
|
152
158
|
|
|
159
|
+
# Step 1: Read file
|
|
160
|
+
print(f" {YELLOW}→{RESET} Reading input file...")
|
|
153
161
|
with open(args.input_file, "r", encoding="utf-8") as f:
|
|
154
162
|
text = f.read()
|
|
163
|
+
print(f" {GREEN}✓{RESET} Read {len(text):,} characters")
|
|
155
164
|
|
|
165
|
+
# Step 2: Segment text
|
|
166
|
+
print(f" {YELLOW}→{RESET} Segmenting text...", end="", flush=True)
|
|
156
167
|
start = time.perf_counter()
|
|
157
168
|
normalize = not args.no_normalize_quotes
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
)
|
|
169
|
+
sentences = segment_text(
|
|
170
|
+
text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
|
|
171
|
+
)
|
|
162
172
|
elapsed = time.perf_counter() - start
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
173
|
+
print(f"\r {GREEN}✓{RESET} Segmented into {len(sentences):,} sentences ({elapsed:.2f}s)")
|
|
174
|
+
|
|
175
|
+
# Step 3: Write output
|
|
176
|
+
total = len(sentences)
|
|
177
|
+
with open(args.output_file, "w", encoding="utf-8") as f:
|
|
178
|
+
if args.unwrap:
|
|
179
|
+
f.write(format_grouped_sentences(sentences) + "\n")
|
|
180
|
+
print(f" {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file}")
|
|
181
|
+
else:
|
|
182
|
+
for i, sentence in enumerate(sentences, 1):
|
|
183
|
+
f.write(sentence + "\n")
|
|
184
|
+
if i % 500 == 0 or i == total:
|
|
185
|
+
pct = (i / total) * 100
|
|
186
|
+
print(f"\r {YELLOW}→{RESET} Writing... {pct:.0f}% ({i:,}/{total:,})", end="", flush=True)
|
|
187
|
+
print(f"\r {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file} ")
|
|
188
|
+
|
|
189
|
+
print(f"\n {GREEN}Done!{RESET}")
|
|
174
190
|
print()
|
|
175
191
|
|
|
176
192
|
|
|
@@ -65,6 +65,54 @@ class SpacyDocSegmenter(BaseObject):
|
|
|
65
65
|
return False
|
|
66
66
|
return True
|
|
67
67
|
|
|
68
|
+
@staticmethod
def _merge_orphaned_quotes(sentences: list) -> list:
    """Re-attach quote marks that were split away from their sentence.

    Two patterns are repaired, in this order:

    1. A "sentence" consisting solely of a quote mark, optionally followed
       by a period, is folded into the sentence after it:
           ["'", "Oh, the funeral..."]  ->  ["'Oh, the funeral..."]
    2. A quote mark left dangling after terminal punctuation and a space
       is moved to the front of the next sentence:
           ["He said. '", "Hello!'"]  ->  ["He said.", "'Hello!'"]
       If the next sentence already starts with a quote mark, the dangling
       one is dropped instead of being duplicated.

    A quote with no following sentence to receive it is left where it is,
    so no character of the input is silently discarded.

    Args:
        sentences: Sentence strings in document order.

    Returns:
        A new list with the repairs applied. A falsy argument (empty list
        or None) is returned unchanged.
    """
    if not sentences:
        return sentences

    quote_marks = ("'", '"')

    # Pass 1: fold quote-only sentences into their successor.
    merged = []
    idx = 0
    count = len(sentences)
    while idx < count:
        current = sentences[idx]
        stripped = current.strip()
        if stripped in ("'", '"', "'.", '".') and idx + 1 < count:
            # Drop the spurious period so only the quote mark is prepended.
            merged.append(stripped.rstrip('.') + sentences[idx + 1])
            idx += 2
            continue
        merged.append(current)
        idx += 1

    # Pass 2: move a dangling trailing quote (`. '`, `? "`, ...) forward.
    # The quote travels in `carried` rather than by rewriting `merged`
    # while it is being iterated.
    fixed = []
    carried = ""
    last = len(merged) - 1
    for idx, current in enumerate(merged):
        current = carried + current
        carried = ""
        dangling = (
            len(current) >= 3
            and current[-2:] in (" '", ' "')
            and current[-3] in '.?!'
        )
        # Only strip when a following sentence exists; otherwise the quote
        # would vanish from the output altogether.
        if dangling and idx < last:
            if not merged[idx + 1].startswith(quote_marks):
                carried = current[-1]
            current = current[:-2]
        fixed.append(current)

    return fixed
|
|
115
|
+
|
|
68
116
|
@staticmethod
|
|
69
117
|
def _cleanse(sentences: list) -> str:
|
|
70
118
|
sentences = [sent for sent in sentences
|
|
@@ -103,6 +151,9 @@ class SpacyDocSegmenter(BaseObject):
|
|
|
103
151
|
sentences = [sent for sent in sentences if
|
|
104
152
|
sent and len(sent) and sent != 'None']
|
|
105
153
|
|
|
154
|
+
# Merge orphaned opening quotes with following sentence
|
|
155
|
+
sentences = self._merge_orphaned_quotes(sentences)
|
|
156
|
+
|
|
106
157
|
sentences = [self._append_period(sent)
|
|
107
158
|
for sent in sentences]
|
|
108
159
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: fast-sentence-segment
|
|
3
|
-
Version: 1.4.2
|
|
3
|
+
Version: 1.4.4
|
|
4
4
|
Summary: Fast and Efficient Sentence Segmentation
|
|
5
|
-
Home-page: https://github.com/craigtrim/fast-sentence-segment
|
|
6
5
|
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
7
|
Keywords: nlp,text,preprocess,segment
|
|
8
8
|
Author: Craig Trim
|
|
9
9
|
Author-email: craigtrim@gmail.com
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
fast_sentence_segment/__init__.py,sha256=jeb4yCy89ivyqbo-4ldJLquPAG_XR_33Q7nrDjqPxvE,1465
|
|
2
2
|
fast_sentence_segment/bp/__init__.py,sha256=j2-WfQ9WwVuXeGSjvV6XLVwEdvau8sdAQe4Pa4DrYi8,33
|
|
3
3
|
fast_sentence_segment/bp/segmenter.py,sha256=UW6DguPgA56h-pPYRsfJhjIzBe40j6NdjkwYxamASyA,1928
|
|
4
|
-
fast_sentence_segment/cli.py,sha256=
|
|
4
|
+
fast_sentence_segment/cli.py,sha256=vr1Gh-pq4bIPcnhUF6c7ckGdEfoyrI_r0XcrJrIfjEA,5640
|
|
5
5
|
fast_sentence_segment/core/__init__.py,sha256=uoBersYyVStJ5a8zJpQz1GDGaloEdAv2jGHw1292hRM,108
|
|
6
6
|
fast_sentence_segment/core/base_object.py,sha256=AYr7yzusIwawjbKdvcv4yTEnhmx6M583kDZzhzPOmq4,635
|
|
7
7
|
fast_sentence_segment/core/stopwatch.py,sha256=hE6hMz2q6rduaKi58KZmiAL-lRtyh_wWCANhl4KLkRQ,879
|
|
@@ -18,15 +18,15 @@ fast_sentence_segment/dmo/normalize_quotes.py,sha256=mr53qo_tj_I9XzElOKjUQvCtDQh
|
|
|
18
18
|
fast_sentence_segment/dmo/numbered_list_normalizer.py,sha256=q0sOCW8Jkn2vTXlUcVhmDvYES3yvJx1oUVl_8y7eL4E,1672
|
|
19
19
|
fast_sentence_segment/dmo/post_process_sentences.py,sha256=5jxG3TmFjxIExMPLhnCB5JT1lXQvFU9r4qQGoATGrWk,916
|
|
20
20
|
fast_sentence_segment/dmo/question_exclamation_splitter.py,sha256=cRsWRu8zb6wOWG-BjMahHfz4YGutKiV9lW7dE-q3tgc,2006
|
|
21
|
-
fast_sentence_segment/dmo/spacy_doc_segmenter.py,sha256=
|
|
21
|
+
fast_sentence_segment/dmo/spacy_doc_segmenter.py,sha256=Kb65TYMhrbpTYEey5vb7TyhCjUHVxmugHYIeKkntCwk,5147
|
|
22
22
|
fast_sentence_segment/dmo/strip_trailing_period_after_quote.py,sha256=wYkoLy5XJKZIblJXBvDAB8-a81UTQOhOf2u91wjJWUw,2259
|
|
23
23
|
fast_sentence_segment/dmo/title_name_merger.py,sha256=zbG04_VjwM8TtT8LhavvmZqIZL_2xgT2OTxWkK_Zt1s,5133
|
|
24
24
|
fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWvv6rb2mny8pnVd428oVZL0n5I,2457
|
|
25
25
|
fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
|
|
26
26
|
fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
|
|
27
27
|
fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
|
|
28
|
-
fast_sentence_segment-1.4.
|
|
29
|
-
fast_sentence_segment-1.4.
|
|
30
|
-
fast_sentence_segment-1.4.
|
|
31
|
-
fast_sentence_segment-1.4.
|
|
32
|
-
fast_sentence_segment-1.4.
|
|
28
|
+
fast_sentence_segment-1.4.4.dist-info/METADATA,sha256=8EZrIvdE8CWTW64_HqkMH8eF6CzXs6UDdaFjcce7LTA,7947
|
|
29
|
+
fast_sentence_segment-1.4.4.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
|
|
30
|
+
fast_sentence_segment-1.4.4.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
|
|
31
|
+
fast_sentence_segment-1.4.4.dist-info/licenses/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
|
|
32
|
+
fast_sentence_segment-1.4.4.dist-info/RECORD,,
|
{fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.4.dist-info/licenses}/LICENSE
RENAMED
|
File without changes
|