fast-sentence-segment 1.4.2__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_sentence_segment/cli.py +31 -15
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.3.dist-info}/METADATA +1 -2
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.3.dist-info}/RECORD +6 -6
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.3.dist-info}/LICENSE +0 -0
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.3.dist-info}/WHEEL +0 -0
- {fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.3.dist-info}/entry_points.txt +0 -0
fast_sentence_segment/cli.py
CHANGED
|
@@ -143,34 +143,50 @@ def file_main():
|
|
|
143
143
|
)
|
|
144
144
|
args = parser.parse_args()
|
|
145
145
|
|
|
146
|
+
# Echo command immediately
|
|
146
147
|
_header("segment-file")
|
|
148
|
+
print(f" {DIM}Segmenting text file into sentences{RESET}")
|
|
149
|
+
print()
|
|
150
|
+
|
|
151
|
+
# Show configuration
|
|
147
152
|
_param("Input", args.input_file)
|
|
148
153
|
_param("Output", args.output_file)
|
|
149
154
|
_param("Size", _file_size(args.input_file))
|
|
150
155
|
_param("Unwrap", "enabled" if args.unwrap else "disabled")
|
|
151
156
|
_param("Normalize quotes", "disabled" if args.no_normalize_quotes else "enabled")
|
|
157
|
+
print()
|
|
152
158
|
|
|
159
|
+
# Step 1: Read file
|
|
160
|
+
print(f" {YELLOW}→{RESET} Reading input file...")
|
|
153
161
|
with open(args.input_file, "r", encoding="utf-8") as f:
|
|
154
162
|
text = f.read()
|
|
163
|
+
print(f" {GREEN}✓{RESET} Read {len(text):,} characters")
|
|
155
164
|
|
|
165
|
+
# Step 2: Segment text
|
|
166
|
+
print(f" {YELLOW}→{RESET} Segmenting text...", end="", flush=True)
|
|
156
167
|
start = time.perf_counter()
|
|
157
168
|
normalize = not args.no_normalize_quotes
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
)
|
|
169
|
+
sentences = segment_text(
|
|
170
|
+
text.strip(), flatten=True, unwrap=args.unwrap, normalize=normalize,
|
|
171
|
+
)
|
|
162
172
|
elapsed = time.perf_counter() - start
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
173
|
+
print(f"\r {GREEN}✓{RESET} Segmented into {len(sentences):,} sentences ({elapsed:.2f}s)")
|
|
174
|
+
|
|
175
|
+
# Step 3: Write output
|
|
176
|
+
total = len(sentences)
|
|
177
|
+
with open(args.output_file, "w", encoding="utf-8") as f:
|
|
178
|
+
if args.unwrap:
|
|
179
|
+
f.write(format_grouped_sentences(sentences) + "\n")
|
|
180
|
+
print(f" {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file}")
|
|
181
|
+
else:
|
|
182
|
+
for i, sentence in enumerate(sentences, 1):
|
|
183
|
+
f.write(sentence + "\n")
|
|
184
|
+
if i % 500 == 0 or i == total:
|
|
185
|
+
pct = (i / total) * 100
|
|
186
|
+
print(f"\r {YELLOW}→{RESET} Writing... {pct:.0f}% ({i:,}/{total:,})", end="", flush=True)
|
|
187
|
+
print(f"\r {GREEN}✓{RESET} Written {total:,} sentences to {args.output_file} ")
|
|
188
|
+
|
|
189
|
+
print(f"\n {GREEN}Done!{RESET}")
|
|
174
190
|
print()
|
|
175
191
|
|
|
176
192
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: fast-sentence-segment
|
|
3
|
-
Version: 1.4.2
|
|
3
|
+
Version: 1.4.3
|
|
4
4
|
Summary: Fast and Efficient Sentence Segmentation
|
|
5
5
|
Home-page: https://github.com/craigtrim/fast-sentence-segment
|
|
6
6
|
License: MIT
|
|
@@ -33,7 +33,6 @@ Description-Content-Type: text/markdown
|
|
|
33
33
|
|
|
34
34
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
35
35
|
[](https://pypi.org/project/fast-sentence-segment/)
|
|
36
|
-
[](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
|
|
37
36
|
[](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
|
|
38
37
|
[](https://opensource.org/licenses/MIT)
|
|
39
38
|
[](https://github.com/astral-sh/ruff)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
fast_sentence_segment/__init__.py,sha256=jeb4yCy89ivyqbo-4ldJLquPAG_XR_33Q7nrDjqPxvE,1465
|
|
2
2
|
fast_sentence_segment/bp/__init__.py,sha256=j2-WfQ9WwVuXeGSjvV6XLVwEdvau8sdAQe4Pa4DrYi8,33
|
|
3
3
|
fast_sentence_segment/bp/segmenter.py,sha256=UW6DguPgA56h-pPYRsfJhjIzBe40j6NdjkwYxamASyA,1928
|
|
4
|
-
fast_sentence_segment/cli.py,sha256=
|
|
4
|
+
fast_sentence_segment/cli.py,sha256=vr1Gh-pq4bIPcnhUF6c7ckGdEfoyrI_r0XcrJrIfjEA,5640
|
|
5
5
|
fast_sentence_segment/core/__init__.py,sha256=uoBersYyVStJ5a8zJpQz1GDGaloEdAv2jGHw1292hRM,108
|
|
6
6
|
fast_sentence_segment/core/base_object.py,sha256=AYr7yzusIwawjbKdvcv4yTEnhmx6M583kDZzhzPOmq4,635
|
|
7
7
|
fast_sentence_segment/core/stopwatch.py,sha256=hE6hMz2q6rduaKi58KZmiAL-lRtyh_wWCANhl4KLkRQ,879
|
|
@@ -25,8 +25,8 @@ fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWv
|
|
|
25
25
|
fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
|
|
26
26
|
fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
|
|
27
27
|
fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
|
|
28
|
-
fast_sentence_segment-1.4.2.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
|
|
29
|
-
fast_sentence_segment-1.4.
|
|
30
|
-
fast_sentence_segment-1.4.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
31
|
-
fast_sentence_segment-1.4.2.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
|
|
32
|
-
fast_sentence_segment-1.4.2.dist-info/RECORD,,
|
|
28
|
+
fast_sentence_segment-1.4.3.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
|
|
29
|
+
fast_sentence_segment-1.4.3.dist-info/METADATA,sha256=5LGK9z9ip2AtOr2FgaIgkrR2mLvIQaeeuh8gVi3GBaA,7785
|
|
30
|
+
fast_sentence_segment-1.4.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
31
|
+
fast_sentence_segment-1.4.3.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
|
|
32
|
+
fast_sentence_segment-1.4.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{fast_sentence_segment-1.4.2.dist-info → fast_sentence_segment-1.4.3.dist-info}/entry_points.txt
RENAMED
|
File without changes
|