fast-sentence-segment 1.4.3__py3-none-any.whl → 1.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -65,6 +65,54 @@ class SpacyDocSegmenter(BaseObject):
65
65
  return False
66
66
  return True
67
67
 
68
@staticmethod
def _merge_orphaned_quotes(sentences: list) -> list:
    """Re-attach quote characters that were split away from their sentence.

    Two passes are made over the input:

    1. A sentence that consists solely of a quote character (optionally
       followed by a period, i.e. one of ``'``, ``"``, ``'.``, ``".``) is
       merged with the sentence that follows it::

           ["'", "Oh, the funeral..."]  ->  ["'Oh, the funeral..."]

       A quote-only sentence in the final position has nothing to merge
       with and is kept as-is.

    2. A sentence ending in ``. '`` / ``? '`` / ``! '`` (or the
       double-quote equivalents) has the space and quote removed, and the
       quote is moved to the front of the next sentence::

           ["He said. '", "Hello!'"]  ->  ["He said.", "'Hello!'"]

       If there is no next sentence, or the next sentence already starts
       with a quote character, the trailing quote is discarded.

    Args:
        sentences: sentence strings in document order.

    Returns:
        A new list of sentences; the input list is not modified. An empty
        (or otherwise falsy) input is returned unchanged.
    """
    if not sentences:
        return sentences

    orphan_forms = ("'", '"', "'.", '".')
    trailing_forms = (" '", ' "')
    quote_chars = ('"', "'")

    # First pass: glue quote-only sentences onto the sentence after them.
    merged = []
    idx = 0
    total = len(sentences)
    while idx < total:
        current = sentences[idx]
        stripped = current.strip()
        if stripped in orphan_forms and idx + 1 < total:
            # Drop the period that may follow the quote before merging.
            merged.append(stripped.rstrip('.') + sentences[idx + 1])
            idx += 2
        else:
            merged.append(current)
            idx += 1

    # Second pass: move a trailing quote to the front of the next sentence.
    # The quote is carried in a local variable instead of being written
    # into `merged` while that list is being iterated.
    fixed = []
    carried_quote = ""
    for idx, sent in enumerate(merged):
        sent = carried_quote + sent
        carried_quote = ""

        ends_with_orphan = (
            len(sent) >= 3
            and sent[-2:] in trailing_forms
            and sent[-3] in '.?!'
        )
        if ends_with_orphan:
            quote = sent[-1]
            sent = sent[:-2]
            has_next = idx + 1 < len(merged)
            # Only hand the quote over when the next sentence does not
            # already open with one; otherwise it is dropped.
            if has_next and not merged[idx + 1].startswith(quote_chars):
                carried_quote = quote

        fixed.append(sent)

    return fixed
115
+
68
116
  @staticmethod
69
117
  def _cleanse(sentences: list) -> str:
70
118
  sentences = [sent for sent in sentences
@@ -103,6 +151,9 @@ class SpacyDocSegmenter(BaseObject):
103
151
  sentences = [sent for sent in sentences if
104
152
  sent and len(sent) and sent != 'None']
105
153
 
154
+ # Merge orphaned opening quotes with following sentence
155
+ sentences = self._merge_orphaned_quotes(sentences)
156
+
106
157
  sentences = [self._append_period(sent)
107
158
  for sent in sentences]
108
159
 
@@ -1,9 +1,9 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: fast-sentence-segment
3
- Version: 1.4.3
3
+ Version: 1.4.4
4
4
  Summary: Fast and Efficient Sentence Segmentation
5
- Home-page: https://github.com/craigtrim/fast-sentence-segment
6
5
  License: MIT
6
+ License-File: LICENSE
7
7
  Keywords: nlp,text,preprocess,segment
8
8
  Author: Craig Trim
9
9
  Author-email: craigtrim@gmail.com
@@ -33,6 +33,7 @@ Description-Content-Type: text/markdown
33
33
 
34
34
  [![PyPI version](https://img.shields.io/pypi/v/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
35
35
  [![Python versions](https://img.shields.io/pypi/pyversions/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
36
+ [![CI](https://img.shields.io/github/actions/workflow/status/craigtrim/fast-sentence-segment/ci.yml?branch=master&label=CI)](https://github.com/craigtrim/fast-sentence-segment/actions/workflows/ci.yml)
36
37
  [![Tests](https://img.shields.io/badge/tests-664-brightgreen)](https://github.com/craigtrim/fast-sentence-segment/tree/master/tests)
37
38
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
38
39
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
@@ -18,15 +18,15 @@ fast_sentence_segment/dmo/normalize_quotes.py,sha256=mr53qo_tj_I9XzElOKjUQvCtDQh
18
18
  fast_sentence_segment/dmo/numbered_list_normalizer.py,sha256=q0sOCW8Jkn2vTXlUcVhmDvYES3yvJx1oUVl_8y7eL4E,1672
19
19
  fast_sentence_segment/dmo/post_process_sentences.py,sha256=5jxG3TmFjxIExMPLhnCB5JT1lXQvFU9r4qQGoATGrWk,916
20
20
  fast_sentence_segment/dmo/question_exclamation_splitter.py,sha256=cRsWRu8zb6wOWG-BjMahHfz4YGutKiV9lW7dE-q3tgc,2006
21
- fast_sentence_segment/dmo/spacy_doc_segmenter.py,sha256=_oTsrIL2rjysjt_8bPJVNTn230pUtL-geCC8g174iC4,3163
21
+ fast_sentence_segment/dmo/spacy_doc_segmenter.py,sha256=Kb65TYMhrbpTYEey5vb7TyhCjUHVxmugHYIeKkntCwk,5147
22
22
  fast_sentence_segment/dmo/strip_trailing_period_after_quote.py,sha256=wYkoLy5XJKZIblJXBvDAB8-a81UTQOhOf2u91wjJWUw,2259
23
23
  fast_sentence_segment/dmo/title_name_merger.py,sha256=zbG04_VjwM8TtT8LhavvmZqIZL_2xgT2OTxWkK_Zt1s,5133
24
24
  fast_sentence_segment/dmo/unwrap_hard_wrapped_text.py,sha256=V1T5RsJBaII_iGJMyWvv6rb2mny8pnVd428oVZL0n5I,2457
25
25
  fast_sentence_segment/svc/__init__.py,sha256=9B12mXxBnlalH4OAm1AMLwUMa-RLi2ilv7qhqv26q7g,144
26
26
  fast_sentence_segment/svc/perform_paragraph_segmentation.py,sha256=zLKw9rSzb0NNfx4MyEeoGrHwhxTtH5oDrYcAL2LMVHY,1378
27
27
  fast_sentence_segment/svc/perform_sentence_segmentation.py,sha256=mAJEPWqNQFbnlj7Rb7yiXIRHCAdlgsN0jAbg7e2qpMU,7421
28
- fast_sentence_segment-1.4.3.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
29
- fast_sentence_segment-1.4.3.dist-info/METADATA,sha256=5LGK9z9ip2AtOr2FgaIgkrR2mLvIQaeeuh8gVi3GBaA,7785
30
- fast_sentence_segment-1.4.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
- fast_sentence_segment-1.4.3.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
32
- fast_sentence_segment-1.4.3.dist-info/RECORD,,
28
+ fast_sentence_segment-1.4.4.dist-info/METADATA,sha256=8EZrIvdE8CWTW64_HqkMH8eF6CzXs6UDdaFjcce7LTA,7947
29
+ fast_sentence_segment-1.4.4.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
30
+ fast_sentence_segment-1.4.4.dist-info/entry_points.txt,sha256=Zc8OwFKj3ofnjy5ZIFqHzDkIEWweV1AP1xap1ZFGD8M,107
31
+ fast_sentence_segment-1.4.4.dist-info/licenses/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
32
+ fast_sentence_segment-1.4.4.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: poetry-core 2.3.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any