complex-text-tools 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/CHANGELOG.md +15 -0
- {complex_text_tools-0.2.2/complex_text_tools.egg-info → complex_text_tools-0.2.4}/PKG-INFO +13 -1
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/README.md +12 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools/__init__.py +3 -3
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools/text_processor.py +30 -1
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4/complex_text_tools.egg-info}/PKG-INFO +13 -1
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/LICENSE +0 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/MANIFEST.in +0 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools.egg-info/SOURCES.txt +0 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools.egg-info/dependency_links.txt +0 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools.egg-info/requires.txt +0 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools.egg-info/top_level.txt +0 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/pyproject.toml +0 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/setup.cfg +0 -0
- {complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/setup.py +0 -0
|
@@ -5,6 +5,21 @@
|
|
|
5
5
|
格式基于 [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
项目遵循 [语义化版本控制](https://semver.org/spec/v2.0.0.html) 规范。
|
|
7
7
|
|
|
8
|
+
## [0.2.4] - 2026-01-21
|
|
9
|
+
|
|
10
|
+
### 修复
|
|
11
|
+
- 修复 `fix_punctuation` 函数以正确处理标点符号一边是中文的情况
|
|
12
|
+
- 现在只要标点符号的任意一边是中文,就会转换为中文标点
|
|
13
|
+
- 解决了如 "1.3%,且" 这种中英文混合场景下英文标点符号未转换的问题
|
|
14
|
+
|
|
15
|
+
## [0.2.3] - 2026-01-20
|
|
16
|
+
|
|
17
|
+
### 新增
|
|
18
|
+
- 添加 `fix_punctuation` 函数,用于修复中文文本中的标点符号
|
|
19
|
+
- 将英文标点转换为中文标点(如 `,` → `,`、`.` → `。`)
|
|
20
|
+
- 自动处理中文文本中的标点符号
|
|
21
|
+
- 移除重复的标点符号
|
|
22
|
+
|
|
8
23
|
## [0.2.2] - 2025-09-22
|
|
9
24
|
|
|
10
25
|
### 修复
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: complex-text-tools
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A package for processing complex text with mixed Chinese and English characters
|
|
5
5
|
Home-page: https://github.com/mooremok/complex-text-tools
|
|
6
6
|
Author: mooremok
|
|
@@ -38,6 +38,7 @@ Requires-Dist: pytest>=6.0; extra == "dev"
|
|
|
38
38
|
- 移除中英文字符之间的多余空格
|
|
39
39
|
- 正确处理标点符号周围的间距
|
|
40
40
|
- 根据特定规则计算文本长度(中文字符、英文单词、数字、等式等)
|
|
41
|
+
- 修复中文文本中的标点符号(将英文标点转换为中文标点)
|
|
41
42
|
- 高效处理混合语言文本
|
|
42
43
|
|
|
43
44
|
## 安装
|
|
@@ -70,6 +71,17 @@ print(result)
|
|
|
70
71
|
# 输出:15
|
|
71
72
|
```
|
|
72
73
|
|
|
74
|
+
### 修复标点符号
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from complex_text_tools import fix_punctuation
|
|
78
|
+
|
|
79
|
+
text = "这是中文文本,但使用了英文标点.这看起来不太自然,对吗?"
|
|
80
|
+
fixed_text = fix_punctuation(text)
|
|
81
|
+
print(fixed_text)
|
|
82
|
+
# 输出: "这是中文文本,但使用了英文标点。这看起来不太自然,对吗?"
|
|
83
|
+
```
|
|
84
|
+
|
|
73
85
|
## 许可证
|
|
74
86
|
|
|
75
87
|
该项目基于 MIT 许可证 - 详情请见 [LICENSE](LICENSE) 文件。
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
- 移除中英文字符之间的多余空格
|
|
13
13
|
- 正确处理标点符号周围的间距
|
|
14
14
|
- 根据特定规则计算文本长度(中文字符、英文单词、数字、等式等)
|
|
15
|
+
- 修复中文文本中的标点符号(将英文标点转换为中文标点)
|
|
15
16
|
- 高效处理混合语言文本
|
|
16
17
|
|
|
17
18
|
## 安装
|
|
@@ -44,6 +45,17 @@ print(result)
|
|
|
44
45
|
# 输出:15
|
|
45
46
|
```
|
|
46
47
|
|
|
48
|
+
### 修复标点符号
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from complex_text_tools import fix_punctuation
|
|
52
|
+
|
|
53
|
+
text = "这是中文文本,但使用了英文标点.这看起来不太自然,对吗?"
|
|
54
|
+
fixed_text = fix_punctuation(text)
|
|
55
|
+
print(fixed_text)
|
|
56
|
+
# 输出: "这是中文文本,但使用了英文标点。这看起来不太自然,对吗?"
|
|
57
|
+
```
|
|
58
|
+
|
|
47
59
|
## 许可证
|
|
48
60
|
|
|
49
61
|
该项目基于 MIT 许可证 - 详情请见 [LICENSE](LICENSE) 文件。
|
|
@@ -6,7 +6,7 @@ This package provides utilities for:
|
|
|
6
6
|
2. Counting effective text length according to specific rules
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
from .text_processor import remove_extra_spaces, count_eff_len
|
|
9
|
+
from .text_processor import remove_extra_spaces, count_eff_len, fix_punctuation
|
|
10
10
|
|
|
11
|
-
__all__ = ['remove_extra_spaces', 'count_eff_len']
|
|
12
|
-
__version__ = '0.2.
|
|
11
|
+
__all__ = ['remove_extra_spaces', 'count_eff_len', 'fix_punctuation']
|
|
12
|
+
__version__ = '0.2.4'
|
|
@@ -101,4 +101,33 @@ def count_eff_len(text: str) -> int:
|
|
|
101
101
|
punctuation = re.findall(r"[^\w\s]", remaining_text)
|
|
102
102
|
count += len(punctuation)
|
|
103
103
|
|
|
104
|
-
return count
|
|
104
|
+
return count
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def fix_punctuation(text: str) -> str:
    """Convert ASCII punctuation that touches Chinese text into Chinese punctuation.

    One of the ASCII marks ``, . ; : ? ! ( )`` is replaced by its full-width
    counterpart when the character immediately before OR immediately after it
    is a CJK ideograph (U+4E00-U+9FFF), a CJK symbol/punctuation character
    (U+3000-U+303F) or a half-/full-width form (U+FF00-U+FFEF). Marks with no
    such neighbour (e.g. the dot in ``1.3`` or punctuation inside pure English
    text) are left untouched. Afterwards, runs of repeated full-width commas
    or ideographic full stops are collapsed to a single mark.

    Args:
        text: The text to normalise.

    Returns:
        The text with the qualifying punctuation converted.
    """
    # Keys are the ASCII marks; values are spelled with \N{...} escapes so the
    # full-width targets cannot be confused with (or silently replaced by)
    # their ASCII look-alikes.
    to_chinese = {
        ",": "\N{FULLWIDTH COMMA}",
        ".": "\N{IDEOGRAPHIC FULL STOP}",
        ";": "\N{FULLWIDTH SEMICOLON}",
        ":": "\N{FULLWIDTH COLON}",
        "?": "\N{FULLWIDTH QUESTION MARK}",
        "!": "\N{FULLWIDTH EXCLAMATION MARK}",
        "(": "\N{FULLWIDTH LEFT PARENTHESIS}",
        ")": "\N{FULLWIDTH RIGHT PARENTHESIS}",
    }

    # Characters that count as "Chinese context" on either side of a mark.
    cjk = r"[\u4e00-\u9fff\u3000-\u303f\uff00-\uffef]"
    # Derive the character class from the map so the two can never drift apart.
    marks = "[" + re.escape("".join(to_chinese)) + "]"
    # Either preceded by Chinese context, or followed by it. Lookarounds are
    # evaluated against the input text, so a conversion never cascades onto
    # its neighbour.
    pattern = f"(?<={cjk}){marks}|{marks}(?={cjk})"
    text = re.sub(pattern, lambda match: to_chinese[match.group()], text)

    # Collapse accidental repeats of the two most common marks.
    for mark in (to_chinese[","], to_chinese["."]):
        text = re.sub(f"{mark}+", mark, text)

    return text
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
if __name__ == "__main__":
    # Manual smoke check: print the converted form of a mixed
    # Chinese/English sample sentence.
    print(fix_punctuation("成人CHF患者患病率已达1.3%,且患病后5年生存率仅约50%,严重威胁了患者生命健康[2]。"))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: complex-text-tools
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A package for processing complex text with mixed Chinese and English characters
|
|
5
5
|
Home-page: https://github.com/mooremok/complex-text-tools
|
|
6
6
|
Author: mooremok
|
|
@@ -38,6 +38,7 @@ Requires-Dist: pytest>=6.0; extra == "dev"
|
|
|
38
38
|
- 移除中英文字符之间的多余空格
|
|
39
39
|
- 正确处理标点符号周围的间距
|
|
40
40
|
- 根据特定规则计算文本长度(中文字符、英文单词、数字、等式等)
|
|
41
|
+
- 修复中文文本中的标点符号(将英文标点转换为中文标点)
|
|
41
42
|
- 高效处理混合语言文本
|
|
42
43
|
|
|
43
44
|
## 安装
|
|
@@ -70,6 +71,17 @@ print(result)
|
|
|
70
71
|
# 输出:15
|
|
71
72
|
```
|
|
72
73
|
|
|
74
|
+
### 修复标点符号
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from complex_text_tools import fix_punctuation
|
|
78
|
+
|
|
79
|
+
text = "这是中文文本,但使用了英文标点.这看起来不太自然,对吗?"
|
|
80
|
+
fixed_text = fix_punctuation(text)
|
|
81
|
+
print(fixed_text)
|
|
82
|
+
# 输出: "这是中文文本,但使用了英文标点。这看起来不太自然,对吗?"
|
|
83
|
+
```
|
|
84
|
+
|
|
73
85
|
## 许可证
|
|
74
86
|
|
|
75
87
|
该项目基于 MIT 许可证 - 详情请见 [LICENSE](LICENSE) 文件。
|
|
File without changes
|
|
File without changes
|
{complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools.egg-info/requires.txt
RENAMED
|
File without changes
|
{complex_text_tools-0.2.2 → complex_text_tools-0.2.4}/complex_text_tools.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|