complex-text-tools 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- complex_text_tools-0.1.0/LICENSE +21 -0
- complex_text_tools-0.1.0/MANIFEST.in +2 -0
- complex_text_tools-0.1.0/PKG-INFO +54 -0
- complex_text_tools-0.1.0/README.md +31 -0
- complex_text_tools-0.1.0/complex_text_tools/__init__.py +3 -0
- complex_text_tools-0.1.0/complex_text_tools/test_processor.py +22 -0
- complex_text_tools-0.1.0/complex_text_tools/text_processor.py +49 -0
- complex_text_tools-0.1.0/complex_text_tools.egg-info/PKG-INFO +54 -0
- complex_text_tools-0.1.0/complex_text_tools.egg-info/SOURCES.txt +12 -0
- complex_text_tools-0.1.0/complex_text_tools.egg-info/dependency_links.txt +1 -0
- complex_text_tools-0.1.0/complex_text_tools.egg-info/top_level.txt +1 -0
- complex_text_tools-0.1.0/pyproject.toml +30 -0
- complex_text_tools-0.1.0/setup.cfg +4 -0
- complex_text_tools-0.1.0/setup.py +32 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Your Name
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: complex-text-tools
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A package for processing complex text with mixed Chinese and English characters
|
|
5
|
+
Home-page: https://github.com/yourusername/complex-text-tools
|
|
6
|
+
Author: Your Name
|
|
7
|
+
Author-email: Your Name <your.email@example.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/yourusername/complex-text-tools
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Requires-Python: >=3.6
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
|
|
24
|
+
# Complex Text Tools
|
|
25
|
+
|
|
26
|
+
A Python package for processing complex text containing mixed Chinese and English characters, removing extra spaces and standardizing punctuation.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- Remove extra spaces between Chinese characters
|
|
31
|
+
- Remove extra spaces between Chinese and English characters
|
|
32
|
+
- Handle spacing around punctuation marks correctly
|
|
33
|
+
- Process mixed language texts efficiently
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install complex-text-tools
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from complex_text_tools import remove_extra_spaces
|
|
45
|
+
|
|
46
|
+
text = "这 是 中文 测试 文本 , mixed English text here , 还 有 symbols : ; ! "
|
|
47
|
+
clean_text = remove_extra_spaces(text)
|
|
48
|
+
print(clean_text)
|
|
49
|
+
# Output: "这是中文测试文本,mixed English text here,还有 symbols:;!"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## License
|
|
53
|
+
|
|
54
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Complex Text Tools
|
|
2
|
+
|
|
3
|
+
A Python package for processing complex text containing mixed Chinese and English characters, removing extra spaces and standardizing punctuation.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Remove extra spaces between Chinese characters
|
|
8
|
+
- Remove extra spaces between Chinese and English characters
|
|
9
|
+
- Handle spacing around punctuation marks correctly
|
|
10
|
+
- Process mixed language texts efficiently
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install complex-text-tools
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from complex_text_tools import remove_extra_spaces
|
|
22
|
+
|
|
23
|
+
text = "这 是 中文 测试 文本 , mixed English text here , 还 有 symbols : ; ! "
|
|
24
|
+
clean_text = remove_extra_spaces(text)
|
|
25
|
+
print(clean_text)
|
|
26
|
+
# Output: "这是中文测试文本,mixed English text here,还有 symbols:;!"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## License
|
|
30
|
+
|
|
31
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
from .text_processor import remove_extra_spaces
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_remove_extra_spaces():
    """Exercise remove_extra_spaces on one mixed Chinese/English sample.

    Prints the raw input, the processed output, the expected output and
    whether the two agree, then returns that agreement as a bool.
    """
    # Sample covering CJK/CJK, CJK/English and punctuation spacing.
    sample = "这 是 中文 测试 文本 , mixed English text here , 还 有 symbols : ; ! "
    wanted = "这是中文测试文本,mixed English text here,还有 symbols:;!"
    actual = remove_extra_spaces(sample)
    passed = actual == wanted

    report = (
        ("原始文本:", sample),
        ("处理结果:", actual),
        ("预期结果:", wanted),
        ("测试通过:", passed),
    )
    for label, value in report:
        print(label, value)

    return passed


# Allow running this module directly as a quick manual check.
if __name__ == "__main__":
    test_remove_extra_spaces()
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def remove_extra_spaces(text: str) -> str:
    """Remove superfluous whitespace from mixed Chinese/English text.

    "CJK character" below means any code point in U+4E00-U+9FFF,
    U+3000-U+303F or U+FF00-U+FFEF (ideographs plus CJK/full-width
    punctuation). "Word character" and "punctuation" follow the regex
    classes ``\\w`` and ``[^\\w\\s]``.

    The rules are applied in this order:

    1. Whitespace between a CJK character and an ASCII letter is removed.
    2. Whitespace between two CJK characters is removed.
    3. Whitespace between a CJK character and one of the ASCII marks
       ``[ ] ( ) { } " ' : ; ? ! , . ` ~`` is removed.
    4. Whitespace between a word character and following punctuation is
       removed.
    5. Whitespace between punctuation and a following word character is
       collapsed to a single space.
    6. Runs of two or more whitespace characters between word characters
       are collapsed to a single space.
    7. Whitespace between two punctuation characters is removed.

    Leading and trailing whitespace is left untouched because every rule
    requires a character on both sides of the whitespace run.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    cjk = r"[\u4e00-\u9fff\u3000-\u303f\uff00-\uffef]"
    ascii_marks = r"[\[\]\(\)\{\}\"\'\:\;\?\!\,\.\`\~]"
    latin = r"[a-zA-Z]"
    word = r"\w"
    punct = r"[^\w\s]"

    def between(left: str, right: str, gap: str = r"\s+") -> str:
        # Zero-width lookarounds keep the neighbouring characters out of the
        # match. re.sub only replaces non-overlapping matches, so consuming
        # the neighbours would skip every other gap in chains such as
        # "a  b  c" or ": ; !".
        return "(?<=" + left + ")" + gap + "(?=" + right + ")"

    rules = (
        # CJK <-> ASCII letters: no whitespace.
        (between(cjk, latin) + "|" + between(latin, cjk), ""),
        # CJK <-> CJK: no whitespace.
        (between(cjk, cjk), ""),
        # CJK <-> common ASCII punctuation: no whitespace.
        (between(cjk, ascii_marks) + "|" + between(ascii_marks, cjk), ""),
        # Word followed by punctuation: punctuation hugs the word.
        (between(word, punct), ""),
        # Punctuation followed by a word: exactly one space.
        (between(punct, word), " "),
        # Word followed by a word: collapse runs of 2+ to one space.
        (between(word, word, r"\s{2,}"), " "),
        # Punctuation followed by punctuation: no whitespace.
        (between(punct, punct), ""),
    )

    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: complex-text-tools
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A package for processing complex text with mixed Chinese and English characters
|
|
5
|
+
Home-page: https://github.com/yourusername/complex-text-tools
|
|
6
|
+
Author: Your Name
|
|
7
|
+
Author-email: Your Name <your.email@example.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/yourusername/complex-text-tools
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Requires-Python: >=3.6
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
|
|
24
|
+
# Complex Text Tools
|
|
25
|
+
|
|
26
|
+
A Python package for processing complex text containing mixed Chinese and English characters, removing extra spaces and standardizing punctuation.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- Remove extra spaces between Chinese characters
|
|
31
|
+
- Remove extra spaces between Chinese and English characters
|
|
32
|
+
- Handle spacing around punctuation marks correctly
|
|
33
|
+
- Process mixed language texts efficiently
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install complex-text-tools
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from complex_text_tools import remove_extra_spaces
|
|
45
|
+
|
|
46
|
+
text = "这 是 中文 测试 文本 , mixed English text here , 还 有 symbols : ; ! "
|
|
47
|
+
clean_text = remove_extra_spaces(text)
|
|
48
|
+
print(clean_text)
|
|
49
|
+
# Output: "这是中文测试文本,mixed English text here,还有 symbols:;!"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## License
|
|
53
|
+
|
|
54
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
setup.py
|
|
6
|
+
complex_text_tools/__init__.py
|
|
7
|
+
complex_text_tools/test_processor.py
|
|
8
|
+
complex_text_tools/text_processor.py
|
|
9
|
+
complex_text_tools.egg-info/PKG-INFO
|
|
10
|
+
complex_text_tools.egg-info/SOURCES.txt
|
|
11
|
+
complex_text_tools.egg-info/dependency_links.txt
|
|
12
|
+
complex_text_tools.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
complex_text_tools
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=45", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "complex-text-tools"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A package for processing complex text with mixed Chinese and English characters"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{name = "Your Name", email = "your.email@example.com"}
|
|
12
|
+
]
|
|
13
|
+
license = {text = "MIT"}
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.6",
|
|
21
|
+
"Programming Language :: Python :: 3.7",
|
|
22
|
+
"Programming Language :: Python :: 3.8",
|
|
23
|
+
"Programming Language :: Python :: 3.9",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
]
|
|
26
|
+
requires-python = ">=3.6"
|
|
27
|
+
dependencies = []
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://github.com/yourusername/complex-text-tools"
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Packaging script for the complex-text-tools distribution."""
from pathlib import Path

from setuptools import setup, find_packages

# Read README.md from the directory containing this script rather than the
# current working directory, so the build works wherever it is invoked from.
long_description = (Path(__file__).resolve().parent / "README.md").read_text(
    encoding="utf-8"
)

setup(
    name="complex-text-tools",
    version="0.1.0",
    author="Your Name",
    author_email="your.email@example.com",
    description="A package for processing complex text with mixed Chinese and English characters",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/yourusername/complex-text-tools",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.6",
    install_requires=[
        # No additional runtime dependencies.
    ],
)
|