sd-spider-utils 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sd_spider_utils-1.0.0/LICENSE +21 -0
- sd_spider_utils-1.0.0/PKG-INFO +23 -0
- sd_spider_utils-1.0.0/README.rst +0 -0
- sd_spider_utils-1.0.0/pyproject.toml +29 -0
- sd_spider_utils-1.0.0/sd_spider_utils/__init__.py +0 -0
- sd_spider_utils-1.0.0/sd_spider_utils/common_spider.py +4 -0
- sd_spider_utils-1.0.0/sd_spider_utils/common_utils.py +22 -0
- sd_spider_utils-1.0.0/sd_spider_utils/data_utils.py +30 -0
- sd_spider_utils-1.0.0/sd_spider_utils/datetime_utils.py +72 -0
- sd_spider_utils-1.0.0/sd_spider_utils/parse_utils.py +36 -0
- sd_spider_utils-1.0.0/sd_spider_utils/spider_demos.py +12 -0
- sd_spider_utils-1.0.0/sd_spider_utils/text_utils.py +96 -0
- sd_spider_utils-1.0.0/sd_spider_utils/url_utils.py +0 -0
- sd_spider_utils-1.0.0/sd_spider_utils.egg-info/PKG-INFO +23 -0
- sd_spider_utils-1.0.0/sd_spider_utils.egg-info/SOURCES.txt +17 -0
- sd_spider_utils-1.0.0/sd_spider_utils.egg-info/dependency_links.txt +1 -0
- sd_spider_utils-1.0.0/sd_spider_utils.egg-info/requires.txt +2 -0
- sd_spider_utils-1.0.0/sd_spider_utils.egg-info/top_level.txt +1 -0
- sd_spider_utils-1.0.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 星梦
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sd_spider_utils
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: 爬虫工具包,写爬虫更快!!!
|
|
5
|
+
Author-email: 星梦 <cpython666@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://space.bilibili.com/1909782963
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Natural Language :: Chinese (Simplified)
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.5
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Requires-Python: >=3.5
|
|
19
|
+
Description-Content-Type: text/x-rst
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: requests
|
|
22
|
+
Requires-Dist: beautifulsoup4
|
|
23
|
+
Dynamic: license-file
|
|
File without changes
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "sd_spider_utils"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "爬虫工具包,写爬虫更快!!!"
|
|
5
|
+
readme = "README.rst"
|
|
6
|
+
license = {text = "MIT"}
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "星梦", email = "cpython666@gmail.com" },
|
|
9
|
+
]
|
|
10
|
+
urls = { Homepage = "https://space.bilibili.com/1909782963" }
|
|
11
|
+
requires-python = ">=3.5"
|
|
12
|
+
dependencies = ["requests", "beautifulsoup4"]
|
|
13
|
+
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
"Natural Language :: Chinese (Simplified)",
|
|
18
|
+
"Programming Language :: Python",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.5",
|
|
21
|
+
"Programming Language :: Python :: 3.6",
|
|
22
|
+
"Programming Language :: Python :: 3.7",
|
|
23
|
+
"Programming Language :: Python :: 3.8",
|
|
24
|
+
"Topic :: Software Development :: Libraries",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["setuptools>=61.0"]
|
|
29
|
+
build-backend = "setuptools.build_meta"
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Accepted spellings (matched case-insensitively) and the boolean each maps to.
# Built once at import time instead of on every call.
_BOOL_STRINGS = {
    "y": True,
    "yes": True,
    "t": True,
    "true": True,
    "on": True,
    "1": True,
    "n": False,
    "no": False,
    "f": False,
    "false": False,
    "off": False,
    "0": False,
}


def strtobool(value):
    """Convert a common truthy/falsy spelling to a ``bool``.

    The value is passed through ``str()``, stripped of surrounding whitespace
    and lower-cased before the lookup, so ``"Yes"``, ``" true\\n"``, ``1`` and
    ``False`` are all accepted.

    :param value: the value to interpret (usually a string).
    :return: ``True`` for y/yes/t/true/on/1, ``False`` for n/no/f/false/off/0.
    :raises ValueError: if the value is not one of the recognised spellings.
    """
    key = str(value).strip().lower()
    try:
        return _BOOL_STRINGS[key]
    except KeyError:
        # The KeyError is an implementation detail; hide it from the traceback.
        raise ValueError('"{}" is not a valid bool value'.format(value)) from None
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
def json2excel(filepath, output_filepath=None):
    """Convert a JSON file holding a list of records into an Excel workbook.

    The input file is expected to contain a JSON array of objects, e.g.::

        [
            {"name": "小明", "age": 18},
            ...
        ]

    :param filepath: path of the UTF-8 encoded JSON file to read.
    :param output_filepath: where to write the workbook. When omitted, the
        workbook is written next to the input file, using the input file's
        base name with an ``.xlsx`` extension.
    :return: the path the workbook was written to.
    """
    import json
    import os

    import pandas as pd

    if output_filepath is None:
        # Default destination: same folder and base name as the input file.
        folder_path = os.path.dirname(filepath)
        file_name = os.path.splitext(os.path.basename(filepath))[0]
        output_filepath = os.path.join(folder_path, f"{file_name}.xlsx")

    with open(filepath, "r", encoding="utf-8") as file:
        obj_list = json.load(file)

    # One row per JSON object; the DataFrame index is not written out.
    pd.DataFrame(obj_list).to_excel(output_filepath, index=False)
    print(f"Excel 文件已保存:{output_filepath}")
    return output_filepath


if __name__ == "__main__":
    json2excel(r"demo.json")
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def clean_text(text: str) -> str:
    """Normalize whitespace and tidy stray commas in *text*.

    * Every run of whitespace (including the no-break space ``\\xa0``) becomes
      a single space, and leading/trailing whitespace is dropped.
    * Commas separated only by a space (``", ,"``) collapse into one comma.
    * A space directly before a comma is removed.

    :param text: the raw text to clean.
    :return: the cleaned text.
    """
    # str.split() with no argument splits on any Unicode whitespace run
    # ("\xa0" included) and ignores leading/trailing whitespace.
    text = " ".join(text.split())
    # Collapse ", ," / ", , ," into a single comma. This has to happen before
    # the " ," fix below, which would otherwise turn ", ," into ",,".
    text = re.sub(r",(?: ,)+", ",", text)
    return text.replace(" ,", ",")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def contains_chinese(text: str) -> bool:
    """Return ``True`` if *text* contains at least one Han (Chinese) character.

    Checks the CJK Unified Ideographs block (U+4E00–U+9FFF) and CJK Unified
    Ideographs Extension A (U+3400–U+4DBF). Punctuation, kana and Latin text
    do not count.

    :param text: the text to inspect.
    :return: whether any Han character was found.
    """
    return re.search(r"[\u3400-\u4dbf\u4e00-\u9fff]", text) is not None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def contains_date(text: str) -> bool:
    """Tell whether *text* contains something shaped like a calendar date.

    A match is four digits, a separator (``-``, ``/`` or ``年``), one or two
    digits, a separator (``-``, ``/`` or ``月``), one or two digits and an
    optional ``日`` — for example ``2022年03月30日``. Only the shape is checked,
    not whether the date actually exists.

    :param text: the text to inspect.
    :return: ``True`` when such a pattern occurs anywhere in *text*.
    """
    date_shape = r"\d{4}[-/年]\d{1,2}[-/月]\d{1,2}日?"
    found = re.search(date_shape, text)
    return found is not None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_dates(text: str) -> List[datetime]:
    """Extract every date found in *text* as ``datetime`` objects.

    Supported shapes: ``2022年03月30日``, ``2022-03-30`` and ``2022/03/30``
    (month and day may be one or two digits, the trailing ``日`` is optional).

    :param text: the text to scan.
    :return: the dates in order of appearance, each at midnight. Matches that
        are not real calendar dates (e.g. month 13) are skipped.
    """
    # Capture year/month/day directly so that every accepted separator
    # ("-", "/", "年", "月") is handled the same way.
    pattern: str = r"(\d{4})[-/年](\d{1,2})[-/月](\d{1,2})日?"

    dates: List[datetime] = []
    for year, month, day in re.findall(pattern, text):
        try:
            dates.append(datetime(int(year), int(month), int(day)))
        except ValueError:
            # Date-shaped text that is not a valid calendar day: skip it.
            continue

    return dates
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
if __name__ == "__main__":
    # Date extraction demo: the sample uses one date per separator style.
    test_text: str = "今天是2022年03月30日,昨天是2022-03-29,明天是2022/03/31。"
    dates: List[datetime] = extract_dates(test_text)
    for date in dates:
        formatted = date.strftime('%Y-%m-%d')
        print(f"Found date: {formatted}")

    # Han-character detection demo.
    text1: str = "This is a te{|||| nmakldnsjdmksxm 15651654 st.把那家伙半小时·"
    text2: str = "这是一个测试。"
    for sample in (text1, text2):
        # Both samples contain Chinese characters (text1 ends with some).
        print(contains_chinese(sample))

    # Date detection demo: text3 has a date, text4 does not.
    text3: str = "今天是2022年03月30日,天气晴。"
    text4: str = "这是一个没有日期的文本。"
    for sample in (text3, text4):
        print(contains_date(sample))
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def get_text_bs4(html: str, remove_blank_lines: bool = False) -> str:
    """Extract the visible text of an HTML document with BeautifulSoup.

    :param html: the HTML markup to parse (parsed with ``html.parser``).
    :param remove_blank_lines: when true, strip every line and drop the ones
        that end up empty; otherwise only strip the text as a whole.
    :return: the extracted text.
    """
    from bs4 import BeautifulSoup

    page_text = BeautifulSoup(html, "html.parser").text
    if not remove_blank_lines:
        return page_text.strip()

    stripped_lines = (raw_line.strip() for raw_line in page_text.splitlines())
    return "\n".join(kept for kept in stripped_lines if kept)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_text_xpath(text: str) -> List[str]:
    """Return every text node of *text* using lxml and the XPath ``//text()``.

    The markup is first parsed as strict XML. If that fails with a syntax
    error (typical for real-world HTML: unclosed tags, ``&nbsp;`` ...), it is
    parsed again with lxml's lenient HTML parser instead of raising.

    :param text: XML or HTML markup.
    :return: the text nodes in document order (empty if nothing was parsed).
    """
    from lxml import etree

    try:
        tree = etree.fromstring(text)
    except etree.XMLSyntaxError:
        # Not well-formed XML: fall back to the forgiving HTML parser.
        tree = etree.HTML(text)

    if tree is None:
        # NOTE(review): the HTML parser may yield no root for blank input —
        # treated as "no text nodes"; confirm against the lxml version in use.
        return []
    return tree.xpath("//text()")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_text_scrapy(text: str) -> str:
    """Concatenate all text nodes of *text* using a Scrapy ``Selector``.

    :param text: the markup to parse.
    :return: every ``//text()`` node joined together without a separator.
    """
    from scrapy import Selector

    text_nodes = Selector(text=text).xpath("//text()").getall()
    return "".join(text_nodes)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
if __name__ == "__main__":
    # Sample markup with nested paragraphs and an inline <em>.
    text: str = "<div><p>这是段落1</p><p>这是段落2<em>2nsjkxabs</em><p>这是段落3</p></p></div>"
    # Run the Scrapy-based extractor first, then the BeautifulSoup one.
    for extractor in (get_text_scrapy, get_text_bs4):
        print(extractor(text))
    # Text nodes in the sample, presumably what get_text_xpath returns:
    # ['这是段落1', '这是段落2', '2nsjkxabs', '这是段落3']
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
def xpath_demo():
    """Print a short cheat sheet of commonly used XPath expressions.

    The sheet covers: nodes with an exact class, nodes whose class starts with
    a prefix, and the following-sibling ``p`` elements of an ``h2`` whose text
    contains a keyword.
    """
    cheat_sheet = """
类名为 cpython666 的节点 response.xpath("//*[@class='cpython666']")
类名以 cpython666 开头的节点 response.xpath("//*[starts-with(@class, 'cpython666')]")
文本包含 cpython666 的h2标签的后续兄弟p标签 response.xpath("//h2[contains(string(.), 'cpython666')]/following-sibling::p")
"""
    print(cheat_sheet)


if __name__ == "__main__":
    xpath_demo()
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def normal_text(text):
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC replaces compatibility characters with their standard equivalents
    (e.g. full-width letters become ASCII, special spaces become a plain
    space) and composes combining sequences.

    :param text: the string to normalize.
    :return: the NFKC-normalized string.
    """
    from unicodedata import normalize

    return normalize("NFKC", text)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def clean_text(text: str) -> str:
    """Normalize whitespace and tidy stray commas in *text*.

    * Every run of whitespace (including the no-break space ``\\xa0``) becomes
      a single space, and leading/trailing whitespace is dropped.
    * Commas separated only by a space (``", ,"``) collapse into one comma.
    * A space directly before a comma is removed.

    :param text: the raw text to clean.
    :return: the cleaned text.
    """
    # str.split() with no argument splits on any Unicode whitespace run
    # ("\xa0" included) and ignores leading/trailing whitespace.
    text = " ".join(text.split())
    # Collapse ", ," / ", , ," into a single comma. This has to happen before
    # the " ," fix below, which would otherwise turn ", ," into ",,".
    text = re.sub(r",(?: ,)+", ",", text)
    return text.replace(" ,", ",")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def remove_extra_spaces(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed as well.

    Example::

        >>> remove_extra_spaces("This   is  a    text")
        'This is a text'

    :param text: the input string.
    :return: the string with single spaces between words.
    """
    words = text.split()
    return " ".join(words)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def remove_extra_blank_spaces(text: str) -> str:
    """Replace each run of whitespace (spaces, tabs, newlines, ...) by one space.

    The result is also stripped of leading and trailing whitespace.

    Example::

        >>> remove_extra_blank_spaces("This \\t is\\n\\na   text ")
        'This is a text'

    :param text: the input string.
    :return: the string with whitespace runs collapsed.
    """
    import re

    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def contains_chinese(text: str) -> bool:
    """Return ``True`` if *text* contains at least one Han (Chinese) character.

    Checks the CJK Unified Ideographs block (U+4E00–U+9FFF) and CJK Unified
    Ideographs Extension A (U+3400–U+4DBF). Punctuation, kana and Latin text
    do not count.

    :param text: the text to inspect.
    :return: whether any Han character was found.
    """
    return re.search(r"[\u3400-\u4dbf\u4e00-\u9fff]", text) is not None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def contains_date(text: str) -> bool:
    """Tell whether *text* contains something shaped like a calendar date.

    A match is four digits, a separator (``-``, ``/`` or ``年``), one or two
    digits, a separator (``-``, ``/`` or ``月``), one or two digits and an
    optional ``日`` — for example ``2022年03月30日``. Only the shape is checked,
    not whether the date actually exists.

    :param text: the text to inspect.
    :return: ``True`` when such a pattern occurs anywhere in *text*.
    """
    date_shape = r"\d{4}[-/年]\d{1,2}[-/月]\d{1,2}日?"
    found = re.search(date_shape, text)
    return found is not None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
    text = "This is a text with multiple spaces."
    # Replace every run of whitespace (spaces, tabs, newlines, ...) with a
    # single space and trim both ends.
    cleaned_text = re.sub(r"\s+", " ", text).strip()

    print(cleaned_text)

    text1 = "This is a te{|||| nmakldnsjdmksxm 15651654 st.把那家伙半小时·"
    text2 = "这是一个测试。"

    print(contains_chinese(text1))  # True: text1 ends with Chinese characters
    print(contains_chinese(text2))  # True

    # Date detection demo.
    text1 = "今天是2022年03月30日,天气晴。"
    text2 = "这是一个没有日期的文本。"

    print(contains_date(text1))  # True
    print(contains_date(text2))  # False

    # Sample containing thin spaces (U+2009) and a no-break space (U+00A0),
    # which NFKC normalization maps to plain spaces.
    text = "Café['S.\u2009M. Koksbang\xa0', 'S.\u2009M. Koksbang']"
    # The normalizer defined in this module is normal_text; the previous call
    # to normalize_unicode_text referenced an undefined name.
    print(normal_text(text))
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sd_spider_utils
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: 爬虫工具包,写爬虫更快!!!
|
|
5
|
+
Author-email: 星梦 <cpython666@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://space.bilibili.com/1909782963
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Natural Language :: Chinese (Simplified)
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.5
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Requires-Python: >=3.5
|
|
19
|
+
Description-Content-Type: text/x-rst
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: requests
|
|
22
|
+
Requires-Dist: beautifulsoup4
|
|
23
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.rst
|
|
3
|
+
pyproject.toml
|
|
4
|
+
sd_spider_utils/__init__.py
|
|
5
|
+
sd_spider_utils/common_spider.py
|
|
6
|
+
sd_spider_utils/common_utils.py
|
|
7
|
+
sd_spider_utils/data_utils.py
|
|
8
|
+
sd_spider_utils/datetime_utils.py
|
|
9
|
+
sd_spider_utils/parse_utils.py
|
|
10
|
+
sd_spider_utils/spider_demos.py
|
|
11
|
+
sd_spider_utils/text_utils.py
|
|
12
|
+
sd_spider_utils/url_utils.py
|
|
13
|
+
sd_spider_utils.egg-info/PKG-INFO
|
|
14
|
+
sd_spider_utils.egg-info/SOURCES.txt
|
|
15
|
+
sd_spider_utils.egg-info/dependency_links.txt
|
|
16
|
+
sd_spider_utils.egg-info/requires.txt
|
|
17
|
+
sd_spider_utils.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sd_spider_utils
|