oafuncs 0.0.97.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oafuncs/oa_down/idm.py ADDED
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """
4
+ Author: Liu Kun && 16031215@qq.com
5
+ Date: 2025-01-11 16:19:12
6
+ LastEditors: Liu Kun && 16031215@qq.com
7
+ LastEditTime: 2025-01-11 16:25:47
8
+ FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\idm.py
9
+ Description:
10
+ EditPlatform: vscode
11
+ ComputerInfo: XPS 15 9510
12
+ SystemInfo: Windows 11
13
+ Python Version: 3.12
14
+ """
15
+
16
+ import datetime
17
+ import os
18
+ from subprocess import call
19
+
20
+ from rich import print
21
+
22
+ __all__ = ["downloader"]
23
+
24
+
25
def downloader(task_url, folder_path, file_name, idm_engine=r"D:\Programs\Internet Download Manager\IDMan.exe"):
    """
    Description:
        Hand a download over to Internet Download Manager (IDM): the task is
        added to the IDM queue and the queue is then started.
    Parameter:
        task_url: str
            The download link of the file.
        folder_path: str
            The folder the file is saved in. It is created when missing.
        file_name: str
            The name the downloaded file is saved under.
        idm_engine: str
            The path of the IDM executable ("IDMan.exe").
    Return:
        None
    Example:
        downloader("https://www.test.com/data.nc", "E:\\Data", "test.nc", "D:\\Programs\\Internet Download Manager\\IDMan.exe")
    """
    os.makedirs(folder_path, exist_ok=True)

    # Add the task to the IDM queue.
    add_task_cmd = [idm_engine, "/d", task_url, "/p", folder_path, "/f", file_name, "/a"]
    call(add_task_cmd)

    # Start the IDM task queue.
    start_queue_cmd = [idm_engine, "/s"]
    call(start_queue_cmd)

    # Timestamp framed by two separator rules, then the confirmation message.
    rule = "[purple]-" * 50
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(rule + f"\n{stamp}\n" + rule)
    print(f"[green]IDM Downloader: {file_name} download task has been added to the queue...[/green]")
@@ -0,0 +1,288 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """
4
+ Author: Liu Kun && 16031215@qq.com
5
+ Date: 2024-11-28 10:42:56
6
+ LastEditors: Liu Kun && 16031215@qq.com
7
+ LastEditTime: 2025-01-05 10:51:42
8
+ FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\literature.py
9
+ Description:
10
+ EditPlatform: vscode
11
+ ComputerInfo: XPS 15 9510
12
+ SystemInfo: Windows 11
13
+ Python Version: 3.12
14
+ """
15
+
16
+ import os
17
+ import re
18
+ import time
19
+ from pathlib import Path
20
+
21
+ import pandas as pd
22
+ import requests
23
+ from rich import print
24
+ from rich.progress import track
25
+ from oafuncs.oa_down.user_agent import get_ua
26
+ from oafuncs.oa_file import remove
27
+ from oafuncs.oa_data import ensure_list
28
+
29
+ __all__ = ["download5doi"]
30
+
31
+
32
def _get_file_size(file_path, unit="KB"):
    """
    Return the size of ``file_path`` converted to ``unit``.

    Parameters:
        file_path: path of the file to measure.
        unit: one of "PB", "TB", "GB", "MB", "KB" (1024-based).

    Returns:
        float, the size in the requested unit. Problems are reported as a
        message string instead of an exception: one message when the file does
        not exist, another when ``unit`` is not supported.
    """
    # A missing file is reported with a message string, not an exception.
    if not os.path.exists(file_path):
        return "文件不存在"

    # Size in bytes.
    size_in_bytes = os.path.getsize(file_path)

    # Each unit is a power of 1024 bytes.
    power_of_1024 = {"PB": 5, "TB": 4, "GB": 3, "MB": 2, "KB": 1}
    power = power_of_1024.get(unit)

    # An unsupported unit is reported with a message string as well.
    if power is None:
        return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"

    return size_in_bytes / 1024**power
57
+
58
+
59
class _Downloader:
    """
    Download the PDF of one paper, identified by its DOI, from a list of
    Sci-Hub style mirror sites.

    The mirrors in ``url_list`` are tried in order. For each mirror the landing
    page ``<mirror>/<doi>`` is fetched once and searched for the PDF link; when
    a link is found the PDF download is attempted up to
    ``try_times_each_url_max`` times. A downloaded file smaller than
    ``check_size`` KB is treated as a failed download and deleted. When every
    mirror fails, the DOI is appended to ``wrong_record.txt`` in the store
    folder.
    """

    def __init__(self, doi, store_path):
        """
        Parameters:
            doi: str, the DOI of the paper.
            store_path: str or Path, the folder the PDF is written to.
        """
        # Mirror sites, tried in this order.
        self.url_list = [
            r"https://sci-hub.se",
            r"https://sci-hub.ren",
            r"https://sci-hub.st",
            r"https://sci-hub.ru",  # the one that works best
            # ------------------------------------- the sites below have not been verified
            r"https://sci-hub.wf",
            r"https://sci-hub.yt",
            r"https://sci-hub.ee",
            r"https://sci-hub.cat",
            r"https://sci-hub.in",
            r"https://www.pismin.com",
            r"https://sci-hub.vkif.top",
            r"https://www.bothonce.com",
            r"https://sci-hub.et-fine.com",
            r"https://sci-hub.hkvisa.net",
            # r"https://sci-hub.3800808.com",  # this one only allows saving manually
            r"https://sci-hub.zidianzhan.net",
            r"https://sci-hub.usualwant.com",
        ]
        self.base_url = None
        self.url = None
        self.doi = doi
        self.pdf_url = None
        self.pdf_path = None
        self.headers = {"User-Agent": get_ua().encode("utf-8")}
        # A DOI may contain characters that are illegal in file names, e.g.
        # 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2 -> replace them with "_".
        self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
        self.store_path = Path(store_path)
        self.fpath = self.store_path / self.fname
        self.wrong_record_file = self.store_path / "wrong_record.txt"
        # Seconds to pause after every download attempt.
        self.sleep = 5
        # Seconds to wait on a silent server; without a timeout requests.get
        # can block forever and stall the whole batch.
        self.timeout = 60
        self.cookies = None
        # Minimum plausible PDF size in KB; anything smaller is discarded.
        self.check_size = 50
        self.url_index = 0
        self.try_times_each_url_max = 3
        self.try_times = 0

    def get_pdf_url(self):
        """
        Fetch the landing page ``self.url`` and store the PDF link it contains
        in ``self.pdf_url``.

        ``self.pdf_url`` is ``None`` afterwards when the page could not be
        retrieved or does not contain a download link. The cookies of the
        landing page response are kept in ``self.cookies`` for the download.
        """
        # Forget a link found on an earlier mirror so it is never reused here.
        self.pdf_url = None
        print("[bold #E6E6FA]-" * 120)
        print(f"DOI: {self.doi}")
        print(f"Requesting: {self.url}...")
        try:
            response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        except Exception as e:
            # Best effort: any failure just means this mirror is skipped.
            print(f"Failed to retrieve the webpage. Error: {e}")
            return

        if response.status_code != 200:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
            return

        self.cookies = response.cookies
        # Backslashes are removed before the download button is searched for.
        text = response.text.replace("\\", "")
        pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
        match = pattern.search(text)
        if not match:
            print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
            return

        got_url = match.group(1)
        if got_url.startswith("//"):
            # Protocol-relative link.
            self.pdf_url = "https:" + got_url
        elif got_url.startswith("http"):
            # Already absolute.
            self.pdf_url = got_url
        else:
            # Site-relative link.
            self.pdf_url = self.base_url + got_url
        print(f"URL: {self.pdf_url}")

    def url_iterate(self):
        """
        Point the downloader at the mirror selected by ``self.url_index`` and
        look up the PDF link there. Does nothing when the index is past the end
        of ``url_list``.
        """
        if self.url_index >= len(self.url_list):
            return
        url = self.url_list[self.url_index]
        self.base_url = url
        self.url = url + "/" + self.doi
        self.get_pdf_url()

    def write_wrong_record(self):
        """
        Append the DOI to ``wrong_record.txt`` unless it is already listed.
        """
        if self.wrong_record_file.exists():
            with open(self.wrong_record_file, "r") as f:
                # Strip the line endings, otherwise the DOI never matches.
                recorded = {line.strip() for line in f}
            if self.doi.strip() in recorded:
                return
        with open(self.wrong_record_file, "a") as f:
            f.write(self.doi + "\n")

    def _download_from_current_url(self):
        """
        Try to download ``self.pdf_url`` into ``self.fpath``.

        Returns:
            True when a file of plausible size was saved, False when the mirror
            answered with a non-200 status or every attempt failed.
        """
        mirror = self.url_list[self.url_index]
        while self.try_times < self.try_times_each_url_max:
            self.try_times += 1
            print(f"Downloading: {self.fname}...")
            succeeded = False
            give_up_mirror = False
            try:
                response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=self.timeout)
                if response.status_code == 200:
                    with open(self.fpath, "wb") as f:
                        f.write(response.content)
                    fsize = _get_file_size(self.fpath, unit="KB")
                    if fsize < self.check_size:
                        # Too small to be a real PDF: delete it and retry.
                        os.remove(self.fpath)
                        print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
                    else:
                        print(f"[bold green]Sucessful to download {self.fpath}")
                        succeeded = True
                else:
                    # A non-200 answer will not improve on retry; move on.
                    give_up_mirror = True
                    print(f"Failed to download the PDF file. Status code: {response.status_code}")
                    print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
            except Exception as e:
                # Best effort: report the problem and retry.
                print(f"Failed to download the PDF file. Error: {e}")
            # Pause after every attempt, successful or not.
            time.sleep(self.sleep)
            if succeeded:
                return True
            if give_up_mirror:
                return False
        print(f"Tried {self.try_times} times for {mirror}.")
        return False

    def download_pdf(self):
        """
        Download the PDF for ``self.doi`` into ``self.fpath``.

        An existing file of plausible size is kept and nothing is downloaded;
        an existing file that is too small is deleted first. Each mirror is
        then tried in turn. When all of them fail the DOI is recorded through
        ``write_wrong_record``.
        """
        if self.fpath.exists():
            fsize = _get_file_size(self.fpath, unit="KB")
            if fsize < self.check_size:
                # Delete the broken file and download again.
                os.remove(self.fpath)
                print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
            else:
                print("[bold #E6E6FA]-" * 120)
                print(f"[bold purple]The PDF file {self.fpath} already exists.")
                return

        last_index = len(self.url_list) - 1
        for index in range(len(self.url_list)):
            self.url_index = index
            # Every mirror gets its own full set of attempts.
            self.try_times = 0
            self.url_iterate()
            if not self.pdf_url:
                continue
            if self._download_from_current_url():
                return
            if index < last_index:
                print("Try another URL...")

        print("\n[bold #CD5C5C]Failed to download the PDF file.")
        self.write_wrong_record()
224
+
225
+
226
def _read_excel(file, col_name=r"DOI"):
    """
    Read one column of an Excel file and return its values as a list.

    Parameters:
        file: path of the Excel file.
        col_name: name of the column to read. Default is "DOI".

    Returns:
        list of the column values, without the entries whose string form is
        "nan".
    """
    column_values = pd.read_excel(file)[col_name].tolist()
    kept = []
    for value in column_values:
        # Drop NaN entries.
        if str(value) == "nan":
            continue
        kept.append(value)
    return kept
232
+
233
+
234
def _read_txt(file):
    """
    Read a text file and return its non-blank lines.

    Parameters:
        file: path of the text file.

    Returns:
        list of str, each line with surrounding whitespace removed; lines that
        are empty after stripping are left out.
    """
    entries = []
    with open(file, "r") as handle:
        for raw in handle:
            # Remove the line break and skip blank lines.
            text = raw.strip()
            if text:
                entries.append(text)
    return entries
240
+
241
+
242
def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r"DOI"):
    """
    Description:
        Download PDF files by DOI.

    Parameters:
        store_path: str, The path to store the PDF files. Default is the current working directory.
        doi_list: list or str, The list of DOIs.
        txt_file: str, The path of the txt file that contains the DOIs.
        excel_file: str, The path of the excel file that contains the DOIs.
        col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.

        When several DOI sources are given only one is used:
        excel_file takes precedence over txt_file, which takes precedence over doi_list.

    Returns:
        None

    Raises:
        ValueError: if none of doi_list, txt_file and excel_file is given.

    Example:
        download5doi(doi_list='10.3389/feart.2021.698876')
        download5doi(store_path='I:\\Delete\\ref_pdf', doi_list='10.3389/feart.2021.698876')
        download5doi(store_path='I:\\Delete\\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
        download5doi(store_path='I:\\Delete\\ref_pdf', txt_file='I:\\Delete\\ref_pdf\\wrong_record.txt')
        download5doi(store_path='I:\\Delete\\ref_pdf', excel_file='I:\\Delete\\ref_pdf\\wrong_record.xlsx')
        download5doi(store_path='I:\\Delete\\ref_pdf', excel_file='I:\\Delete\\ref_pdf\\wrong_record.xlsx', col_name='DOI')
    """
    # Collect the DOIs first, so that nothing on disk is touched when the
    # input is unusable. The DOIs are read before wrong_record.txt is removed
    # below, which is what makes re-running from that very file work.
    if doi_list:
        doi_list = ensure_list(doi_list)
    if txt_file:
        doi_list = _read_txt(txt_file)
    if excel_file:
        doi_list = _read_excel(excel_file, col_name)
    if doi_list is None:
        raise ValueError("No DOI given: please provide doi_list, txt_file or excel_file.")

    if not store_path:
        store_path = Path.cwd()
    else:
        store_path = Path(str(store_path))
        store_path.mkdir(parents=True, exist_ok=True)
    store_path = str(store_path)

    # Start with a fresh failure record for this run.
    remove(Path(store_path) / "wrong_record.txt")
    print(f"Downloading {len(doi_list)} PDF files...")
    for doi in track(doi_list, description="Downloading..."):
        download = _Downloader(doi, store_path)
        download.download_pdf()
283
+
284
+
285
if __name__ == "__main__":
    # Manual run: download every DOI listed in a local Excel export.
    excel_file = r"F:\AAA-Delete\DOI_Reference\5\savedrecs.xls"
    store_path = r"F:\AAA-Delete\DOI_Reference\5\pdf"
    download5doi(store_path=store_path, excel_file=excel_file)
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """
4
+ Author: Liu Kun && 16031215@qq.com
5
+ Date: 2024-12-01 19:32:25
6
+ LastEditors: Liu Kun && 16031215@qq.com
7
+ LastEditTime: 2024-12-10 11:16:36
8
+ FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\test.py
9
+ Description:
10
+ EditPlatform: vscode
11
+ ComputerInfo: XPS 15 9510
12
+ SystemInfo: Windows 11
13
+ Python Version: 3.12
14
+ """
15
+
16
+ import os
17
+ import random
18
+ import re
19
+
20
+
21
# Simple format check for User-Agent strings: a leading "Product/version"
# token, optionally followed by a parenthesised comment, an engine token and a
# trailing "Name/version" token taken from a list of common browser names.
_UA_PATTERN = re.compile(
    r"^(?:(?:Mozilla|Opera|Chrome|Safari|Edg|OPR)/[\d.]+)"
    r"(?:\s(?:\(.*?\)))?"
    r"(?:\s(?:Gecko|AppleWebKit|KHTML, like Gecko|Version|Edge|OPR)/[\d.]+)?"
    r"(?:\s.*?(?:rv:|Version/|Ubuntu|Macintosh|Windows|X11|Linux|CrOS|FreeBSD|OpenBSD|NetBSD|iPhone|iPad|iPod|Android|BlackBerry|BB10|Mobile|Symbian|Windows Phone|IEMobile|Opera Mini|Opera Mobi|UCBrowser|MQQBrowser|baiduboxapp|baidubrowser|Safari|Firefox|MSIE|Trident|Edge|EdgA|Chrome|CriOS|Vivaldi|Sleipnir|Midori|ELinks|Lynx|w3m|Arora|Epiphany|Konqueror|Dillo|Netscape|SeaMonkey|K-Meleon|Camino|Iceape|Galeon|GranParadiso|Iceweasel|Firefox|Fennec|Conkeror|PaleMoon|Uzbl|QupZilla|Otter|Waterfox|Basilisk|Cyberfox|PaleMoon|GNU IceCat|GNU IceWeasel|IceCat|IceWeasel|Seamonkey|Iceape|Firefox|Epiphany|Web|Safari|Android|Mobile|BlackBerry|BB10|Tablet|Silk|Kindle|FxiOS|Focus|SamsungBrowser|browser|AppleWebKit|Puffin|DuckDuckGo|YaBrowser|Yandex|Amigo|NokiaBrowser|OviBrowser|OneBrowser|Chrome|Firefox|Safari|OPR|Coast|Mercury|Silk|Skyfire|IEMobile|Bolt|Jasmine|NativeHost|Crosswalk|TizenBrowser|SailfishBrowser|SamsungBrowser|Silk-Accelerated|UCBrowser|Quark|XiaoMi|OnePlus|Vivo|Oppo|Realme|Meizu|Lenovo|Huawei|ZTE|Alcatel|Sony|Nokia|LG|HTC|Asus|Acer|Motorola|Samsung)/[\d.]+)?$"
)


def is_valid_user_agent(user_agent):
    """
    Tell whether ``user_agent`` looks like a common browser User-Agent string.

    Parameters:
        user_agent: str, the User-Agent string to check.

    Returns:
        bool, True when the string matches the format pattern, else False.
    """
    return _UA_PATTERN.match(user_agent) is not None
36
+
37
+
38
def get_ua():
    """
    Return a random User-Agent string.

    The candidates are the non-blank lines of "User_Agent-list.txt", a file
    expected in the same folder as this module.

    Returns:
        str, one randomly chosen line, stripped of surrounding whitespace.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = []
    with open(os.path.join(module_dir, "User_Agent-list.txt"), "r") as handle:
        for raw in handle:
            # Remove the line break and skip blank lines.
            entry = raw.strip()
            if entry:
                candidates.append(entry)
    return random.choice(candidates)
48
+
49
+
50
def get_ua_org(newtxtfile=None):
    """
    Return a random User-Agent string from the built-in list.

    Parameters:
        newtxtfile: str or None, optional. When given, the whole built-in list
            is also written to this file, one User-Agent per line. Default is
            None: nothing is written. (The function used to rely on a global
            of this name that is only defined when the module runs as a
            script, so a normal call raised NameError.)

    Returns:
        str, one randomly chosen User-Agent.
    """
    ua_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/UCWEB7.0.2.37/28/999",
        "Openwave/UCWEB7.0.2.37/28/999",
        "Openwave/UCWEB7.0.2.37/28/999",
    ]
    # Dump the list only when the caller asks for it.
    if newtxtfile is not None:
        with open(newtxtfile, "w") as f:
            f.writelines(line + "\n" for line in ua_list)
    return random.choice(ua_list)
122
+
123
+
124
+ # get_ua_org()
125
+
126
if __name__ == "__main__":
    # Maintenance script: keep only the entries of the User-Agent list that
    # pass is_valid_user_agent and write them to a new file.
    txtfile = r"E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\User_Agent-list.txt"
    newtxtfile = r"E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\ua_list_new.txt"

    # Read the list, removing line breaks and blank lines.
    with open(txtfile, "r") as f:
        lines = [raw.strip() for raw in f if raw.strip()]

    new_line = []
    for line in lines:
        if not is_valid_user_agent(line):
            # Report every rejected entry.
            print(f"Invalid User-Agent: {line}")
            continue
        new_line.append(line)

    with open(newtxtfile, "w") as f:
        f.writelines(line + "\n" for line in new_line)
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """
4
+ Author: Liu Kun && 16031215@qq.com
5
+ Date: 2024-12-26 08:06:34
6
+ LastEditors: Liu Kun && 16031215@qq.com
7
+ LastEditTime: 2024-12-26 08:06:34
8
+ FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\user_agent.py
9
+ Description:
10
+ EditPlatform: vscode
11
+ ComputerInfo: XPS 15 9510
12
+ SystemInfo: Windows 11
13
+ Python Version: 3.12
14
+ """
15
+ import os
16
+ import random
17
+
18
+
19
+ __all__ = ["get_ua"]
20
+
21
+
22
def get_ua():
    """
    Pick a random User-Agent string from "User_Agent-list.txt".

    The file is looked up in the folder that contains this module; blank lines
    are ignored and surrounding whitespace is removed from every entry.

    Returns:
        str, the chosen User-Agent.
    """
    ua_file_txt = os.path.join(os.path.dirname(os.path.abspath(__file__)), "User_Agent-list.txt")

    with open(ua_file_txt, "r") as handle:
        stripped_lines = [raw.strip() for raw in handle]

    # Drop the lines that were blank.
    pool = [entry for entry in stripped_lines if entry]
    return random.choice(pool)
+ return random.choice(ua_list)