PgsFile 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic. Click here for more details.
- PgsFile/PgsFile.py +38 -4
- PgsFile/__init__.py +3 -1
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.9.dist-info}/METADATA +3 -2
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.9.dist-info}/RECORD +7 -7
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.9.dist-info}/LICENSE +0 -0
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.9.dist-info}/WHEEL +0 -0
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.9.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
|
@@ -1165,12 +1165,25 @@ def run_command(command: str) -> str:
|
|
|
1165
1165
|
|
|
1166
1166
|
return output_str
|
|
1167
1167
|
|
|
1168
|
+
# urllib.parse provides percent-encoding for URL components
import urllib.parse


def encode_chinese_keyword_for_url(chinese_keyword, safe="/"):
    """Percent-encode a keyword so it can be embedded in a URL.

    Non-ASCII text (for example Chinese) is encoded as UTF-8 and every
    byte is written as ``%XX`` by ``urllib.parse.quote``.

    Args:
        chinese_keyword: The text (``str`` or ``bytes``) to encode.
        safe: Characters that are left unescaped. Defaults to ``"/"``,
            the default of ``urllib.parse.quote``, so calls that pass
            only the keyword behave exactly as before. Pass ``safe=""``
            when the keyword is used as a query-string value so that
            ``/`` is escaped as well.

    Returns:
        The percent-encoded keyword as a ``str``.

    Raises:
        TypeError: If ``chinese_keyword`` is neither ``str`` nor
            ``bytes`` (raised by ``urllib.parse.quote``).
    """
    return urllib.parse.quote(chinese_keyword, safe=safe)
|
|
1168
1176
|
|
|
1169
1177
|
import random
|
|
1170
1178
|
import requests
|
|
1171
1179
|
from lxml import html, etree
|
|
1172
1180
|
import pandas as pd
|
|
1173
1181
|
my_headers={"User-Agent": random.choice(yhd)}
|
|
1182
|
+
|
|
1183
|
+
from fake_useragent import UserAgent
|
|
1184
|
+
ua = UserAgent()
|
|
1185
|
+
headers = {"User-Agent": ua.random}
|
|
1186
|
+
|
|
1174
1187
|
class PGScraper(object):
|
|
1175
1188
|
def __init__(self):
|
|
1176
1189
|
self.pattern=[]
|
|
@@ -1334,8 +1347,8 @@ Showing download progress and speed when audio-visual files like MP4, MP3, JPG e
|
|
|
1334
1347
|
import time
|
|
1335
1348
|
from contextlib import closing
|
|
1336
1349
|
|
|
1337
|
-
def audiovisual_downloader(url, path):
|
|
1338
|
-
with closing(requests.get(url, stream=True, headers=
|
|
1350
|
+
def audiovisual_downloader(url, path, headers=None):
|
|
1351
|
+
with closing(requests.get(url, stream=True, headers=headers)) as r:
|
|
1339
1352
|
chunk_size=1024*10
|
|
1340
1353
|
content_size=int(r.headers['content-length'])
|
|
1341
1354
|
print('Initiating download...')
|
|
@@ -1622,7 +1635,7 @@ def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=Non
|
|
|
1622
1635
|
r.encoding="utf-8"
|
|
1623
1636
|
data=r.text
|
|
1624
1637
|
html=etree.HTML(data)
|
|
1625
|
-
return html
|
|
1638
|
+
return html, data
|
|
1626
1639
|
else:
|
|
1627
1640
|
print(r.status_code, "Can not find the page!")
|
|
1628
1641
|
return None
|
|
@@ -1638,8 +1651,9 @@ def find_table_with_most_rows(tables):
|
|
|
1638
1651
|
max_table_index=i
|
|
1639
1652
|
return max_table_index, max_rows if max_table_index!= -1 else None
|
|
1640
1653
|
|
|
1641
|
-
def
|
|
1654
|
+
def get_data_table_url(url, output_file, most_rows=True):
|
|
1642
1655
|
try:
|
|
1656
|
+
# pd.read_html fetches and parses the URL directly; no StringIO wrapping is needed here
|
|
1643
1657
|
tables=pd.read_html(url)
|
|
1644
1658
|
if most_rows==False:
|
|
1645
1659
|
# 1. default: the first table
|
|
@@ -1649,6 +1663,26 @@ def get_data_table(url, output_file, most_rows=True):
|
|
|
1649
1663
|
target_table=find_table_with_most_rows(tables)[0] # (1, 32)
|
|
1650
1664
|
df=tables[target_table]
|
|
1651
1665
|
|
|
1666
|
+
df.to_excel(output_file, index=False)
|
|
1667
|
+
print(f"Data has been saved to {output_file}")
|
|
1668
|
+
except Exception as err:
|
|
1669
|
+
print(f"Errors found! {err}")
|
|
1670
|
+
return None
|
|
1671
|
+
|
|
1672
|
+
def get_data_table_html_string(html_string, output_file, most_rows=True):
|
|
1673
|
+
try:
|
|
1674
|
+
# Wrap the HTML string in a StringIO object
|
|
1675
|
+
from io import StringIO
|
|
1676
|
+
html_io = StringIO(html_string)
|
|
1677
|
+
tables=pd.read_html(html_io)
|
|
1678
|
+
if most_rows==False:
|
|
1679
|
+
# 1. default: the first table
|
|
1680
|
+
df=tables[0]
|
|
1681
|
+
else:
|
|
1682
|
+
# 2. get the table with most rows
|
|
1683
|
+
target_table=find_table_with_most_rows(tables)[0] # (1, 32)
|
|
1684
|
+
df=tables[target_table]
|
|
1685
|
+
|
|
1652
1686
|
df.to_excel(output_file, index=False)
|
|
1653
1687
|
print(f"Data has been saved to {output_file}")
|
|
1654
1688
|
except Exception as err:
|
PgsFile/__init__.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# 1. Web scraping
|
|
2
2
|
from .PgsFile import PGScraper
|
|
3
3
|
from .PgsFile import audiovisual_downloader
|
|
4
|
+
from .PgsFile import headers, encode_chinese_keyword_for_url
|
|
4
5
|
|
|
5
6
|
# 2. Package/library management
|
|
6
7
|
from .PgsFile import install_package, uninstall_package
|
|
@@ -9,7 +10,8 @@ from .PgsFile import run_script, run_command
|
|
|
9
10
|
# 3. Text data retrieval
|
|
10
11
|
from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
|
|
11
12
|
from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
|
|
12
|
-
from .PgsFile import get_data_html_online, get_data_html_offline
|
|
13
|
+
from .PgsFile import get_data_html_online, get_data_html_offline
|
|
14
|
+
from .PgsFile import get_data_table_url, get_data_table_html_string
|
|
13
15
|
|
|
14
16
|
# 4. Text data storage
|
|
15
17
|
from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PgsFile
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.9
|
|
4
4
|
Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
|
|
5
|
-
Home-page: https://mp.weixin.qq.com/s/
|
|
5
|
+
Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
|
|
6
6
|
Author: Pan Guisheng
|
|
7
7
|
Author-email: 895284504@qq.com
|
|
8
8
|
License: Educational free
|
|
@@ -17,6 +17,7 @@ Requires-Dist: pandas
|
|
|
17
17
|
Requires-Dist: python-docx
|
|
18
18
|
Requires-Dist: pip
|
|
19
19
|
Requires-Dist: requests
|
|
20
|
+
Requires-Dist: fake-useragent
|
|
20
21
|
Requires-Dist: lxml
|
|
21
22
|
Requires-Dist: pimht
|
|
22
23
|
Requires-Dist: pysbd
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
PgsFile/PgsFile.py,sha256=
|
|
2
|
-
PgsFile/__init__.py,sha256=
|
|
1
|
+
PgsFile/PgsFile.py,sha256=Tm7-TkW4dpRtLGRR06t6jjsga8MJelu6BeirtvHURxc,79720
|
|
2
|
+
PgsFile/__init__.py,sha256=PJ8pJVly_6qNe4zEWp5Q5kLdy0rNcyilM-bbBemxhl4,2259
|
|
3
3
|
PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
|
|
4
4
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
|
|
5
5
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
|
|
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
|
|
|
2618
2618
|
PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
|
|
2619
2619
|
PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
|
|
2620
2620
|
PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
|
|
2621
|
-
PgsFile-0.1.
|
|
2622
|
-
PgsFile-0.1.
|
|
2623
|
-
PgsFile-0.1.
|
|
2624
|
-
PgsFile-0.1.
|
|
2625
|
-
PgsFile-0.1.
|
|
2621
|
+
PgsFile-0.1.9.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
|
|
2622
|
+
PgsFile-0.1.9.dist-info/METADATA,sha256=JAqaoghX_comHPuhW5pb3UskF65jKoJv9RTR0FYZbEA,4983
|
|
2623
|
+
PgsFile-0.1.9.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
2624
|
+
PgsFile-0.1.9.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
|
|
2625
|
+
PgsFile-0.1.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|