PgsFile 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic. Click here for more details.
- PgsFile/PgsFile.py +30 -4
- PgsFile/__init__.py +3 -1
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/METADATA +2 -1
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/RECORD +7 -7
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/LICENSE +0 -0
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/WHEEL +0 -0
- {PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
|
@@ -1171,6 +1171,11 @@ import requests
|
|
|
1171
1171
|
from lxml import html, etree
|
|
1172
1172
|
import pandas as pd
|
|
1173
1173
|
my_headers={"User-Agent": random.choice(yhd)}
|
|
1174
|
+
|
|
1175
|
+
from fake_useragent import UserAgent
|
|
1176
|
+
ua = UserAgent()
|
|
1177
|
+
headers = {"User-Agent": ua.random}
|
|
1178
|
+
|
|
1174
1179
|
class PGScraper(object):
|
|
1175
1180
|
def __init__(self):
|
|
1176
1181
|
self.pattern=[]
|
|
@@ -1334,8 +1339,8 @@ Showing download progress and speed when audio-visual files like MP4, MP3, JPG e
|
|
|
1334
1339
|
import time
|
|
1335
1340
|
from contextlib import closing
|
|
1336
1341
|
|
|
1337
|
-
def audiovisual_downloader(url, path):
|
|
1338
|
-
with closing(requests.get(url, stream=True, headers=
|
|
1342
|
+
def audiovisual_downloader(url, path, headers=None):
|
|
1343
|
+
with closing(requests.get(url, stream=True, headers=headers)) as r:
|
|
1339
1344
|
chunk_size=1024*10
|
|
1340
1345
|
content_size=int(r.headers['content-length'])
|
|
1341
1346
|
print('Initiating download...')
|
|
@@ -1622,7 +1627,7 @@ def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=Non
|
|
|
1622
1627
|
r.encoding="utf-8"
|
|
1623
1628
|
data=r.text
|
|
1624
1629
|
html=etree.HTML(data)
|
|
1625
|
-
return html
|
|
1630
|
+
return html, data
|
|
1626
1631
|
else:
|
|
1627
1632
|
print(r.status_code, "Can not find the page!")
|
|
1628
1633
|
return None
|
|
@@ -1638,8 +1643,9 @@ def find_table_with_most_rows(tables):
|
|
|
1638
1643
|
max_table_index=i
|
|
1639
1644
|
return max_table_index, max_rows if max_table_index!= -1 else None
|
|
1640
1645
|
|
|
1641
|
-
def get_data_table(url, output_file, most_rows=True):
|
|
1646
|
+
def get_data_table_url(url, output_file, most_rows=True):
|
|
1642
1647
|
try:
|
|
1648
|
+
# Wrap the HTML string in a StringIO object
|
|
1643
1649
|
tables=pd.read_html(url)
|
|
1644
1650
|
if most_rows==False:
|
|
1645
1651
|
# 1. default: the first table
|
|
@@ -1649,6 +1655,26 @@ def get_data_table(url, output_file, most_rows=True):
|
|
|
1649
1655
|
target_table=find_table_with_most_rows(tables)[0] # (1, 32)
|
|
1650
1656
|
df=tables[target_table]
|
|
1651
1657
|
|
|
1658
|
+
df.to_excel(output_file, index=False)
|
|
1659
|
+
print(f"Data has been saved to {output_file}")
|
|
1660
|
+
except Exception as err:
|
|
1661
|
+
print(f"Errors found! {err}")
|
|
1662
|
+
return None
|
|
1663
|
+
|
|
1664
|
+
def get_data_table_html_string(html_string, output_file, most_rows=True):
|
|
1665
|
+
try:
|
|
1666
|
+
# Wrap the HTML string in a StringIO object
|
|
1667
|
+
from io import StringIO
|
|
1668
|
+
html_io = StringIO(html_string)
|
|
1669
|
+
tables=pd.read_html(html_io)
|
|
1670
|
+
if most_rows==False:
|
|
1671
|
+
# 1. default: the first table
|
|
1672
|
+
df=tables[0]
|
|
1673
|
+
else:
|
|
1674
|
+
# 2. get the table with most rows
|
|
1675
|
+
target_table=find_table_with_most_rows(tables)[0] # (1, 32)
|
|
1676
|
+
df=tables[target_table]
|
|
1677
|
+
|
|
1652
1678
|
df.to_excel(output_file, index=False)
|
|
1653
1679
|
print(f"Data has been saved to {output_file}")
|
|
1654
1680
|
except Exception as err:
|
PgsFile/__init__.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# 1. Web scraping
|
|
2
2
|
from .PgsFile import PGScraper
|
|
3
3
|
from .PgsFile import audiovisual_downloader
|
|
4
|
+
from .PgsFile import headers
|
|
4
5
|
|
|
5
6
|
# 2. Package/library management
|
|
6
7
|
from .PgsFile import install_package, uninstall_package
|
|
@@ -9,7 +10,8 @@ from .PgsFile import run_script, run_command
|
|
|
9
10
|
# 3. Text data retrieval
|
|
10
11
|
from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
|
|
11
12
|
from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
|
|
12
|
-
from .PgsFile import get_data_html_online, get_data_html_offline
|
|
13
|
+
from .PgsFile import get_data_html_online, get_data_html_offline
|
|
14
|
+
from .PgsFile import get_data_table_url, get_data_table_html_string
|
|
13
15
|
|
|
14
16
|
# 4. Text data storage
|
|
15
17
|
from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PgsFile
|
|
3
|
-
Version: 0.1.7
|
|
3
|
+
Version: 0.1.8
|
|
4
4
|
Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
|
|
5
5
|
Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
|
|
6
6
|
Author: Pan Guisheng
|
|
@@ -17,6 +17,7 @@ Requires-Dist: pandas
|
|
|
17
17
|
Requires-Dist: python-docx
|
|
18
18
|
Requires-Dist: pip
|
|
19
19
|
Requires-Dist: requests
|
|
20
|
+
Requires-Dist: fake-useragent
|
|
20
21
|
Requires-Dist: lxml
|
|
21
22
|
Requires-Dist: pimht
|
|
22
23
|
Requires-Dist: pysbd
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
PgsFile/PgsFile.py,sha256=
|
|
2
|
-
PgsFile/__init__.py,sha256=
|
|
1
|
+
PgsFile/PgsFile.py,sha256=70VLJybFPPbaxPxb5z_vnbEZG9fVgkO7nxcxI77auUI,79354
|
|
2
|
+
PgsFile/__init__.py,sha256=vJqj1rfxK-2UrS4m3nCg0yglqsF_8P-DPAnDgxTdAj8,2227
|
|
3
3
|
PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
|
|
4
4
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
|
|
5
5
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
|
|
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
|
|
|
2618
2618
|
PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
|
|
2619
2619
|
PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
|
|
2620
2620
|
PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
|
|
2621
|
-
PgsFile-0.1.
|
|
2622
|
-
PgsFile-0.1.
|
|
2623
|
-
PgsFile-0.1.
|
|
2624
|
-
PgsFile-0.1.
|
|
2625
|
-
PgsFile-0.1.
|
|
2621
|
+
PgsFile-0.1.8.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
|
|
2622
|
+
PgsFile-0.1.8.dist-info/METADATA,sha256=oSsVQL9ZZBLhk83DN22SrtrSh3r_V8D7ILYfHv1SJbo,4955
|
|
2623
|
+
PgsFile-0.1.8.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
2624
|
+
PgsFile-0.1.8.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
|
|
2625
|
+
PgsFile-0.1.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|