PgsFile 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of PgsFile might be problematic. Click here for more details.

PgsFile/PgsFile.py CHANGED
@@ -1165,12 +1165,25 @@ def run_command(command: str) -> str:
1165
1165
 
1166
1166
  return output_str
1167
1167
 
1168
+ # Import the urllib.parse module to handle URL encoding
1169
+ import urllib.parse
1170
+ # Define a function to URL-encode a Chinese keyword
1171
+ def encode_chinese_keyword_for_url(chinese_keyword):
1172
+ # Use urllib.parse.quote to encode the Chinese keyword
1173
+ encoded_keyword = urllib.parse.quote(chinese_keyword)
1174
+ # Return the encoded keyword
1175
+ return encoded_keyword
1168
1176
 
1169
1177
  import random
1170
1178
  import requests
1171
1179
  from lxml import html, etree
1172
1180
  import pandas as pd
1173
1181
  my_headers={"User-Agent": random.choice(yhd)}
1182
+
1183
+ from fake_useragent import UserAgent
1184
+ ua = UserAgent()
1185
+ headers = {"User-Agent": ua.random}
1186
+
1174
1187
  class PGScraper(object):
1175
1188
  def __init__(self):
1176
1189
  self.pattern=[]
@@ -1334,8 +1347,8 @@ Showing download progress and speed when audio-visual files like MP4, MP3, JPG e
1334
1347
  import time
1335
1348
  from contextlib import closing
1336
1349
 
1337
- def audiovisual_downloader(url, path):
1338
- with closing(requests.get(url, stream=True, headers=my_headers)) as r:
1350
+ def audiovisual_downloader(url, path, headers=None):
1351
+ with closing(requests.get(url, stream=True, headers=headers)) as r:
1339
1352
  chunk_size=1024*10
1340
1353
  content_size=int(r.headers['content-length'])
1341
1354
  print('Initiating download...')
@@ -1622,7 +1635,7 @@ def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=Non
1622
1635
  r.encoding="utf-8"
1623
1636
  data=r.text
1624
1637
  html=etree.HTML(data)
1625
- return html
1638
+ return html, data
1626
1639
  else:
1627
1640
  print(r.status_code, "Can not find the page!")
1628
1641
  return None
@@ -1638,8 +1651,9 @@ def find_table_with_most_rows(tables):
1638
1651
  max_table_index=i
1639
1652
  return max_table_index, max_rows if max_table_index!= -1 else None
1640
1653
 
1641
- def get_data_table(url, output_file, most_rows=True):
1654
+ def get_data_table_url(url, output_file, most_rows=True):
1642
1655
  try:
1656
+ # Wrap the HTML string in a StringIO object
1643
1657
  tables=pd.read_html(url)
1644
1658
  if most_rows==False:
1645
1659
  # 1. default: the first table
@@ -1649,6 +1663,26 @@ def get_data_table(url, output_file, most_rows=True):
1649
1663
  target_table=find_table_with_most_rows(tables)[0] # (1, 32)
1650
1664
  df=tables[target_table]
1651
1665
 
1666
+ df.to_excel(output_file, index=False)
1667
+ print(f"Data has been saved to {output_file}")
1668
+ except Exception as err:
1669
+ print(f"Errors found! {err}")
1670
+ return None
1671
+
1672
+ def get_data_table_html_string(html_string, output_file, most_rows=True):
1673
+ try:
1674
+ # Wrap the HTML string in a StringIO object
1675
+ from io import StringIO
1676
+ html_io = StringIO(html_string)
1677
+ tables=pd.read_html(html_io)
1678
+ if most_rows==False:
1679
+ # 1. default: the first table
1680
+ df=tables[0]
1681
+ else:
1682
+ # 2. get the table with most rows
1683
+ target_table=find_table_with_most_rows(tables)[0] # (1, 32)
1684
+ df=tables[target_table]
1685
+
1652
1686
  df.to_excel(output_file, index=False)
1653
1687
  print(f"Data has been saved to {output_file}")
1654
1688
  except Exception as err:
PgsFile/__init__.py CHANGED
@@ -1,6 +1,7 @@
1
1
  # 1. Web scraping
2
2
  from .PgsFile import PGScraper
3
3
  from .PgsFile import audiovisual_downloader
4
+ from .PgsFile import headers, encode_chinese_keyword_for_url
4
5
 
5
6
  # 2. Package/library management
6
7
  from .PgsFile import install_package, uninstall_package
@@ -9,7 +10,8 @@ from .PgsFile import run_script, run_command
9
10
  # 3. Text data retrieval
10
11
  from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
11
12
  from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
12
- from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
13
+ from .PgsFile import get_data_html_online, get_data_html_offline
14
+ from .PgsFile import get_data_table_url, get_data_table_html_string
13
15
 
14
16
  # 4. Text data storage
15
17
  from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PgsFile
3
- Version: 0.1.7
3
+ Version: 0.1.9
4
4
  Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
5
- Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
5
+ Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
6
6
  Author: Pan Guisheng
7
7
  Author-email: 895284504@qq.com
8
8
  License: Educational free
@@ -17,6 +17,7 @@ Requires-Dist: pandas
17
17
  Requires-Dist: python-docx
18
18
  Requires-Dist: pip
19
19
  Requires-Dist: requests
20
+ Requires-Dist: fake-useragent
20
21
  Requires-Dist: lxml
21
22
  Requires-Dist: pimht
22
23
  Requires-Dist: pysbd
@@ -1,5 +1,5 @@
1
- PgsFile/PgsFile.py,sha256=6CXBDn3VC4gUkigNVCkM9eVPOe4Xyww32tG0ZDeYNfI,78446
2
- PgsFile/__init__.py,sha256=TaKrLI0pGAFm_2Bzjf_cGnog_URzaAgHRW5myzY0Lz8,2144
1
+ PgsFile/PgsFile.py,sha256=Tm7-TkW4dpRtLGRR06t6jjsga8MJelu6BeirtvHURxc,79720
2
+ PgsFile/__init__.py,sha256=PJ8pJVly_6qNe4zEWp5Q5kLdy0rNcyilM-bbBemxhl4,2259
3
3
  PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
4
4
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
5
5
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
2618
2618
  PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
2619
2619
  PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
2620
2620
  PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
2621
- PgsFile-0.1.7.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
- PgsFile-0.1.7.dist-info/METADATA,sha256=0HAA5A68yHiB-LVlNuF-pkKo_lawzwTU-Thf-i2FiUY,4924
2623
- PgsFile-0.1.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
- PgsFile-0.1.7.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
- PgsFile-0.1.7.dist-info/RECORD,,
2621
+ PgsFile-0.1.9.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
+ PgsFile-0.1.9.dist-info/METADATA,sha256=JAqaoghX_comHPuhW5pb3UskF65jKoJv9RTR0FYZbEA,4983
2623
+ PgsFile-0.1.9.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
+ PgsFile-0.1.9.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
+ PgsFile-0.1.9.dist-info/RECORD,,