PgsFile 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of PgsFile might be problematic. (The original page linked to more details here; the link target is not preserved in this text.)

PgsFile/PgsFile.py CHANGED
@@ -1171,6 +1171,11 @@ import requests
1171
1171
  from lxml import html, etree
1172
1172
  import pandas as pd
1173
1173
  my_headers={"User-Agent": random.choice(yhd)}
1174
+
1175
+ from fake_useragent import UserAgent
1176
+ ua = UserAgent()
1177
+ headers = {"User-Agent": ua.random}
1178
+
1174
1179
  class PGScraper(object):
1175
1180
  def __init__(self):
1176
1181
  self.pattern=[]
@@ -1334,8 +1339,8 @@ Showing download progress and speed when audio-visual files like MP4, MP3, JPG e
1334
1339
  import time
1335
1340
  from contextlib import closing
1336
1341
 
1337
- def audiovisual_downloader(url, path):
1338
- with closing(requests.get(url, stream=True, headers=my_headers)) as r:
1342
+ def audiovisual_downloader(url, path, headers=None):
1343
+ with closing(requests.get(url, stream=True, headers=headers)) as r:
1339
1344
  chunk_size=1024*10
1340
1345
  content_size=int(r.headers['content-length'])
1341
1346
  print('Initiating download...')
@@ -1622,7 +1627,7 @@ def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=Non
1622
1627
  r.encoding="utf-8"
1623
1628
  data=r.text
1624
1629
  html=etree.HTML(data)
1625
- return html
1630
+ return html, data
1626
1631
  else:
1627
1632
  print(r.status_code, "Can not find the page!")
1628
1633
  return None
@@ -1638,8 +1643,9 @@ def find_table_with_most_rows(tables):
1638
1643
  max_table_index=i
1639
1644
  return max_table_index, max_rows if max_table_index!= -1 else None
1640
1645
 
1641
- def get_data_table(url, output_file, most_rows=True):
1646
+ def get_data_table_url(url, output_file, most_rows=True):
1642
1647
  try:
1648
+ # Wrap the HTML string in a StringIO object
1643
1649
  tables=pd.read_html(url)
1644
1650
  if most_rows==False:
1645
1651
  # 1. default: the first table
@@ -1649,6 +1655,26 @@ def get_data_table(url, output_file, most_rows=True):
1649
1655
  target_table=find_table_with_most_rows(tables)[0] # (1, 32)
1650
1656
  df=tables[target_table]
1651
1657
 
1658
+ df.to_excel(output_file, index=False)
1659
+ print(f"Data has been saved to {output_file}")
1660
+ except Exception as err:
1661
+ print(f"Errors found! {err}")
1662
+ return None
1663
+
1664
+ def get_data_table_html_string(html_string, output_file, most_rows=True):
1665
+ try:
1666
+ # Wrap the HTML string in a StringIO object
1667
+ from io import StringIO
1668
+ html_io = StringIO(html_string)
1669
+ tables=pd.read_html(html_io)
1670
+ if most_rows==False:
1671
+ # 1. default: the first table
1672
+ df=tables[0]
1673
+ else:
1674
+ # 2. get the table with most rows
1675
+ target_table=find_table_with_most_rows(tables)[0] # (1, 32)
1676
+ df=tables[target_table]
1677
+
1652
1678
  df.to_excel(output_file, index=False)
1653
1679
  print(f"Data has been saved to {output_file}")
1654
1680
  except Exception as err:
PgsFile/__init__.py CHANGED
@@ -1,6 +1,7 @@
1
1
  # 1. Web scraping
2
2
  from .PgsFile import PGScraper
3
3
  from .PgsFile import audiovisual_downloader
4
+ from .PgsFile import headers
4
5
 
5
6
  # 2. Package/library management
6
7
  from .PgsFile import install_package, uninstall_package
@@ -9,7 +10,8 @@ from .PgsFile import run_script, run_command
9
10
  # 3. Text data retrieval
10
11
  from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
11
12
  from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
12
- from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
13
+ from .PgsFile import get_data_html_online, get_data_html_offline
14
+ from .PgsFile import get_data_table_url, get_data_table_html_string
13
15
 
14
16
  # 4. Text data storage
15
17
  from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PgsFile
3
- Version: 0.1.7
3
+ Version: 0.1.8
4
4
  Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
5
5
  Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
6
6
  Author: Pan Guisheng
@@ -17,6 +17,7 @@ Requires-Dist: pandas
17
17
  Requires-Dist: python-docx
18
18
  Requires-Dist: pip
19
19
  Requires-Dist: requests
20
+ Requires-Dist: fake-useragent
20
21
  Requires-Dist: lxml
21
22
  Requires-Dist: pimht
22
23
  Requires-Dist: pysbd
@@ -1,5 +1,5 @@
1
- PgsFile/PgsFile.py,sha256=6CXBDn3VC4gUkigNVCkM9eVPOe4Xyww32tG0ZDeYNfI,78446
2
- PgsFile/__init__.py,sha256=TaKrLI0pGAFm_2Bzjf_cGnog_URzaAgHRW5myzY0Lz8,2144
1
+ PgsFile/PgsFile.py,sha256=70VLJybFPPbaxPxb5z_vnbEZG9fVgkO7nxcxI77auUI,79354
2
+ PgsFile/__init__.py,sha256=vJqj1rfxK-2UrS4m3nCg0yglqsF_8P-DPAnDgxTdAj8,2227
3
3
  PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
4
4
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
5
5
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
2618
2618
  PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
2619
2619
  PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
2620
2620
  PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
2621
- PgsFile-0.1.7.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
- PgsFile-0.1.7.dist-info/METADATA,sha256=0HAA5A68yHiB-LVlNuF-pkKo_lawzwTU-Thf-i2FiUY,4924
2623
- PgsFile-0.1.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
- PgsFile-0.1.7.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
- PgsFile-0.1.7.dist-info/RECORD,,
2621
+ PgsFile-0.1.8.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
+ PgsFile-0.1.8.dist-info/METADATA,sha256=oSsVQL9ZZBLhk83DN22SrtrSh3r_V8D7ILYfHv1SJbo,4955
2623
+ PgsFile-0.1.8.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
+ PgsFile-0.1.8.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
+ PgsFile-0.1.8.dist-info/RECORD,,