PyPI - PgsFile - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

PgsFile 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of PgsFile might be problematic. Click here for more details.

Files changed (7)

PgsFile/PgsFile.py CHANGED Viewed

@@ -1171,6 +1171,11 @@ import requests
 from lxml import html, etree
 import pandas as pd
 my_headers={"User-Agent": random.choice(yhd)}
+from fake_useragent import UserAgent
+ua = UserAgent()
+headers = {"User-Agent": ua.random}
 class PGScraper(object):
     def __init__(self):
         self.pattern=[]
@@ -1334,8 +1339,8 @@ Showing download progress and speed when audio-visual files like MP4, MP3, JPG e
 import time
 from contextlib import closing
-def audiovisual_downloader(url, path):
-    with closing(requests.get(url, stream=True, headers=my_headers)) as r:
+def audiovisual_downloader(url, path, headers=None):
+    with closing(requests.get(url, stream=True, headers=headers)) as r:
         chunk_size=1024*10
         content_size=int(r.headers['content-length'])
         print('Initiating download...')
@@ -1622,7 +1627,7 @@ def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=Non
                 r.encoding="utf-8"
                 data=r.text
                 html=etree.HTML(data)
-                return html
+                return html, data
         else:
             print(r.status_code, "Can not find the page!")
             return None
@@ -1638,8 +1643,9 @@ def find_table_with_most_rows(tables):
             max_table_index=i
     return max_table_index, max_rows if max_table_index!= -1 else None
-def get_data_table(url, output_file, most_rows=True):
+def get_data_table_url(url, output_file, most_rows=True):
     try:
+        # Wrap the HTML string in a StringIO object
         tables=pd.read_html(url)
         if most_rows==False:
             # 1. default: the first table
@@ -1649,6 +1655,26 @@ def get_data_table(url, output_file, most_rows=True):
             target_table=find_table_with_most_rows(tables)[0] #  (1, 32)
             df=tables[target_table]
+        df.to_excel(output_file, index=False)
+        print(f"Data has been saved to {output_file}")
+    except Exception as err:
+        print(f"Errors found! {err}")
+        return None
+def get_data_table_html_string(html_string, output_file, most_rows=True):
+    try:
+        # Wrap the HTML string in a StringIO object
+        from io import StringIO
+        html_io = StringIO(html_string)
+        tables=pd.read_html(html_io)
+        if most_rows==False:
+            # 1. default: the first table
+            df=tables[0]
+        else:
+            # 2. get the table with most rows
+            target_table=find_table_with_most_rows(tables)[0] #  (1, 32)
+            df=tables[target_table]
         df.to_excel(output_file, index=False)
         print(f"Data has been saved to {output_file}")
     except Exception as err:

PgsFile/__init__.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # 1. Web scraping
 from .PgsFile import PGScraper
 from .PgsFile import audiovisual_downloader
+from .PgsFile import headers
 # 2. Package/library management
 from .PgsFile import install_package, uninstall_package
@@ -9,7 +10,8 @@ from .PgsFile import run_script, run_command
 # 3. Text data retrieval
 from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
 from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
-from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
+from .PgsFile import get_data_html_online, get_data_html_offline
+from .PgsFile import get_data_table_url, get_data_table_html_string
 # 4. Text data storage
 from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel

{PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: PgsFile
-Version: 0.1.7
+Version: 0.1.8
 Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
 Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
 Author: Pan Guisheng
@@ -17,6 +17,7 @@ Requires-Dist: pandas
 Requires-Dist: python-docx
 Requires-Dist: pip
 Requires-Dist: requests
+Requires-Dist: fake-useragent
 Requires-Dist: lxml
 Requires-Dist: pimht
 Requires-Dist: pysbd

{PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
-PgsFile/PgsFile.py,sha256=6CXBDn3VC4gUkigNVCkM9eVPOe4Xyww32tG0ZDeYNfI,78446
-PgsFile/__init__.py,sha256=TaKrLI0pGAFm_2Bzjf_cGnog_URzaAgHRW5myzY0Lz8,2144
+PgsFile/PgsFile.py,sha256=70VLJybFPPbaxPxb5z_vnbEZG9fVgkO7nxcxI77auUI,79354
+PgsFile/__init__.py,sha256=vJqj1rfxK-2UrS4m3nCg0yglqsF_8P-DPAnDgxTdAj8,2227
 PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
 PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
 PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
 PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
-PgsFile-0.1.7.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
-PgsFile-0.1.7.dist-info/METADATA,sha256=0HAA5A68yHiB-LVlNuF-pkKo_lawzwTU-Thf-i2FiUY,4924
-PgsFile-0.1.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-PgsFile-0.1.7.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
-PgsFile-0.1.7.dist-info/RECORD,,
+PgsFile-0.1.8.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
+PgsFile-0.1.8.dist-info/METADATA,sha256=oSsVQL9ZZBLhk83DN22SrtrSh3r_V8D7ILYfHv1SJbo,4955
+PgsFile-0.1.8.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+PgsFile-0.1.8.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
+PgsFile-0.1.8.dist-info/RECORD,,

{PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/LICENSE RENAMED Viewed

File without changes

{PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/WHEEL RENAMED Viewed

File without changes

{PgsFile-0.1.7.dist-info → PgsFile-0.1.8.dist-info}/top_level.txt RENAMED Viewed

File without changes

PgsFile 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

Potentially problematic release.

PgsFile 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl