PyPI - PgsFile - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

PgsFile 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of PgsFile might be problematic. Click here for more details.

Files changed (7)

PgsFile/PgsFile.py CHANGED Viewed

@@ -358,6 +358,20 @@ def write_to_json_lines(json_path,my_json_data):
             file.write(json_str + '\n')
     file.close()
+# Function to append a dictionary to a JSON file
+def append_dict_to_json(file_path, data_dict):
+    try:
+        import json
+        with open(file_path, 'a', encoding="utf-8") as file:
+            json_string = json.dumps(data_dict, ensure_ascii=False)
+            file.write(json_string + '\n')
+        # print(f"Dictionary appended to {file_path}")
+    except IOError as e:
+        print(f"An I/O error occurred: {e}")
+    except Exception as e:
+        print(f"An error occurred: {e}")
 def FilePath(root):
     '''读取所有文件，列出每个文件的路径'''
     import os
@@ -553,19 +567,6 @@ def next_folder_names(folder):
     folder_namelist=next(os.walk(folder))[1]
     return folder_namelist
-def get_package_path(package_name):
-    import site
-    import os
-    package_paths=site.getsitepackages()
-    package_path=None
-    for path in package_paths:
-        if os.path.exists(os.path.join(path, package_name)):
-            package_path=os.path.join(path, package_name)
-            break
-    if package_path is None:
-        raise ModuleNotFoundError(f"Package '{package_name}' not found.")
-    return package_path
 def remove_empty_txts(folder_path):
     import os
@@ -631,77 +632,6 @@ def remove_empty_last_line(folder_path):
                 f2.write(lines[i])
         f2.close()
     print(end_empty_files,str(len(end_empty_files))+" files found with last line empty!")
-corpus_root=get_package_path('PgsFile')+"/Corpora"
-def extract_stopwords(lang=None):
-    '''
-    Parameters
-    ----------
-    lang : TYPE, optional string
-        DESCRIPTION. The default is None.
-        lang="english"; lang="chinese" etc.
-    Returns
-    -------
-    contents : TYPE list
-        DESCRIPTION. ["'ll", "'tis", "'twas", "'ve", '10', '39', 'a', "a's", 'able', 'ableabout', 'about', 'above', 'abroad', 'abst', 'accordance', 'according']
-    '''
-    import os
-    # Check if the folder exists
-    if not os.path.isdir(corpus_root):
-        print(f"Error: The folder '{corpus_root}' does not exist.")
-        return None
-    if lang is None:
-         language="english"
-    else:
-         language=lang
-    file_name=language+".txt"
-    # Traverse the folder recursively
-    for root, dirs, files in os.walk(corpus_root):
-        # Check if the text file exists in the current folder
-        if file_name in files:
-            # Construct the full path to the text file
-            file_path=os.path.join(root, file_name)
-            # Read the contents of the text file
-            contents=[line.strip() for line in get_data_lines(file_path)]
-            return contents
-    # If the text file doesn't exist in any folder, print an error message
-    print(f"Error: The file '{file_name}' does not exist in the folder '{corpus_root}' or its sub-folders.")
-    return None
-pickle_root=get_package_path('PgsFile')+"/models"
-def load_pickle_data(lang=None):
-    '''
-    Parameters
-    ----------
-    lang : TYPE, optional
-        DESCRIPTION. The default is None.
-        lang="english"; lang="chinese" etc.
-    Returns
-    -------
-    data : TYPE
-        DESCRIPTION.
-    '''
-    import pickle
-    files=FilePath(pickle_root)
-    if lang is None:
-         language="english"
-    else:
-         language=lang
-    file_path=""
-    for file in files:
-         if language in FileName(file):
-              file_path=file
-    with open(file_path, 'rb') as handle:
-        data=pickle.load(handle)
-    return data
 def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
     """
@@ -731,8 +661,14 @@ def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
 # Standard sentence tokenizer.
 def sent_tokenize(text, lang=None):
-    tokenizer=load_pickle_data(lang)
-    return tokenizer.tokenize(text)
+    import pysbd
+    if lang is None:
+        lang="en"
+    else:
+        lang=lang
+    seg = pysbd.Segmenter(language=lang, clean=False)
+    sent_list = seg.segment(text)
+    return sent_list
 def cs(para):
     """
@@ -757,18 +693,7 @@ def cs(para):
     return paras
-def cs1(para): #英文分句
-    """
-    #英文分句
-    using nltk model
-    ---------
-    Returns
-    list
-    """
-    return sent_tokenize(para)
-def cs2(text):
+def cs1(text):
     """
     #英文分句
     using regular expression
@@ -811,7 +736,7 @@ def cs2(text):
     sentences=sentences[:-1]
     sentences=[s.strip() for s in sentences]
     if len(sentences)==0:
-        sentences=cs1(text)
+        sentences=sent_tokenize(text)
     else:
         sentences=sentences
     return sentences
@@ -1246,6 +1171,11 @@ import requests
 from lxml import html, etree
 import pandas as pd
 my_headers={"User-Agent": random.choice(yhd)}
+from fake_useragent import UserAgent
+ua = UserAgent()
+headers = {"User-Agent": ua.random}
 class PGScraper(object):
     def __init__(self):
         self.pattern=[]
@@ -1409,8 +1339,8 @@ Showing download progress and speed when audio-visual files like MP4, MP3, JPG e
 import time
 from contextlib import closing
-def audiovisual_downloader(url, path):
-    with closing(requests.get(url, stream=True, headers=my_headers)) as r:
+def audiovisual_downloader(url, path, headers=None):
+    with closing(requests.get(url, stream=True, headers=headers)) as r:
         chunk_size=1024*10
         content_size=int(r.headers['content-length'])
         print('Initiating download...')
@@ -1697,7 +1627,7 @@ def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=Non
                 r.encoding="utf-8"
                 data=r.text
                 html=etree.HTML(data)
-                return html
+                return html, data
         else:
             print(r.status_code, "Can not find the page!")
             return None
@@ -1713,8 +1643,9 @@ def find_table_with_most_rows(tables):
             max_table_index=i
     return max_table_index, max_rows if max_table_index!= -1 else None
-def get_data_table(url, output_file, most_rows=True):
+def get_data_table_url(url, output_file, most_rows=True):
     try:
+        # Wrap the HTML string in a StringIO object
         tables=pd.read_html(url)
         if most_rows==False:
             # 1. default: the first table
@@ -1724,6 +1655,26 @@ def get_data_table(url, output_file, most_rows=True):
             target_table=find_table_with_most_rows(tables)[0] #  (1, 32)
             df=tables[target_table]
+        df.to_excel(output_file, index=False)
+        print(f"Data has been saved to {output_file}")
+    except Exception as err:
+        print(f"Errors found! {err}")
+        return None
+def get_data_table_html_string(html_string, output_file, most_rows=True):
+    try:
+        # Wrap the HTML string in a StringIO object
+        from io import StringIO
+        html_io = StringIO(html_string)
+        tables=pd.read_html(html_io)
+        if most_rows==False:
+            # 1. default: the first table
+            df=tables[0]
+        else:
+            # 2. get the table with most rows
+            target_table=find_table_with_most_rows(tables)[0] #  (1, 32)
+            df=tables[target_table]
         df.to_excel(output_file, index=False)
         print(f"Data has been saved to {output_file}")
     except Exception as err:

PgsFile/__init__.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # 1. Web scraping
 from .PgsFile import PGScraper
 from .PgsFile import audiovisual_downloader
+from .PgsFile import headers
 # 2. Package/library management
 from .PgsFile import install_package, uninstall_package
@@ -9,21 +10,22 @@ from .PgsFile import run_script, run_command
 # 3. Text data retrieval
 from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
 from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
-from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
+from .PgsFile import get_data_html_online, get_data_html_offline
+from .PgsFile import get_data_table_url, get_data_table_html_string
 # 4. Text data storage
-from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, save_dict_to_excel
+from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
 # 5. File/folder process
 from .PgsFile import FilePath, FileName, DirList
-from .PgsFile import get_subfolder_path, get_package_path
+from .PgsFile import get_subfolder_path
 from .PgsFile import makedirec, makefile
-from .PgsFile import source_path, next_folder_names, corpus_root, get_directory_tree_with_meta, find_txt_files_with_keyword
+from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
 from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
 # 6. Data cleaning
 from .PgsFile import BigPunctuation, StopTags, Special, yhd
-from .PgsFile import ZhStopWords, EnPunctuation, extract_stopwords
+from .PgsFile import ZhStopWords, EnPunctuation
 from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
 from .PgsFile import check_contain_chinese, check_contain_number
 from .PgsFile import replace_chinese_punctuation_with_english
@@ -35,7 +37,7 @@ from .PgsFile import extract_chinese_punctuation, generate_password, sort_string
 from .PgsFile import strQ2B_raw, strQ2B_words
 from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
 from .PgsFile import word_list, batch_word_list
-from .PgsFile import cs, cs1, cs2
+from .PgsFile import cs, cs1, sent_tokenize
 # 8. Maths
 from .PgsFile import len_rows, check_empty_cells

{PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: PgsFile
-Version: 0.1.6
+Version: 0.1.8
 Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
 Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
 Author: Pan Guisheng
@@ -17,8 +17,10 @@ Requires-Dist: pandas
 Requires-Dist: python-docx
 Requires-Dist: pip
 Requires-Dist: requests
+Requires-Dist: fake-useragent
 Requires-Dist: lxml
 Requires-Dist: pimht
+Requires-Dist: pysbd
 Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.

{PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
-PgsFile/PgsFile.py,sha256=jmSiczDE5cV47tHpCGDwLn19C90NGQtQ2vEn4ys4NUg,80514
-PgsFile/__init__.py,sha256=EKhIRd2tktjyrvBlBPgQsIJTqU7DdLIobNG8gEiZ--0,2163
+PgsFile/PgsFile.py,sha256=70VLJybFPPbaxPxb5z_vnbEZG9fVgkO7nxcxI77auUI,79354
+PgsFile/__init__.py,sha256=vJqj1rfxK-2UrS4m3nCg0yglqsF_8P-DPAnDgxTdAj8,2227
 PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
 PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
 PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
 PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
-PgsFile-0.1.6.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
-PgsFile-0.1.6.dist-info/METADATA,sha256=T0mBPq7PnljEcGjLItIJ3RIcZk7veOuy0vVgLuo31lo,4902
-PgsFile-0.1.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-PgsFile-0.1.6.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
-PgsFile-0.1.6.dist-info/RECORD,,
+PgsFile-0.1.8.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
+PgsFile-0.1.8.dist-info/METADATA,sha256=oSsVQL9ZZBLhk83DN22SrtrSh3r_V8D7ILYfHv1SJbo,4955
+PgsFile-0.1.8.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+PgsFile-0.1.8.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
+PgsFile-0.1.8.dist-info/RECORD,,

{PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/LICENSE RENAMED Viewed

File without changes

{PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/WHEEL RENAMED Viewed

File without changes

{PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/top_level.txt RENAMED Viewed

File without changes

PgsFile 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

Potentially problematic release.

PgsFile 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl