PgsFile 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic. Click here for more details.
- PgsFile/PgsFile.py +54 -103
- PgsFile/__init__.py +8 -6
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/METADATA +3 -1
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/RECORD +7 -7
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/LICENSE +0 -0
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/WHEEL +0 -0
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.8.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
|
@@ -358,6 +358,20 @@ def write_to_json_lines(json_path,my_json_data):
|
|
|
358
358
|
file.write(json_str + '\n')
|
|
359
359
|
file.close()
|
|
360
360
|
|
|
361
|
+
|
|
362
|
+
# Function to append a dictionary to a JSON file
|
|
363
|
+
def append_dict_to_json(file_path, data_dict):
|
|
364
|
+
try:
|
|
365
|
+
import json
|
|
366
|
+
with open(file_path, 'a', encoding="utf-8") as file:
|
|
367
|
+
json_string = json.dumps(data_dict, ensure_ascii=False)
|
|
368
|
+
file.write(json_string + '\n')
|
|
369
|
+
# print(f"Dictionary appended to {file_path}")
|
|
370
|
+
except IOError as e:
|
|
371
|
+
print(f"An I/O error occurred: {e}")
|
|
372
|
+
except Exception as e:
|
|
373
|
+
print(f"An error occurred: {e}")
|
|
374
|
+
|
|
361
375
|
def FilePath(root):
|
|
362
376
|
'''读取所有文件,列出每个文件的路径'''
|
|
363
377
|
import os
|
|
@@ -553,19 +567,6 @@ def next_folder_names(folder):
|
|
|
553
567
|
folder_namelist=next(os.walk(folder))[1]
|
|
554
568
|
return folder_namelist
|
|
555
569
|
|
|
556
|
-
def get_package_path(package_name):
|
|
557
|
-
import site
|
|
558
|
-
import os
|
|
559
|
-
package_paths=site.getsitepackages()
|
|
560
|
-
package_path=None
|
|
561
|
-
for path in package_paths:
|
|
562
|
-
if os.path.exists(os.path.join(path, package_name)):
|
|
563
|
-
package_path=os.path.join(path, package_name)
|
|
564
|
-
break
|
|
565
|
-
|
|
566
|
-
if package_path is None:
|
|
567
|
-
raise ModuleNotFoundError(f"Package '{package_name}' not found.")
|
|
568
|
-
return package_path
|
|
569
570
|
|
|
570
571
|
def remove_empty_txts(folder_path):
|
|
571
572
|
import os
|
|
@@ -631,77 +632,6 @@ def remove_empty_last_line(folder_path):
|
|
|
631
632
|
f2.write(lines[i])
|
|
632
633
|
f2.close()
|
|
633
634
|
print(end_empty_files,str(len(end_empty_files))+" files found with last line empty!")
|
|
634
|
-
|
|
635
|
-
corpus_root=get_package_path('PgsFile')+"/Corpora"
|
|
636
|
-
def extract_stopwords(lang=None):
|
|
637
|
-
'''
|
|
638
|
-
Parameters
|
|
639
|
-
----------
|
|
640
|
-
lang : TYPE, optional string
|
|
641
|
-
DESCRIPTION. The default is None.
|
|
642
|
-
lang="english"; lang="chinese" etc.
|
|
643
|
-
|
|
644
|
-
Returns
|
|
645
|
-
-------
|
|
646
|
-
contents : TYPE list
|
|
647
|
-
DESCRIPTION. ["'ll", "'tis", "'twas", "'ve", '10', '39', 'a', "a's", 'able', 'ableabout', 'about', 'above', 'abroad', 'abst', 'accordance', 'according']
|
|
648
|
-
|
|
649
|
-
'''
|
|
650
|
-
import os
|
|
651
|
-
# Check if the folder exists
|
|
652
|
-
if not os.path.isdir(corpus_root):
|
|
653
|
-
print(f"Error: The folder '{corpus_root}' does not exist.")
|
|
654
|
-
return None
|
|
655
|
-
|
|
656
|
-
if lang is None:
|
|
657
|
-
language="english"
|
|
658
|
-
else:
|
|
659
|
-
language=lang
|
|
660
|
-
file_name=language+".txt"
|
|
661
|
-
|
|
662
|
-
# Traverse the folder recursively
|
|
663
|
-
for root, dirs, files in os.walk(corpus_root):
|
|
664
|
-
# Check if the text file exists in the current folder
|
|
665
|
-
if file_name in files:
|
|
666
|
-
# Construct the full path to the text file
|
|
667
|
-
file_path=os.path.join(root, file_name)
|
|
668
|
-
# Read the contents of the text file
|
|
669
|
-
contents=[line.strip() for line in get_data_lines(file_path)]
|
|
670
|
-
return contents
|
|
671
|
-
|
|
672
|
-
# If the text file doesn't exist in any folder, print an error message
|
|
673
|
-
print(f"Error: The file '{file_name}' does not exist in the folder '{corpus_root}' or its sub-folders.")
|
|
674
|
-
return None
|
|
675
|
-
|
|
676
|
-
pickle_root=get_package_path('PgsFile')+"/models"
|
|
677
|
-
def load_pickle_data(lang=None):
|
|
678
|
-
'''
|
|
679
|
-
Parameters
|
|
680
|
-
----------
|
|
681
|
-
lang : TYPE, optional
|
|
682
|
-
DESCRIPTION. The default is None.
|
|
683
|
-
lang="english"; lang="chinese" etc.
|
|
684
|
-
Returns
|
|
685
|
-
-------
|
|
686
|
-
data : TYPE
|
|
687
|
-
DESCRIPTION.
|
|
688
|
-
|
|
689
|
-
'''
|
|
690
|
-
import pickle
|
|
691
|
-
files=FilePath(pickle_root)
|
|
692
|
-
if lang is None:
|
|
693
|
-
language="english"
|
|
694
|
-
else:
|
|
695
|
-
language=lang
|
|
696
|
-
file_path=""
|
|
697
|
-
for file in files:
|
|
698
|
-
if language in FileName(file):
|
|
699
|
-
file_path=file
|
|
700
|
-
with open(file_path, 'rb') as handle:
|
|
701
|
-
data=pickle.load(handle)
|
|
702
|
-
return data
|
|
703
|
-
|
|
704
|
-
|
|
705
635
|
|
|
706
636
|
def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
|
|
707
637
|
"""
|
|
@@ -731,8 +661,14 @@ def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
|
|
|
731
661
|
|
|
732
662
|
# Standard sentence tokenizer.
|
|
733
663
|
def sent_tokenize(text, lang=None):
|
|
734
|
-
|
|
735
|
-
|
|
664
|
+
import pysbd
|
|
665
|
+
if lang is None:
|
|
666
|
+
lang="en"
|
|
667
|
+
else:
|
|
668
|
+
lang=lang
|
|
669
|
+
seg = pysbd.Segmenter(language=lang, clean=False)
|
|
670
|
+
sent_list = seg.segment(text)
|
|
671
|
+
return sent_list
|
|
736
672
|
|
|
737
673
|
def cs(para):
|
|
738
674
|
"""
|
|
@@ -757,18 +693,7 @@ def cs(para):
|
|
|
757
693
|
return paras
|
|
758
694
|
|
|
759
695
|
|
|
760
|
-
def cs1(para):
|
|
761
|
-
"""
|
|
762
|
-
#英文分句
|
|
763
|
-
using nltk model
|
|
764
|
-
---------
|
|
765
|
-
Returns
|
|
766
|
-
list
|
|
767
|
-
"""
|
|
768
|
-
return sent_tokenize(para)
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
def cs2(text):
|
|
696
|
+
def cs1(text):
|
|
772
697
|
"""
|
|
773
698
|
#英文分句
|
|
774
699
|
using regular expression
|
|
@@ -811,7 +736,7 @@ def cs2(text):
|
|
|
811
736
|
sentences=sentences[:-1]
|
|
812
737
|
sentences=[s.strip() for s in sentences]
|
|
813
738
|
if len(sentences)==0:
|
|
814
|
-
sentences=
|
|
739
|
+
sentences=sent_tokenize(text)
|
|
815
740
|
else:
|
|
816
741
|
sentences=sentences
|
|
817
742
|
return sentences
|
|
@@ -1246,6 +1171,11 @@ import requests
|
|
|
1246
1171
|
from lxml import html, etree
|
|
1247
1172
|
import pandas as pd
|
|
1248
1173
|
my_headers={"User-Agent": random.choice(yhd)}
|
|
1174
|
+
|
|
1175
|
+
from fake_useragent import UserAgent
|
|
1176
|
+
ua = UserAgent()
|
|
1177
|
+
headers = {"User-Agent": ua.random}
|
|
1178
|
+
|
|
1249
1179
|
class PGScraper(object):
|
|
1250
1180
|
def __init__(self):
|
|
1251
1181
|
self.pattern=[]
|
|
@@ -1409,8 +1339,8 @@ Showing download progress and speed when audio-visual files like MP4, MP3, JPG e
|
|
|
1409
1339
|
import time
|
|
1410
1340
|
from contextlib import closing
|
|
1411
1341
|
|
|
1412
|
-
def audiovisual_downloader(url, path):
|
|
1413
|
-
with closing(requests.get(url, stream=True, headers=
|
|
1342
|
+
def audiovisual_downloader(url, path, headers=None):
|
|
1343
|
+
with closing(requests.get(url, stream=True, headers=headers)) as r:
|
|
1414
1344
|
chunk_size=1024*10
|
|
1415
1345
|
content_size=int(r.headers['content-length'])
|
|
1416
1346
|
print('Initiating download...')
|
|
@@ -1697,7 +1627,7 @@ def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=Non
|
|
|
1697
1627
|
r.encoding="utf-8"
|
|
1698
1628
|
data=r.text
|
|
1699
1629
|
html=etree.HTML(data)
|
|
1700
|
-
return html
|
|
1630
|
+
return html, data
|
|
1701
1631
|
else:
|
|
1702
1632
|
print(r.status_code, "Can not find the page!")
|
|
1703
1633
|
return None
|
|
@@ -1713,8 +1643,9 @@ def find_table_with_most_rows(tables):
|
|
|
1713
1643
|
max_table_index=i
|
|
1714
1644
|
return max_table_index, max_rows if max_table_index!= -1 else None
|
|
1715
1645
|
|
|
1716
|
-
def get_data_table(url, output_file, most_rows=True):
|
|
1646
|
+
def get_data_table_url(url, output_file, most_rows=True):
|
|
1717
1647
|
try:
|
|
1648
|
+
# Wrap the HTML string in a StringIO object
|
|
1718
1649
|
tables=pd.read_html(url)
|
|
1719
1650
|
if most_rows==False:
|
|
1720
1651
|
# 1. default: the first table
|
|
@@ -1724,6 +1655,26 @@ def get_data_table(url, output_file, most_rows=True):
|
|
|
1724
1655
|
target_table=find_table_with_most_rows(tables)[0] # (1, 32)
|
|
1725
1656
|
df=tables[target_table]
|
|
1726
1657
|
|
|
1658
|
+
df.to_excel(output_file, index=False)
|
|
1659
|
+
print(f"Data has been saved to {output_file}")
|
|
1660
|
+
except Exception as err:
|
|
1661
|
+
print(f"Errors found! {err}")
|
|
1662
|
+
return None
|
|
1663
|
+
|
|
1664
|
+
def get_data_table_html_string(html_string, output_file, most_rows=True):
|
|
1665
|
+
try:
|
|
1666
|
+
# Wrap the HTML string in a StringIO object
|
|
1667
|
+
from io import StringIO
|
|
1668
|
+
html_io = StringIO(html_string)
|
|
1669
|
+
tables=pd.read_html(html_io)
|
|
1670
|
+
if most_rows==False:
|
|
1671
|
+
# 1. default: the first table
|
|
1672
|
+
df=tables[0]
|
|
1673
|
+
else:
|
|
1674
|
+
# 2. get the table with most rows
|
|
1675
|
+
target_table=find_table_with_most_rows(tables)[0] # (1, 32)
|
|
1676
|
+
df=tables[target_table]
|
|
1677
|
+
|
|
1727
1678
|
df.to_excel(output_file, index=False)
|
|
1728
1679
|
print(f"Data has been saved to {output_file}")
|
|
1729
1680
|
except Exception as err:
|
PgsFile/__init__.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# 1. Web scraping
|
|
2
2
|
from .PgsFile import PGScraper
|
|
3
3
|
from .PgsFile import audiovisual_downloader
|
|
4
|
+
from .PgsFile import headers
|
|
4
5
|
|
|
5
6
|
# 2. Package/library management
|
|
6
7
|
from .PgsFile import install_package, uninstall_package
|
|
@@ -9,21 +10,22 @@ from .PgsFile import run_script, run_command
|
|
|
9
10
|
# 3. Text data retrieval
|
|
10
11
|
from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
|
|
11
12
|
from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
|
|
12
|
-
from .PgsFile import get_data_html_online, get_data_html_offline
|
|
13
|
+
from .PgsFile import get_data_html_online, get_data_html_offline
|
|
14
|
+
from .PgsFile import get_data_table_url, get_data_table_html_string
|
|
13
15
|
|
|
14
16
|
# 4. Text data storage
|
|
15
|
-
from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, save_dict_to_excel
|
|
17
|
+
from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
|
|
16
18
|
|
|
17
19
|
# 5. File/folder process
|
|
18
20
|
from .PgsFile import FilePath, FileName, DirList
|
|
19
|
-
from .PgsFile import get_subfolder_path
|
|
21
|
+
from .PgsFile import get_subfolder_path
|
|
20
22
|
from .PgsFile import makedirec, makefile
|
|
21
|
-
from .PgsFile import source_path, next_folder_names,
|
|
23
|
+
from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
|
|
22
24
|
from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
|
|
23
25
|
|
|
24
26
|
# 6. Data cleaning
|
|
25
27
|
from .PgsFile import BigPunctuation, StopTags, Special, yhd
|
|
26
|
-
from .PgsFile import ZhStopWords, EnPunctuation
|
|
28
|
+
from .PgsFile import ZhStopWords, EnPunctuation
|
|
27
29
|
from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
|
|
28
30
|
from .PgsFile import check_contain_chinese, check_contain_number
|
|
29
31
|
from .PgsFile import replace_chinese_punctuation_with_english
|
|
@@ -35,7 +37,7 @@ from .PgsFile import extract_chinese_punctuation, generate_password, sort_string
|
|
|
35
37
|
from .PgsFile import strQ2B_raw, strQ2B_words
|
|
36
38
|
from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
|
|
37
39
|
from .PgsFile import word_list, batch_word_list
|
|
38
|
-
from .PgsFile import cs, cs1,
|
|
40
|
+
from .PgsFile import cs, cs1, sent_tokenize
|
|
39
41
|
|
|
40
42
|
# 8. Maths
|
|
41
43
|
from .PgsFile import len_rows, check_empty_cells
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PgsFile
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.8
|
|
4
4
|
Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
|
|
5
5
|
Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
|
|
6
6
|
Author: Pan Guisheng
|
|
@@ -17,8 +17,10 @@ Requires-Dist: pandas
|
|
|
17
17
|
Requires-Dist: python-docx
|
|
18
18
|
Requires-Dist: pip
|
|
19
19
|
Requires-Dist: requests
|
|
20
|
+
Requires-Dist: fake-useragent
|
|
20
21
|
Requires-Dist: lxml
|
|
21
22
|
Requires-Dist: pimht
|
|
23
|
+
Requires-Dist: pysbd
|
|
22
24
|
|
|
23
25
|
Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
|
|
24
26
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
PgsFile/PgsFile.py,sha256=
|
|
2
|
-
PgsFile/__init__.py,sha256=
|
|
1
|
+
PgsFile/PgsFile.py,sha256=70VLJybFPPbaxPxb5z_vnbEZG9fVgkO7nxcxI77auUI,79354
|
|
2
|
+
PgsFile/__init__.py,sha256=vJqj1rfxK-2UrS4m3nCg0yglqsF_8P-DPAnDgxTdAj8,2227
|
|
3
3
|
PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
|
|
4
4
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
|
|
5
5
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
|
|
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
|
|
|
2618
2618
|
PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
|
|
2619
2619
|
PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
|
|
2620
2620
|
PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
|
|
2621
|
-
PgsFile-0.1.6.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
|
|
2622
|
-
PgsFile-0.1.
|
|
2623
|
-
PgsFile-0.1.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
2624
|
-
PgsFile-0.1.6.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
|
|
2625
|
-
PgsFile-0.1.6.dist-info/RECORD,,
|
|
2621
|
+
PgsFile-0.1.8.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
|
|
2622
|
+
PgsFile-0.1.8.dist-info/METADATA,sha256=oSsVQL9ZZBLhk83DN22SrtrSh3r_V8D7ILYfHv1SJbo,4955
|
|
2623
|
+
PgsFile-0.1.8.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
2624
|
+
PgsFile-0.1.8.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
|
|
2625
|
+
PgsFile-0.1.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|