PgsFile 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic. Click here for more details.
- PgsFile/PgsFile.py +24 -99
- PgsFile/__init__.py +5 -5
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.7.dist-info}/METADATA +2 -1
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.7.dist-info}/RECORD +7 -7
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.7.dist-info}/LICENSE +0 -0
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.7.dist-info}/WHEEL +0 -0
- {PgsFile-0.1.6.dist-info → PgsFile-0.1.7.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
|
@@ -358,6 +358,20 @@ def write_to_json_lines(json_path,my_json_data):
|
|
|
358
358
|
file.write(json_str + '\n')
|
|
359
359
|
file.close()
|
|
360
360
|
|
|
361
|
+
|
|
362
|
+
# Function to append a dictionary to a JSON file
|
|
363
|
+
def append_dict_to_json(file_path, data_dict):
|
|
364
|
+
try:
|
|
365
|
+
import json
|
|
366
|
+
with open(file_path, 'a', encoding="utf-8") as file:
|
|
367
|
+
json_string = json.dumps(data_dict, ensure_ascii=False)
|
|
368
|
+
file.write(json_string + '\n')
|
|
369
|
+
# print(f"Dictionary appended to {file_path}")
|
|
370
|
+
except IOError as e:
|
|
371
|
+
print(f"An I/O error occurred: {e}")
|
|
372
|
+
except Exception as e:
|
|
373
|
+
print(f"An error occurred: {e}")
|
|
374
|
+
|
|
361
375
|
def FilePath(root):
|
|
362
376
|
'''读取所有文件,列出每个文件的路径'''
|
|
363
377
|
import os
|
|
@@ -553,19 +567,6 @@ def next_folder_names(folder):
|
|
|
553
567
|
folder_namelist=next(os.walk(folder))[1]
|
|
554
568
|
return folder_namelist
|
|
555
569
|
|
|
556
|
-
def get_package_path(package_name):
|
|
557
|
-
import site
|
|
558
|
-
import os
|
|
559
|
-
package_paths=site.getsitepackages()
|
|
560
|
-
package_path=None
|
|
561
|
-
for path in package_paths:
|
|
562
|
-
if os.path.exists(os.path.join(path, package_name)):
|
|
563
|
-
package_path=os.path.join(path, package_name)
|
|
564
|
-
break
|
|
565
|
-
|
|
566
|
-
if package_path is None:
|
|
567
|
-
raise ModuleNotFoundError(f"Package '{package_name}' not found.")
|
|
568
|
-
return package_path
|
|
569
570
|
|
|
570
571
|
def remove_empty_txts(folder_path):
|
|
571
572
|
import os
|
|
@@ -631,77 +632,6 @@ def remove_empty_last_line(folder_path):
|
|
|
631
632
|
f2.write(lines[i])
|
|
632
633
|
f2.close()
|
|
633
634
|
print(end_empty_files,str(len(end_empty_files))+" files found with last line empty!")
|
|
634
|
-
|
|
635
|
-
corpus_root=get_package_path('PgsFile')+"/Corpora"
|
|
636
|
-
def extract_stopwords(lang=None):
|
|
637
|
-
'''
|
|
638
|
-
Parameters
|
|
639
|
-
----------
|
|
640
|
-
lang : TYPE, optional string
|
|
641
|
-
DESCRIPTION. The default is None.
|
|
642
|
-
lang="english"; lang="chinese" etc.
|
|
643
|
-
|
|
644
|
-
Returns
|
|
645
|
-
-------
|
|
646
|
-
contents : TYPE list
|
|
647
|
-
DESCRIPTION. ["'ll", "'tis", "'twas", "'ve", '10', '39', 'a', "a's", 'able', 'ableabout', 'about', 'above', 'abroad', 'abst', 'accordance', 'according']
|
|
648
|
-
|
|
649
|
-
'''
|
|
650
|
-
import os
|
|
651
|
-
# Check if the folder exists
|
|
652
|
-
if not os.path.isdir(corpus_root):
|
|
653
|
-
print(f"Error: The folder '{corpus_root}' does not exist.")
|
|
654
|
-
return None
|
|
655
|
-
|
|
656
|
-
if lang is None:
|
|
657
|
-
language="english"
|
|
658
|
-
else:
|
|
659
|
-
language=lang
|
|
660
|
-
file_name=language+".txt"
|
|
661
|
-
|
|
662
|
-
# Traverse the folder recursively
|
|
663
|
-
for root, dirs, files in os.walk(corpus_root):
|
|
664
|
-
# Check if the text file exists in the current folder
|
|
665
|
-
if file_name in files:
|
|
666
|
-
# Construct the full path to the text file
|
|
667
|
-
file_path=os.path.join(root, file_name)
|
|
668
|
-
# Read the contents of the text file
|
|
669
|
-
contents=[line.strip() for line in get_data_lines(file_path)]
|
|
670
|
-
return contents
|
|
671
|
-
|
|
672
|
-
# If the text file doesn't exist in any folder, print an error message
|
|
673
|
-
print(f"Error: The file '{file_name}' does not exist in the folder '{corpus_root}' or its sub-folders.")
|
|
674
|
-
return None
|
|
675
|
-
|
|
676
|
-
pickle_root=get_package_path('PgsFile')+"/models"
|
|
677
|
-
def load_pickle_data(lang=None):
|
|
678
|
-
'''
|
|
679
|
-
Parameters
|
|
680
|
-
----------
|
|
681
|
-
lang : TYPE, optional
|
|
682
|
-
DESCRIPTION. The default is None.
|
|
683
|
-
lang="english"; lang="chinese" etc.
|
|
684
|
-
Returns
|
|
685
|
-
-------
|
|
686
|
-
data : TYPE
|
|
687
|
-
DESCRIPTION.
|
|
688
|
-
|
|
689
|
-
'''
|
|
690
|
-
import pickle
|
|
691
|
-
files=FilePath(pickle_root)
|
|
692
|
-
if lang is None:
|
|
693
|
-
language="english"
|
|
694
|
-
else:
|
|
695
|
-
language=lang
|
|
696
|
-
file_path=""
|
|
697
|
-
for file in files:
|
|
698
|
-
if language in FileName(file):
|
|
699
|
-
file_path=file
|
|
700
|
-
with open(file_path, 'rb') as handle:
|
|
701
|
-
data=pickle.load(handle)
|
|
702
|
-
return data
|
|
703
|
-
|
|
704
|
-
|
|
705
635
|
|
|
706
636
|
def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
|
|
707
637
|
"""
|
|
@@ -731,8 +661,14 @@ def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
|
|
|
731
661
|
|
|
732
662
|
# Standard sentence tokenizer.
|
|
733
663
|
def sent_tokenize(text, lang=None):
|
|
734
|
-
|
|
735
|
-
|
|
664
|
+
import pysbd
|
|
665
|
+
if lang is None:
|
|
666
|
+
lang="en"
|
|
667
|
+
else:
|
|
668
|
+
lang=lang
|
|
669
|
+
seg = pysbd.Segmenter(language=lang, clean=False)
|
|
670
|
+
sent_list = seg.segment(text)
|
|
671
|
+
return sent_list
|
|
736
672
|
|
|
737
673
|
def cs(para):
|
|
738
674
|
"""
|
|
@@ -757,18 +693,7 @@ def cs(para):
|
|
|
757
693
|
return paras
|
|
758
694
|
|
|
759
695
|
|
|
760
|
-
def cs1(para):
|
|
761
|
-
"""
|
|
762
|
-
#英文分句
|
|
763
|
-
using nltk model
|
|
764
|
-
---------
|
|
765
|
-
Returns
|
|
766
|
-
list
|
|
767
|
-
"""
|
|
768
|
-
return sent_tokenize(para)
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
def cs2(text):
|
|
696
|
+
def cs1(text):
|
|
772
697
|
"""
|
|
773
698
|
#英文分句
|
|
774
699
|
using regular expression
|
|
@@ -811,7 +736,7 @@ def cs2(text):
|
|
|
811
736
|
sentences=sentences[:-1]
|
|
812
737
|
sentences=[s.strip() for s in sentences]
|
|
813
738
|
if len(sentences)==0:
|
|
814
|
-
sentences=cs1(text)
|
|
739
|
+
sentences=sent_tokenize(text)
|
|
815
740
|
else:
|
|
816
741
|
sentences=sentences
|
|
817
742
|
return sentences
|
PgsFile/__init__.py
CHANGED
|
@@ -12,18 +12,18 @@ from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspe
|
|
|
12
12
|
from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
|
|
13
13
|
|
|
14
14
|
# 4. Text data storage
|
|
15
|
-
from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, save_dict_to_excel
|
|
15
|
+
from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
|
|
16
16
|
|
|
17
17
|
# 5. File/folder process
|
|
18
18
|
from .PgsFile import FilePath, FileName, DirList
|
|
19
|
-
from .PgsFile import get_subfolder_path
|
|
19
|
+
from .PgsFile import get_subfolder_path
|
|
20
20
|
from .PgsFile import makedirec, makefile
|
|
21
|
-
from .PgsFile import source_path, next_folder_names, get_package_path, get_directory_tree_with_meta, find_txt_files_with_keyword
|
|
21
|
+
from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
|
|
22
22
|
from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
|
|
23
23
|
|
|
24
24
|
# 6. Data cleaning
|
|
25
25
|
from .PgsFile import BigPunctuation, StopTags, Special, yhd
|
|
26
|
-
from .PgsFile import ZhStopWords, EnPunctuation
|
|
26
|
+
from .PgsFile import ZhStopWords, EnPunctuation
|
|
27
27
|
from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
|
|
28
28
|
from .PgsFile import check_contain_chinese, check_contain_number
|
|
29
29
|
from .PgsFile import replace_chinese_punctuation_with_english
|
|
@@ -35,7 +35,7 @@ from .PgsFile import extract_chinese_punctuation, generate_password, sort_string
|
|
|
35
35
|
from .PgsFile import strQ2B_raw, strQ2B_words
|
|
36
36
|
from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
|
|
37
37
|
from .PgsFile import word_list, batch_word_list
|
|
38
|
-
from .PgsFile import cs, cs1, cs2
|
|
38
|
+
from .PgsFile import cs, cs1, sent_tokenize
|
|
39
39
|
|
|
40
40
|
# 8. Maths
|
|
41
41
|
from .PgsFile import len_rows, check_empty_cells
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PgsFile
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.7
|
|
4
4
|
Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
|
|
5
5
|
Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
|
|
6
6
|
Author: Pan Guisheng
|
|
@@ -19,6 +19,7 @@ Requires-Dist: pip
|
|
|
19
19
|
Requires-Dist: requests
|
|
20
20
|
Requires-Dist: lxml
|
|
21
21
|
Requires-Dist: pimht
|
|
22
|
+
Requires-Dist: pysbd
|
|
22
23
|
|
|
23
24
|
Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
|
|
24
25
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
PgsFile/PgsFile.py,sha256=
|
|
2
|
-
PgsFile/__init__.py,sha256=
|
|
1
|
+
PgsFile/PgsFile.py,sha256=6CXBDn3VC4gUkigNVCkM9eVPOe4Xyww32tG0ZDeYNfI,78446
|
|
2
|
+
PgsFile/__init__.py,sha256=TaKrLI0pGAFm_2Bzjf_cGnog_URzaAgHRW5myzY0Lz8,2144
|
|
3
3
|
PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
|
|
4
4
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
|
|
5
5
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
|
|
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
|
|
|
2618
2618
|
PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
|
|
2619
2619
|
PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
|
|
2620
2620
|
PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
|
|
2621
|
-
PgsFile-0.1.
|
|
2622
|
-
PgsFile-0.1.
|
|
2623
|
-
PgsFile-0.1.
|
|
2624
|
-
PgsFile-0.1.
|
|
2625
|
-
PgsFile-0.1.
|
|
2621
|
+
PgsFile-0.1.7.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
|
|
2622
|
+
PgsFile-0.1.7.dist-info/METADATA,sha256=0HAA5A68yHiB-LVlNuF-pkKo_lawzwTU-Thf-i2FiUY,4924
|
|
2623
|
+
PgsFile-0.1.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
2624
|
+
PgsFile-0.1.7.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
|
|
2625
|
+
PgsFile-0.1.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|