PgsFile 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic.
- PgsFile/Corpora/Stopwords/NLPIR.user +0 -0
- PgsFile/PgsFile.py +465 -34
- PgsFile/__init__.py +10 -4
- PgsFile/models/NLPIR.user +0 -0
- PgsFile/models/fonts/DejaVuSans.ttf +0 -0
- PgsFile/models/fonts/书体坊赵九江钢笔行书体.ttf +0 -0
- PgsFile/models/fonts/全新硬笔楷书简.ttf +0 -0
- PgsFile/models/fonts/全新硬笔行书简.ttf +0 -0
- PgsFile/models/fonts/博洋行书3500.TTF +0 -0
- PgsFile/models/fonts/陆柬之行书字体.ttf +0 -0
- PgsFile/models/model_reviews2.2.bin +0 -0
- PgsFile/models/model_reviews_ReadMe.txt +134 -0
- PgsFile-0.2.4.dist-info/METADATA +41 -0
- {PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/RECORD +17 -7
- PgsFile-0.2.2.dist-info/METADATA +0 -79
- {PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/LICENSE +0 -0
- {PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/WHEEL +0 -0
- {PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/top_level.txt +0 -0
Binary file
PgsFile/PgsFile.py
CHANGED

@@ -103,7 +103,7 @@ def get_data_text(path):
     else:
         return None
 
-def get_data_lines(path):
+def get_data_lines(path, no_line_breaks=False):
     '''
     Parameters
     ----------
@@ -133,7 +133,10 @@ def get_data_lines(path):
     # Read the entire file using the detected encoding
     if encoding:
         with open(path, 'r', encoding=encoding, errors="ignore") as f:
-
+            if no_line_breaks is False:
+                lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
+            else:
+                lines = f.readlines()
         return lines
     else:
         return None
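The new no_line_breaks flag controls whether get_data_lines strips line breaks and drops empty lines (the default) or returns the raw lines untouched. A minimal usage sketch, not taken from the package itself and using a made-up file name:

from PgsFile import get_data_lines

# default behaviour: stripped lines, empty lines removed
clean_lines = get_data_lines("sample.txt")
# new in 0.2.4: keep the raw lines, including trailing newlines
raw_lines = get_data_lines("sample.txt", no_line_breaks=True)
print(len(clean_lines), len(raw_lines))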
@@ -197,15 +200,15 @@ def get_data_excel(excel_path,column_id,sheet_name=None):
     inter=df.iloc[0:,column_id]  # extract all rows of the selected column
     return list(inter)
 
-def write_to_excel(excel_path,
+def write_to_excel(excel_path, data, sheet_name=None, index=None):
     '''
     Parameters
     ----------
     excel_path : TYPE
         DESCRIPTION. results.xlsx
 
-
-        DESCRIPTION. {
+    data : TYPE, dict
+        DESCRIPTION. data = {'翻译': 24, '教学': 8, '数智': 6, '时代': 6, '财经': 6, '新': 4}
 
     sheet_name : TYPE, optional
         DESCRIPTION. The default is None.
@@ -227,6 +230,10 @@ def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
         index=False
     else:
         index=True
+
+    col = list(data.keys())
+    freq = list(data.values())
+    dic_of_list={"items": col, "counts": freq}
 
     df=pd.DataFrame(dic_of_list)
     df.style.to_excel(excel_path, sheet_name=sheet_name,startcol=0, index=index)
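With this change write_to_excel accepts a plain frequency dictionary and builds the "items"/"counts" columns itself. A hedged sketch of the new call, with invented data and file name:

from PgsFile import write_to_excel

word_counts = {"translation": 24, "teaching": 8, "finance": 6}
# one "items" column and one "counts" column are written to results.xlsx
write_to_excel("results.xlsx", word_counts, sheet_name="Sheet1")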
@@ -471,6 +478,18 @@ def get_directory_tree_with_meta(start_path, indent='', show_meta=False, max_dir
             print(f"{indent}└── ... (and {remaining_directories} more directories)")
     # current_level=-1 will show all folders' info.
 
+def get_full_path(*path_components):
+    """
+    Combines multiple path components into a single, full path using os.path.join.
+
+    Args:
+        *path_components: Variable number of path components (strings).
+
+    Returns:
+        str: The combined full path.
+    """
+    return os.path.join(*path_components)
+
 def get_subfolder_path(parent_folder, subfolder_name):
     import os
     subfolder_name=subfolder_name.strip()
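get_full_path is a thin wrapper around os.path.join, so a one-line illustration (with made-up path components) is enough:

from PgsFile import get_full_path

font_file = get_full_path("D:/corpus", "models", "fonts", "DejaVuSans.ttf")
print(font_file)  # e.g. D:/corpus/models/fonts/DejaVuSans.ttf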
@@ -553,7 +572,6 @@ def batch_word_list(input_root):
     sorted_words=sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
     return sorted_words
 
-
 def clean_list(meta):
     """
     Parameters
@@ -576,7 +594,6 @@ def clean_list(meta):
 
     yhd=["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36','Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)','Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)','Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+','Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0','Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)','Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1','Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13','Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6','NOKIA5700/ UCWEB7.0.2.37/28/999','Openwave/ UCWEB7.0.2.37/28/999','Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10','Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11','Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11','UCWEB7.0.2.37/28/999']
 
-
 def source_path(relative_path):
     import sys,os
     if getattr(sys, 'frozen', False):
@@ -590,7 +607,6 @@ def next_folder_names(folder):
     folder_namelist=next(os.walk(folder))[1]
     return folder_namelist
 
-
 def remove_empty_txts(folder_path):
     import os
     files=FilePath(folder_path)
@@ -700,6 +716,21 @@ def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
             matches.append(os.path.join(root, filename))
     return matches
 
+import fnmatch
+def find_user_files_in_upper_folder(directory, user_file_name):
+    # Get the direct upper folder path
+    upper_folder = os.path.dirname(os.path.abspath(directory))
+
+    # List to store matching file paths
+    matching_files = []
+
+    # Walk through the upper folder
+    for root, dirs, files in os.walk(upper_folder):
+        for filename in fnmatch.filter(files, f'{user_file_name}.user'):
+            matching_files.append(os.path.join(root, filename))
+
+    return matching_files
+
 # Standard sentence tokenizer.
 def sent_tokenize(text, lang=None):
     import pysbd
@@ -816,12 +847,314 @@ def word_tokenize(text, pos_tagged=False):
         try:
             from nlpir import tools
             tools.update_license()
+            print("\n\nThe user file is ready. Please restart your kernel and run the Python script!")
         except Exception as err2:
-            print("
+            print("\n*****SOLUTION WARNING! \nYOU MAY NEED A VPN TO TRY THIS SERVICE!*****\n\n", err2)
         else:
-
+            try:
+                if "Can not open" in str(err):
+                    user_folder=get_library_location("PgsFile")+"/PgsFile/models"
+                    destination_folder=get_library_location("nlpir-python")+"/nlpir/Data"
+                    source_file=find_user_files_in_upper_folder(user_folder, "NLPIR")[0]
+                    copy_file(source_file, destination_folder)
+                    print("The user file is ready. Please restart your kernel and run the Python script!")
+                else:
+                    print(err)
+            except Exception as rer:
+                print(rer)
+
     return words
 
+import re
+from abc import ABC, abstractmethod
+from typing import Iterator, List, Tuple
+class TokenizerI(ABC):
+    """
+    A processing interface for tokenizing a string.
+    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
+    """
+
+    @abstractmethod
+    def tokenize(self, s: str) -> List[str]:
+        """
+        Return a tokenized copy of *s*.
+
+        :rtype: List[str]
+        """
+        if overridden(self.tokenize_sents):
+            return self.tokenize_sents([s])[0]
+
+    def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
+        """
+        Identify the tokens using integer offsets ``(start_i, end_i)``,
+        where ``s[start_i:end_i]`` is the corresponding token.
+
+        :rtype: Iterator[Tuple[int, int]]
+        """
+        raise NotImplementedError()
+
+    def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
+        """
+        Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
+
+            return [self.tokenize(s) for s in strings]
+
+        :rtype: List[List[str]]
+        """
+        return [self.tokenize(s) for s in strings]
+
+    def span_tokenize_sents(
+        self, strings: List[str]
+    ) -> Iterator[List[Tuple[int, int]]]:
+        """
+        Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
+
+            return [self.span_tokenize(s) for s in strings]
+
+        :yield: List[Tuple[int, int]]
+        """
+        for s in strings:
+            yield list(self.span_tokenize(s))
+
+class MacIntyreContractions:
+    """
+    List of contractions adapted from Robert MacIntyre's tokenizer.
+    """
+
+    CONTRACTIONS2 = [
+        r"(?i)\b(can)(?#X)(not)\b",
+        r"(?i)\b(d)(?#X)('ye)\b",
+        r"(?i)\b(gim)(?#X)(me)\b",
+        r"(?i)\b(gon)(?#X)(na)\b",
+        r"(?i)\b(got)(?#X)(ta)\b",
+        r"(?i)\b(lem)(?#X)(me)\b",
+        r"(?i)\b(more)(?#X)('n)\b",
+        r"(?i)\b(wan)(?#X)(na)(?=\s)",
+    ]
+    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
+
+class NLTKWordTokenizer(TokenizerI):
+    """
+    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
+
+    This is the method that is invoked by ``word_tokenize()``. It assumes that the
+    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
+
+    The tokenizer is "destructive" such that the regexes applied will munge the
+    input string to a state beyond re-construction. It is possible to apply
+    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
+    `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
+    revert to the original string.
+    """
+
+    # Starting quotes.
+    STARTING_QUOTES = [
+        (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
+        (re.compile(r"^\""), r"``"),
+        (re.compile(r"(``)"), r" \1 "),
+        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
+        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
+    ]
+
+    # Ending quotes.
+    ENDING_QUOTES = [
+        (re.compile("([»”’])", re.U), r" \1 "),
+        (re.compile(r"''"), " '' "),
+        (re.compile(r'"'), " '' "),
+        (re.compile(r"\s+"), " "),
+        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
+        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
+    ]
+
+    # For improvements for starting/closing quotes from TreebankWordTokenizer,
+    # see discussion on https://github.com/nltk/nltk/pull/1437
+    # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
+    # - chevron quotes u'\xab' and u'\xbb'
+    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+    # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
+    # Also, behavior of splitting on clitics now follows Stanford CoreNLP
+    # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
+
+    # Punctuation.
+    PUNCTUATION = [
+        (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
+        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
+        (re.compile(r"([:,])$"), r" \1 "),
+        (
+            re.compile(r"\.{2,}", re.U),
+            r" \g<0> ",
+        ),  # See https://github.com/nltk/nltk/pull/2322
+        (re.compile(r"[;@#$%&]"), r" \g<0> "),
+        (
+            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
+            r"\1 \2\3 ",
+        ),  # Handles the final period.
+        (re.compile(r"[?!]"), r" \g<0> "),
+        (re.compile(r"([^'])' "), r"\1 ' "),
+        (
+            re.compile(r"[*]", re.U),
+            r" \g<0> ",
+        ),  # See https://github.com/nltk/nltk/pull/2322
+    ]
+
+    # Pads parentheses
+    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
+
+    # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
+    CONVERT_PARENTHESES = [
+        (re.compile(r"\("), "-LRB-"),
+        (re.compile(r"\)"), "-RRB-"),
+        (re.compile(r"\["), "-LSB-"),
+        (re.compile(r"\]"), "-RSB-"),
+        (re.compile(r"\{"), "-LCB-"),
+        (re.compile(r"\}"), "-RCB-"),
+    ]
+
+    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
+
+    # List of contractions adapted from Robert MacIntyre's tokenizer.
+    _contractions = MacIntyreContractions()
+    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
+    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
+
+    def tokenize(
+        self, text: str, convert_parentheses: bool = False, return_str: bool = False
+    ) -> List[str]:
+        r"""Return a tokenized copy of `text`.
+
+        >>> from nltk.tokenize import NLTKWordTokenizer
+        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
+        >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
+        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
+        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
+        'of', 'them.', 'Thanks', '.']
+        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
+        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
+        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
+        'of', 'them.', 'Thanks', '.']
+
+
+        :param text: A string with a sentence or sentences.
+        :type text: str
+        :param convert_parentheses: if True, replace parentheses to PTB symbols,
+            e.g. `(` to `-LRB-`. Defaults to False.
+        :type convert_parentheses: bool, optional
+        :param return_str: If True, return tokens as space-separated string,
+            defaults to False.
+        :type return_str: bool, optional
+        :return: List of tokens from `text`.
+        :rtype: List[str]
+        """
+        if return_str:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
+        for regexp, substitution in self.STARTING_QUOTES:
+            text = regexp.sub(substitution, text)
+
+        for regexp, substitution in self.PUNCTUATION:
+            text = regexp.sub(substitution, text)
+
+        # Handles parentheses.
+        regexp, substitution = self.PARENS_BRACKETS
+        text = regexp.sub(substitution, text)
+        # Optionally convert parentheses
+        if convert_parentheses:
+            for regexp, substitution in self.CONVERT_PARENTHESES:
+                text = regexp.sub(substitution, text)
+
+        # Handles double dash.
+        regexp, substitution = self.DOUBLE_DASHES
+        text = regexp.sub(substitution, text)
+
+        # add extra space to make things easier
+        text = " " + text + " "
+
+        for regexp, substitution in self.ENDING_QUOTES:
+            text = regexp.sub(substitution, text)
+
+        for regexp in self.CONTRACTIONS2:
+            text = regexp.sub(r" \1 \2 ", text)
+        for regexp in self.CONTRACTIONS3:
+            text = regexp.sub(r" \1 \2 ", text)
+
+        # We are not using CONTRACTIONS4 since
+        # they are also commented out in the SED scripts
+        # for regexp in self._contractions.CONTRACTIONS4:
+        #     text = regexp.sub(r' \1 \2 \3 ', text)
+
+        return text.split()
+
+    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
+        r"""
+        Returns the spans of the tokens in ``text``.
+        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.
+
+        >>> from nltk.tokenize import NLTKWordTokenizer
+        >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
+        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
+        >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
+        True
+        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
+        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
+        >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
+        True
+
+        :param text: A string with a sentence or sentences.
+        :type text: str
+        :yield: Tuple[int, int]
+        """
+        raw_tokens = self.tokenize(text)
+
+        # Convert converted quotes back to original double quotes
+        # Do this only if original text contains double quote(s) or double
+        # single-quotes (because '' might be transformed to `` if it is
+        # treated as starting quotes).
+        if ('"' in text) or ("''" in text):
+            # Find double quotes and converted quotes
+            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
+
+            # Replace converted quotes back to double quotes
+            tokens = [
+                matched.pop(0) if tok in ['"', "``", "''"] else tok
+                for tok in raw_tokens
+            ]
+        else:
+            tokens = raw_tokens
+
+        yield from align_tokens(tokens, text)
+
+# Standard word tokenizer.
+_treebank_word_tokenizer = NLTKWordTokenizer()
+def word_tokenize2(text, preserve_line=False):
+    """
+    Return a tokenized copy of *text*,
+    using NLTK's recommended word tokenizer
+    (currently an improved :class:`.TreebankWordTokenizer`
+    along with :class:`.PunktSentenceTokenizer`
+    for the specified language).
+
+    :param text: text to split into words
+    :type text: str
+    :param language: the model name in the Punkt corpus
+    :type language: str
+    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
+    :type preserve_line: bool
+    """
+    sentences = [text] if preserve_line else sent_tokenize(text)
+    return [
+        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
+    ]
+
 def pad_sequence(
     sequence,
     n,
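word_tokenize2 chains the module's pysbd-based sent_tokenize with the vendored NLTKWordTokenizer above, and 0.2.4's __init__.py exports it at package level. A small sketch with an arbitrary sentence:

from PgsFile import word_tokenize2

tokens = word_tokenize2("Good muffins cost $3.88 in New York. Please buy me two of them.")
print(tokens)  # roughly: ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', ...]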
@@ -861,9 +1194,7 @@ def pad_sequence(
     sequence=chain(sequence, (right_pad_symbol,) * (n - 1))
     return sequence
 
-
 # add a flag to pad the sequence so we get peripheral ngrams?
-
 def ngrams(
     sequence,
     n,
@@ -926,7 +1257,6 @@ def ngrams(
         yield tuple(history)
         del history[0]
 
-
 def bigrams(sequence, **kwargs):
     """
     Return the bigrams generated from a sequence of items, as an iterator.
@@ -946,7 +1276,6 @@ def bigrams(sequence, **kwargs):
     for item in ngrams(sequence, 2, **kwargs):
         yield item
 
-
 def trigrams(sequence, **kwargs):
     """
     Return the trigrams generated from a sequence of items, as an iterator.
@@ -966,7 +1295,6 @@ def trigrams(sequence, **kwargs):
     for item in ngrams(sequence, 3, **kwargs):
         yield item
 
-
 def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
     """
     Returns all possible ngrams generated from a sequence of items, as an iterator.
@@ -1120,6 +1448,18 @@ def uninstall_package(package_name: str):
     import pip
     pip.main(['uninstall', package_name, '-y'])
 
+# A list of conda configuration commands.
+conda_mirror_commands=[
+    "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple",  # Windows recommended
+    "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/",  # MacOS recommended
+    "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
+    "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/",
+    "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/fastai/",
+    "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/",
+    "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/",
+    "pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/"
+]
+
 def DirList(root_dir: str) -> tuple:
     """
     List the contents of a directory and return two lists containing the names of the directories and files in the directory.
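conda_mirror_commands is just a list of shell commands, so the natural way to apply it is to run each entry through the module's existing run_command helper; a sketch, assuming you actually want to switch to the Tsinghua/Aliyun mirrors listed above:

from PgsFile import conda_mirror_commands, run_command

for cmd in conda_mirror_commands:
    print(run_command(cmd))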
@@ -1223,10 +1563,8 @@ def sort_strings_with_embedded_numbers(strings: list) -> list:
     """
     # Sort the strings using the extract_numbers() function as the key
     sorted_strings=sorted(strings, key=extract_numbers)
-
     return sorted_strings
 
-
 def run_command(command: str) -> str:
     """
     Run a command and return its output as a string.
@@ -1416,7 +1754,6 @@ class PGScraper(object):
         return all_want_list
 
 
-
 
 # -*- coding: utf-8 -*-
 """
@@ -1462,7 +1799,6 @@ class ProgressData(object):
             self.size, self.unit, progress, speed, self.unit))
         print('%50s'%('/'*int((1-progress)*50)))
 
-
 def levenshtein_distance(s, t):
     m, n=len(s), len(t)
     if m < n:
@@ -1488,7 +1824,7 @@ pgs_abbres_words=['A.B.','A.D.','A.G.','A.I.','A.M.','A.P.','A.V.','AFP.','Ala.'
 def clean_text(text):  # clean punctuation problems other than the period
     # add a space before a word that directly follows a punctuation mark
     import re
-    text=replace_chinese_punctuation_with_english(text)
+    # text=replace_chinese_punctuation_with_english(text)
     text=re.sub(r'(?<=[\?\!\,\;\:\)\]\}])\s*(?=\w)', ' ', text)
     # remove spaces between a punctuation mark and the word on its left
     text=re.sub(r'\s*([\?\!\,\;\:\)\]\}\>])', r'\1', text)
@@ -1504,24 +1840,45 @@ def clean_text(text):  # clean punctuation problems other than the period
 
 def clean_text_with_abbreviations(text):
     import re
-
-
-
+
+    # split the text into lines
+    lines = text.splitlines()
+
+    # clean each line
+    cleaned_lines = []
+    for line in lines:
+        cleaned_line = clean_line_with_abbreviations(line)
+        cleaned_lines.append(cleaned_line)
+
+    # join the cleaned lines back into one text
+    cleaned_text = '\n'.join(cleaned_lines)
+    return cleaned_text
+
+def clean_line_with_abbreviations(line):
+    import re
+
+    # clean punctuation problems other than the period
+    line = clean_text(line)
+
+    matches = []
+    for seg in line.split():
         if "." in seg:
-            if seg.endswith(".")
+            if not seg.endswith("."):
                 matches.append(seg)
             elif seg.endswith("..") and "..." not in seg:
-
-
+                line = line.replace("..", ".")
+
     for match in matches:
         if any(word in match for word in pgs_abbres_words):
-            inter=match.split(".")
-            new_match="".join([w+"." for w in inter[0:-1]])+" "+inter[-1]
-
+            inter = match.split(".")
+            new_match = "".join([w + "." for w in inter[0:-1]]) + " " + inter[-1]
+            line = line.replace(match, new_match)
         else:
-
-
-
+            line = line.replace(match, match.replace(".", ". "))
+
+    line = re.sub(r'\s+\.', '.', line)
+    return line
+
 
 import shutil
 def move_file(source_file, destination_folder, new_file_name=None):
@@ -1547,6 +1904,28 @@ def move_file(source_file, destination_folder, new_file_name=None):
     shutil.move(source_file, destination_file)
 
     print(f"File moved from {source_file} to {destination_file}")
+
+def copy_file(source_file, destination_folder, new_file_name=None):
+    """
+    Copy a file to another folder.
+
+    Parameters:
+    source_file (str): The path to the source file.
+    destination_folder (str): The path to the destination folder.
+    new_file_name (str, optional): The new name for the file in the destination folder. Defaults to None.
+    """
+    # Ensure the destination folder exists
+    if not os.path.exists(destination_folder):
+        os.makedirs(destination_folder)
+
+    # Construct the destination file path
+    if new_file_name:
+        destination_file = os.path.join(destination_folder, new_file_name)
+    else:
+        destination_file = os.path.join(destination_folder, os.path.basename(source_file))
+
+    # Copy the file to the destination folder
+    shutil.copy2(source_file, destination_file)
 
 def check_empty_cells(file_path):
     """
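copy_file mirrors the existing move_file helper but keeps the source file and creates the destination folder if needed. A sketch with invented paths:

from PgsFile import copy_file

# copies report.txt into backup/ under a new name, leaving the original in place
copy_file("report.txt", "backup", new_file_name="report_2024.txt")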
@@ -1585,7 +1964,6 @@ def makefile(file_path):
     else:
         write_to_txt(file_path, "")
 
-
 def save_dict_to_excel(data, output_file, headers=None):
     """
     Save Python dictionary data into an Excel .xlsx file with custom headers.
@@ -1794,4 +2172,57 @@ def get_stopwords(language=None):
         return en_stopwords
     else:
         lang_stopwords=get_data_lines(find_txt_files_with_keyword(stopwords_path, language)[0])
-        return lang_stopwords
+        return lang_stopwords
+
+from PIL import Image
+def replace_white_with_transparency(input_path, output_path):
+    """
+    This function opens an image, replaces all white pixels with transparent pixels.
+
+    Parameters:
+    input_path (str): The path to the input image file.
+    output_path (str): The path to save the output image file.
+    """
+    # convert from RGB (24-bit) mode to RGBA (32-bit) mode
+    img = Image.open(input_path).convert('RGBA')
+    W, L = img.size
+    white_pixel = (0, 0, 0, 0)  # white
+    for h in range(W):
+        for i in range(L):
+            if img.getpixel((h, i)) == white_pixel:
+                img.putpixel((h, i), (255, 255, 255, 0))  # make it transparent
+    img.save(output_path)
+
+def get_font_path(font_name=None):
+    '''
+    Retrieves the file path of a specified font.
+
+    Parameters
+    ----------
+    font_name : str, optional
+        The name of the font file (must end with ".ttf"). If provided, it should match one of the available fonts in the library, such as:
+        - 'DejaVuSans.ttf'
+        - '书体坊赵九江钢笔行书体.ttf'
+        - '全新硬笔楷书简.ttf'
+        - '全新硬笔行书简.ttf'
+        - '博洋行书3500.TTF'
+        - '陆柬之行书字体.ttf'
+        The default is None, which will return the path for 'DejaVuSans.ttf'.
+
+    Returns
+    -------
+    font_path : str
+        The full file path of the specified font. If no font name is provided, the default path for 'DejaVuSans.ttf' will be returned.
+        Example: "C:/Windows/Fonts/simhei.ttf"
+    '''
+
+    font_folder = get_library_location("PgsFile") + "/PgsFile/models/fonts"
+    if font_name is None:
+        font_path = get_full_path(font_folder, "DejaVuSans.ttf")
+    else:
+        font_path = get_full_path(font_folder, font_name)
+    return font_path
+
+simhei_default_font_path_MacOS_Windows=["/System/Library/Fonts/STHeiti Medium.ttc",
+                                        r"C:\Windows\Fonts\simhei.ttf",  # Use a font that supports Chinese characters
+                                        ]
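get_font_path resolves one of the fonts bundled under PgsFile/models/fonts, which is mainly useful for rendering Chinese labels in plots. A sketch of that workflow; the Matplotlib calls are an assumption for illustration, not part of PgsFile:

from PgsFile import get_font_path
from matplotlib import font_manager
import matplotlib.pyplot as plt

# use one of the bundled Chinese fonts for plot labels
zh_font = font_manager.FontProperties(fname=get_font_path("全新硬笔楷书简.ttf"))
plt.title("词频统计 (word frequencies)", fontproperties=zh_font)
plt.savefig("wordfreq.png")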
PgsFile/__init__.py
CHANGED

@@ -7,6 +7,7 @@ from .PgsFile import headers, encode_chinese_keyword_for_url
 from .PgsFile import install_package, uninstall_package
 from .PgsFile import run_script, run_command
 from .PgsFile import get_library_location
+from .PgsFile import conda_mirror_commands
 
 # 3. Text data retrieval
 from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
@@ -19,10 +20,10 @@ from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_
 
 # 5. File/folder process
 from .PgsFile import FilePath, FileName, DirList
-from .PgsFile import get_subfolder_path
+from .PgsFile import get_subfolder_path, get_full_path
 from .PgsFile import makedirec, makefile
 from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
-from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
+from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file, copy_file
 from .PgsFile import concatenate_excel_files
 
 # 6. Data cleaning
@@ -32,18 +33,23 @@ from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, L
 from .PgsFile import check_contain_chinese, check_contain_number
 from .PgsFile import replace_chinese_punctuation_with_english
 from .PgsFile import replace_english_punctuation_with_chinese
-from .PgsFile import clean_list, clean_text_with_abbreviations
+from .PgsFile import clean_list, clean_text, clean_text_with_abbreviations, clean_line_with_abbreviations
 from .PgsFile import extract_chinese_punctuation, generate_password, sort_strings_with_embedded_numbers
 
 # 7. NLP (natural language processing)
 from .PgsFile import strQ2B_raw, strQ2B_words
 from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
 from .PgsFile import word_list, batch_word_list
-from .PgsFile import cs, cs1, sent_tokenize, word_tokenize
+from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2
 
 # 8. Maths
 from .PgsFile import len_rows, check_empty_cells
 from .PgsFile import format_float, decimal_to_percent, Percentage
 from .PgsFile import get_text_length_kb, extract_numbers
 
+# 9. Visualization
+from .PgsFile import replace_white_with_transparency
+from .PgsFile import simhei_default_font_path_MacOS_Windows
+from .PgsFile import get_font_path
+
 name = "PgsFile"

Binary file

Binary file

Binary file

Binary file

PgsFile/models/model_reviews_ReadMe.txt
ADDED

@@ -0,0 +1,134 @@
+model_1.0.bin ['samples: 30', 'precision: 0.7666666666666667', 'recall: 0.696969696969697', 'F1: 0.7301587301587302']
+model_1.2.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
+model_1.4.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
+model_1.5.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
+model_1.6.bin ['samples: 30', 'precision: 0.9', 'recall: 0.8181818181818182', 'F1: 0.8571428571428572']
+model_1.7.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.7878787878787878', 'F1: 0.8253968253968254']
+model_1.8.bin ['samples: 30', 'precision: 0.8', 'recall: 0.7272727272727273', 'F1: 0.761904761904762']
+model_1.9.bin ['samples: 30', 'precision: 0.8', 'recall: 0.7272727272727273', 'F1: 0.761904761904762']
+model_2.0.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
+model_2.1.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.7878787878787878', 'F1: 0.8253968253968254']
+
+
+model_1.0.bin ['samples: 292', 'precision: 0.5787671232876712', 'recall: 0.48011363636363635', 'F1: 0.5248447204968945']
+model_1.2.bin ['samples: 292', 'precision: 0.636986301369863', 'recall: 0.5284090909090909', 'F1: 0.577639751552795']
+model_1.4.bin ['samples: 292', 'precision: 0.7191780821917808', 'recall: 0.5965909090909091', 'F1: 0.6521739130434782']
+model_1.5.bin ['samples: 292', 'precision: 0.6815068493150684', 'recall: 0.5653409090909091', 'F1: 0.6180124223602484']
+model_1.6.bin ['samples: 292', 'precision: 0.726027397260274', 'recall: 0.6022727272727273', 'F1: 0.6583850931677019']
+model_1.7.bin ['samples: 292', 'precision: 0.7363013698630136', 'recall: 0.6107954545454546', 'F1: 0.6677018633540373']
+model_1.8.bin ['samples: 292', 'precision: 0.7431506849315068', 'recall: 0.6164772727272727', 'F1: 0.6739130434782609']
+model_1.9.bin ['samples: 292', 'precision: 0.7773972602739726', 'recall: 0.6448863636363636', 'F1: 0.7049689440993789']
+model_2.0.bin ['samples: 292', 'precision: 0.7636986301369864', 'recall: 0.6335227272727273', 'F1: 0.6925465838509317']
+model_2.1.bin ['samples: 292', 'precision: 0.7671232876712328', 'recall: 0.6363636363636364', 'F1: 0.6956521739130435']
+
+
+model_1.0.bin ['samples: 322', 'precision: 0.5962732919254659', 'recall: 0.4987012987012987', 'F1: 0.5431400282885432']
+model_1.2.bin ['samples: 322', 'precision: 0.65527950310559', 'recall: 0.548051948051948', 'F1: 0.5968882602545968']
+model_1.4.bin ['samples: 322', 'precision: 0.7267080745341615', 'recall: 0.6077922077922078', 'F1: 0.6619519094766619']
+model_1.5.bin ['samples: 322', 'precision: 0.6956521739130435', 'recall: 0.5818181818181818', 'F1: 0.6336633663366337']
+model_1.6.bin ['samples: 322', 'precision: 0.7422360248447205', 'recall: 0.6207792207792208', 'F1: 0.6760961810466761']
+model_1.7.bin ['samples: 322', 'precision: 0.7484472049689441', 'recall: 0.625974025974026', 'F1: 0.6817538896746819']
+model_1.8.bin ['samples: 322', 'precision: 0.7484472049689441', 'recall: 0.625974025974026', 'F1: 0.6817538896746819']
+model_1.9.bin ['samples: 322', 'precision: 0.7795031055900621', 'recall: 0.6519480519480519', 'F1: 0.71004243281471']
+model_2.0.bin ['samples: 322', 'precision: 0.7701863354037267', 'recall: 0.6441558441558441', 'F1: 0.7015558698727016']
+model_2.1.bin ['samples: 322', 'precision: 0.7763975155279503', 'recall: 0.6493506493506493', 'F1: 0.7072135785007072']
+
+
+========================================================= Non-duplicated validation set ==================================================
+
+model_1.2.bin ['samples: 303', 'precision: 0.6435643564356436', 'recall: 0.5342465753424658', 'F1: 0.5838323353293414']
+model_1.4.bin ['samples: 303', 'precision: 0.7161716171617162', 'recall: 0.5945205479452055', 'F1: 0.6497005988023953']
+model_1.5.bin ['samples: 303', 'precision: 0.6864686468646864', 'recall: 0.5698630136986301', 'F1: 0.6227544910179641']
+model_1.6.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.6082191780821918', 'F1: 0.6646706586826348']
+model_1.7.bin ['samples: 303', 'precision: 0.7425742574257426', 'recall: 0.6164383561643836', 'F1: 0.6736526946107784']
+model_1.8.bin ['samples: 303', 'precision: 0.7392739273927392', 'recall: 0.6136986301369863', 'F1: 0.6706586826347306']
+model_1.9.bin ['samples: 303', 'precision: 0.7722772277227723', 'recall: 0.6410958904109589', 'F1: 0.7005988023952096']
+model_2.0.bin ['samples: 303', 'precision: 0.759075907590759', 'recall: 0.6301369863013698', 'F1: 0.688622754491018']
+model_2.1.bin ['samples: 303', 'precision: 0.7623762376237624', 'recall: 0.6328767123287671', 'F1: 0.6916167664670658']
+model_2.2.bin ['samples: 303', 'precision: 0.7458745874587459', 'recall: 0.6191780821917808', 'F1: 0.6766467065868264']
+
+================================================= Non-duplicated validation set + 5-point labels ==================================================
+
+model_1.2.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
+model_1.4.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.8125', 'F1: 0.8387096774193549']
+model_1.5.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
+model_1.6.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
+model_1.7.bin ['samples: 30', 'precision: 0.8', 'recall: 0.75', 'F1: 0.7741935483870969']
+model_1.8.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
+model_1.9.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
+model_2.0.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
+model_2.1.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
+model_2.2.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
+
+
+model_1.2.bin ['samples: 302', 'precision: 0.6721854304635762', 'recall: 0.6444444444444445', 'F1: 0.6580226904376014']
+model_1.4.bin ['samples: 302', 'precision: 0.7019867549668874', 'recall: 0.6730158730158731', 'F1: 0.6871961102106969']
+model_1.5.bin ['samples: 302', 'precision: 0.7185430463576159', 'recall: 0.6888888888888889', 'F1: 0.7034035656401946']
+model_1.6.bin ['samples: 302', 'precision: 0.7086092715231788', 'recall: 0.6793650793650794', 'F1: 0.6936790923824959']
+model_1.7.bin ['samples: 302', 'precision: 0.7052980132450332', 'recall: 0.6761904761904762', 'F1: 0.6904376012965965']
+model_1.8.bin ['samples: 302', 'precision: 0.7317880794701986', 'recall: 0.7015873015873015', 'F1: 0.7163695299837927']
+model_1.9.bin ['samples: 302', 'precision: 0.7317880794701986', 'recall: 0.7015873015873015', 'F1: 0.7163695299837927']
+model_2.0.bin ['samples: 302', 'precision: 0.7417218543046358', 'recall: 0.7111111111111111', 'F1: 0.7260940032414911']
+model_2.1.bin ['samples: 302', 'precision: 0.7516556291390728', 'recall: 0.7206349206349206', 'F1: 0.7358184764991895']
+model_2.2.bin ['samples: 302', 'precision: 0.7582781456953642', 'recall: 0.726984126984127', 'F1: 0.7423014586709886']
+
+
+model_1.2.bin ['samples: 303', 'precision: 0.6732673267326733', 'recall: 0.6455696202531646', 'F1: 0.6591276252019386']
+model_1.4.bin ['samples: 303', 'precision: 0.7029702970297029', 'recall: 0.6740506329113924', 'F1: 0.6882067851373183']
+model_1.5.bin ['samples: 303', 'precision: 0.7194719471947195', 'recall: 0.689873417721519', 'F1: 0.7043618739903069']
+model_1.6.bin ['samples: 303', 'precision: 0.7095709570957096', 'recall: 0.680379746835443', 'F1: 0.6946688206785137']
+model_1.7.bin ['samples: 303', 'precision: 0.7062706270627063', 'recall: 0.6772151898734177', 'F1: 0.6914378029079159']
+model_1.8.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.7025316455696202', 'F1: 0.7172859450726979']
+model_1.9.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.7025316455696202', 'F1: 0.7172859450726979']
+model_2.0.bin ['samples: 303', 'precision: 0.7425742574257426', 'recall: 0.7120253164556962', 'F1: 0.7269789983844911']
+model_2.1.bin ['samples: 303', 'precision: 0.7524752475247525', 'recall: 0.7215189873417721', 'F1: 0.7366720516962842']
+model_2.2.bin ['samples: 303', 'precision: 0.759075907590759', 'recall: 0.7278481012658228', 'F1: 0.7431340872374799']
+
+
+model_1.2.bin ['samples: 425', 'precision: 0.6470588235294118', 'recall: 0.5456349206349206', 'F1: 0.5920344456404736']
+model_1.2.bin ['samples: 425', 'precision: 0.691764705882353', 'recall: 0.6621621621621622', 'F1: 0.6766398158803222']
+model_1.4.bin ['samples: 425', 'precision: 0.7129411764705882', 'recall: 0.6824324324324325', 'F1: 0.6973532796317606']
+model_1.5.bin ['samples: 425', 'precision: 0.7294117647058823', 'recall: 0.6981981981981982', 'F1: 0.713463751438435']
+model_1.6.bin ['samples: 425', 'precision: 0.7129411764705882', 'recall: 0.6824324324324325', 'F1: 0.6973532796317606']
+model_1.7.bin ['samples: 425', 'precision: 0.7105882352941176', 'recall: 0.6801801801801802', 'F1: 0.6950517836593786']
+model_1.8.bin ['samples: 425', 'precision: 0.7505882352941177', 'recall: 0.7184684684684685', 'F1: 0.7341772151898734']
+model_1.9.bin ['samples: 425', 'precision: 0.7529411764705882', 'recall: 0.7207207207207207', 'F1: 0.7364787111622554']
+model_2.0.bin ['samples: 425', 'precision: 0.7670588235294118', 'recall: 0.7342342342342343', 'F1: 0.7502876869965478']
+model_2.1.bin ['samples: 425', 'precision: 0.7717647058823529', 'recall: 0.7387387387387387', 'F1: 0.7548906789413118']
+model_2.2.bin ['samples: 425', 'precision: 0.7764705882352941', 'recall: 0.7432432432432432', 'F1: 0.7594936708860759']
+
+model_1.2.bin ['samples: 447', 'precision: 0.6935123042505593', 'recall: 0.6623931623931624', 'F1: 0.6775956284153005']
+model_1.4.bin ['samples: 447', 'precision: 0.7158836689038032', 'recall: 0.6837606837606838', 'F1: 0.6994535519125684']
+model_1.5.bin ['samples: 447', 'precision: 0.7337807606263982', 'recall: 0.7008547008547008', 'F1: 0.7169398907103826']
+model_1.6.bin ['samples: 447', 'precision: 0.7203579418344519', 'recall: 0.688034188034188', 'F1: 0.7038251366120218']
+model_1.7.bin ['samples: 447', 'precision: 0.7158836689038032', 'recall: 0.6837606837606838', 'F1: 0.6994535519125684']
+model_1.8.bin ['samples: 447', 'precision: 0.7539149888143176', 'recall: 0.7200854700854701', 'F1: 0.7366120218579234']
+model_1.9.bin ['samples: 447', 'precision: 0.7539149888143176', 'recall: 0.7200854700854701', 'F1: 0.7366120218579234']
+model_2.0.bin ['samples: 447', 'precision: 0.7695749440715883', 'recall: 0.7350427350427351', 'F1: 0.7519125683060108']
+model_2.1.bin ['samples: 447', 'precision: 0.7718120805369127', 'recall: 0.7371794871794872', 'F1: 0.7540983606557377']
+model_2.2.bin ['samples: 447', 'precision: 0.7785234899328859', 'recall: 0.7435897435897436', 'F1: 0.760655737704918']
+
+model_1.2.bin
+model_1.4.bin
+model_1.5.bin
+model_1.6.bin
+model_1.7.bin
+model_1.8.bin
+model_1.9.bin
+model_2.0.bin
+model_2.1.bin
+model_2.2.bin
+
+model_1.2.bin
+model_1.4.bin
+model_1.5.bin
+model_1.6.bin
+model_1.7.bin
+model_1.8.bin
+model_1.9.bin
+model_2.0.bin
+model_2.1.bin
+model_2.2.bin
+
+
+
PgsFile-0.2.4.dist-info/METADATA
ADDED

@@ -0,0 +1,41 @@
+Metadata-Version: 2.1
+Name: PgsFile
+Version: 0.2.4
+Summary: This module streamlines Python package management, script execution, file handling, web scraping, multimedia downloads, data cleaning, and NLP tasks such as word tokenization and POS tagging. It also assists with generating word lists and plotting data, making these tasks more accessible and convenient for literary students. Whether you need to scrape data from websites, clean text, or analyze language, this module provides user-friendly tools to simplify your workflow.
+Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
+Author: Pan Guisheng
+Author-email: 895284504@qq.com
+License: Educational free
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: Free For Educational Use
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: chardet
+Requires-Dist: pandas
+Requires-Dist: python-docx
+Requires-Dist: pip
+Requires-Dist: requests
+Requires-Dist: fake-useragent
+Requires-Dist: lxml
+Requires-Dist: pimht
+Requires-Dist: pysbd
+Requires-Dist: nlpir-python
+Requires-Dist: pillow
+
+Purpose: This module is designed to make complex tasks accessible and convenient, even for beginners. By providing a unified set of tools, it simplifies the workflow for data collection, processing, and analysis. Whether you're scraping data from the web, cleaning text, or performing NLP tasks, this module ensures you can focus on your research without getting bogged down by technical challenges.
+
+Key Features:
+1. Web Scraping: Easily scrape data from websites and download multimedia content.
+2. Package Management: Install, uninstall, and manage Python packages with simple commands.
+3. Data Retrieval: Extract data from various file formats like text, JSON, TSV, Excel, and HTML (both online and offline).
+4. Data Storage: Write and append data to text files, Excel, JSON, and JSON lines.
+5. File and Folder Processing: Manage file paths, create directories, move or copy files, and search for files with specific keywords.
+6. Data Cleaning: Clean text, handle punctuation, remove stopwords, and prepare data for analysis.
+7. NLP: Perform tokenization, generate n-grams, and create word lists for text analysis.
+8. Math Operations: Format numbers, convert decimals to percentages, and validate data.
+9. Visualization: Process images (e.g., make white pixels transparent) and manage fonts for rendering text.
+
+Author: Pan Guisheng, a PhD student at the Graduate Institute of Interpretation and Translation of Shanghai International Studies University
+E-mail: 895284504@qq.com
{PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
-PgsFile/PgsFile.py,sha256=
-PgsFile/__init__.py,sha256
+PgsFile/PgsFile.py,sha256=V_Pnn5hljeR9xYQ8hyUAmf92140N4ORoQOe-cdBHJos,101212
+PgsFile/__init__.py,sha256=C-uX4tN3J3L5Zr6r8qQx0zwNaG0UXTowxK_K0pd0Jt4,2680
 PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2569,6 +2569,7 @@ PgsFile/Corpora/Parallel/TED_EC_2017-2020/YvetteAlberdingkThijm_2017X公民影
 PgsFile/Corpora/Parallel/TED_EC_2017-2020/ZacharyRWood_2018为什么持有不同意见的人值得被聆听..txt,sha256=4SFYMhlFSHP2aEVvNS1CBeogq0D2lPTE5VhFsZjlZnM,9546
 PgsFile/Corpora/Parallel/TED_EC_2017-2020/ZeynepTufekci_2017G为了让人们点击广告_我们正在建造一个反乌托邦..txt,sha256=S3BSXKsNAX0ugVqBPhmJyaRF8MYAHapDMR12DoBYZgc,32353
 PgsFile/Corpora/Parallel/Xi's Speech_CE_2021/Speech at a Ceremony Marking the Centenary of the CPC.txt,sha256=3suCjs2LF2_Endg2i_hc3GX1N8lTBORlqpMWEKsXFeM,54282
+PgsFile/Corpora/Stopwords/NLPIR.user,sha256=DykLJdr8_cVHrdCnDJES1O5dgmnYqfaSO1_dtAVKYJk,3356
 PgsFile/Corpora/Stopwords/arabic.txt,sha256=yL9id0vdNF20WEvM0buRnRt1ByEeRGJuGDiY3jE7tlQ,1287
 PgsFile/Corpora/Stopwords/bulgarian.txt,sha256=eiIwYk1TU8YcYYPbMPjUzZSZlgd7gl5o7d0LIthzqHQ,2409
 PgsFile/Corpora/Stopwords/catalan.txt,sha256=8OyAOBHfWsEvKuLEphCfdiWhuxyVg1sOWV5gi2DJLwY,699
@@ -2599,6 +2600,7 @@ PgsFile/Corpora/Stopwords/turkish.txt,sha256=uGUvjEm2GR8PuVY_JeHNxhD7cWlNlF7vc3V
 PgsFile/Corpora/Stopwords/ukrainian.txt,sha256=fEzWLTwnWJriILkO-5jSfE2SpqY-GPf_kR4zid3MFUI,4131
 PgsFile/Corpora/Stopwords/vietnamese.txt,sha256=88yRtVMaRSFqas1iGGa6kOGDCZTgtzRPmR3q9dHshdc,20485
 PgsFile/Corpora/Terminology/Chinese_Thought.json,sha256=CdkuF2wLaDC5V3sRefcU1RZwXm4-wTZ-Qfk8r7gsu8I,2301866
+PgsFile/models/NLPIR.user,sha256=DykLJdr8_cVHrdCnDJES1O5dgmnYqfaSO1_dtAVKYJk,3356
 PgsFile/models/czech.pickle,sha256=W6c9KTx9eVOVa88C82lexcHw1Sfyo8OAl_VZM5T6FpA,1265552
 PgsFile/models/danish.pickle,sha256=6il2CgqRl_UspZ54rq_FpvVdBSWPr32xcJsrnrMh7yA,1264725
 PgsFile/models/dutch.pickle,sha256=So4ms9aMRcOOWU0Z4tVndEe_3KpjbTsees_tDpJy1zw,742624
@@ -2610,6 +2612,8 @@ PgsFile/models/german.pickle,sha256=6rSX-ghUExMMj9D7E7kpEokwr-L2om6ocVyV33CI6Xw,
 PgsFile/models/greek.pickle,sha256=IXUqZ2L61c_kb7XEX62ahUhKDo6Bxn5q9vuXPPwn1nw,1953106
 PgsFile/models/italian.pickle,sha256=3LJxfXvl8m6GCpLgWs9psRI6X0UnzXommpq56eZoyAU,658331
 PgsFile/models/malayalam.pickle,sha256=H4z1isvbf0cqxAr_wTZjvkLa-0fBUDDBGt4ERMng5T0,221207
+PgsFile/models/model_reviews2.2.bin,sha256=D6uL8KZIxD0rfWjH0kYEb7z_HE4aTJXpj82HzsCOpuk,1943196
+PgsFile/models/model_reviews_ReadMe.txt,sha256=Q9uLJwudMmsTKfd11l1tOcIP8lwsemIwnAVJG_3SYjU,11433
 PgsFile/models/norwegian.pickle,sha256=5Kl_j5oDoDON10a8yJoK4PVK5DuDX6N9g-J54cp5T68,1259779
 PgsFile/models/polish.pickle,sha256=FhJ7bRCTNCej6Q-yDpvlPh-zcf95pzDBAwc07YC5DJI,2042451
 PgsFile/models/portuguese.pickle,sha256=uwG_fHmk6twheLvSCWZROaDks48tHET-8Jfek5VRQOA,649051
@@ -2618,8 +2622,14 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
 PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
 PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
 PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
-PgsFile
-PgsFile
-PgsFile
-PgsFile
-PgsFile
+PgsFile/models/fonts/DejaVuSans.ttf,sha256=faGVp0xVvvmI0NSPlQi9XYSUJcF3Dbpde_xs6e2EiVQ,757076
+PgsFile/models/fonts/书体坊赵九江钢笔行书体.ttf,sha256=fTOv4FFMnYtN1zCZghJ6-P1pzznA5qqoujwpDFY63Ek,3140656
+PgsFile/models/fonts/全新硬笔楷书简.ttf,sha256=mPemGYMpgQxvFL1pFjjnyUMIprHzcoOaw8oeZQ4k1x0,2397296
+PgsFile/models/fonts/全新硬笔行书简.ttf,sha256=bUtbl71eK_ellp1z0tCmmR_P-JhqVFIpzeuRlrEBo9g,2611516
+PgsFile/models/fonts/博洋行书3500.TTF,sha256=VrgeHr8cgOL6JD05QyuD9ZSyw4J2aIVxKxW8zSajq6Q,4410732
+PgsFile/models/fonts/陆柬之行书字体.ttf,sha256=Zpd4Z7E9w-Qy74yklXHk4vM7HOtHuQgllvygxZZ1Hvs,1247288
+PgsFile-0.2.4.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
+PgsFile-0.2.4.dist-info/METADATA,sha256=JC1a8Xrh3tDt5-HNnCJY3V4tNYKstu83V2qo_FqkATY,2711
+PgsFile-0.2.4.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+PgsFile-0.2.4.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
+PgsFile-0.2.4.dist-info/RECORD,,
PgsFile-0.2.2.dist-info/METADATA
DELETED

@@ -1,79 +0,0 @@
-Metadata-Version: 2.1
-Name: PgsFile
-Version: 0.2.2
-Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, NLP tasks like Chinese word tokenization and POS tagging, and word list generation for literary students, making it more accessible and convenient to use.
-Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
-Author: Pan Guisheng
-Author-email: 895284504@qq.com
-License: Educational free
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: Free For Educational Use
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: chardet
-Requires-Dist: pandas
-Requires-Dist: python-docx
-Requires-Dist: pip
-Requires-Dist: requests
-Requires-Dist: fake-useragent
-Requires-Dist: lxml
-Requires-Dist: pimht
-Requires-Dist: pysbd
-Requires-Dist: nlpir-python
-
-Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
-
-
-Function 1: Enables efficient data retrieval and storage in files with a single line of code.
-
-Function 2: Facilitates retrieval of all absolute file paths and file names in any folder (including sub-folders) with a single line of code using "FilePath" and "FileName" functions.
-
-Function 3: Simplifies creation of word lists and frequency sorting from a file or batch of files using "word_list" and "batch_word_list" functions in PgsFile.
-
-Function 4: Pgs-Corpora is a comprehensive language resource included in this library, featuring a monolingual corpus of native and translational Chinese and native and non-native English, as well as a bi-directional parallel corpus of Chinese and English texts covering financial, legal, political, academic, and sports news topics. Additionally, the library includes a collection of 8774 English idioms, stopwords for 28 languages, and a termbank of Chinese thought and culture.
-
-Function 5: This library provides support for common text cleaning tasks, such as removing empty text, empty lines, and folders containing empty text. It also offers functions for converting full-width characters to half-width characters and vice versa, as well as standardizing the format of Chinese and English punctuation. These features can help improve the quality and consistency of text data used in natural language processing tasks.
-
-Function 6: It also manages Python package installations and uninstallations, and allows running scripts and commands in Python interactive command lines instead of Windows command prompt.
-
-Function 7: Download audiovisual files like videos, images, and audio using audiovisual_downloader, which is extremely useful and efficient. Additionally, scrape newspaper data with PGScraper, a highly efficient tool for this purpose.
-
-Table 1: The directory and size of Pgs-Corpora
-├── Idioms (1, 171.78 KB)
-├── Monolingual (2197, 63.65 MB)
-│   ├── Chinese (456, 15.27 MB)
-│   │   ├── People's Daily 20130605 (396, 1.38 MB)
-│   │   │   ├── Raw (132, 261.73 KB)
-│   │   │   ├── Seg_only (132, 471.47 KB)
-│   │   │   └── Tagged (132, 675.30 KB)
-│   │   └── Translational Fictions (60, 13.89 MB)
-│   └── English (1741, 48.38 MB)
-│       ├── Native (65, 44.14 MB)
-│       │   ├── A Short Collection of British Fiction (27, 33.90 MB)
-│       │   └── Preschoolers- and Teenagers-oriented Texts in English (36, 10.24 MB)
-│       ├── Non-native (1675, 3.63 MB)
-│       │   └── Shanghai Daily (1675, 3.63 MB)
-│       │       └── Business_2019 (1675, 3.63 MB)
-│       │           ├── 2019-01-01 (1, 3.35 KB)
-│       │           ├── 2019-01-02 (1, 3.65 KB)
-│       │           ├── 2019-01-03 (7, 10.90 KB)
-│       │           ├── 2019-01-04 (5, 9.63 KB)
-│       │           └── 2019-01-07 (4, 9.50 KB)
-│       │           └── ... (and 245 more directories)
-│       └── Translational (1, 622.57 KB)
-├── Parallel (371, 24.67 MB)
-│   ├── HK Financial and Legal EC Parallel Corpora (5, 19.17 MB)
-│   ├── New Year Address_CE_2006-2021 (15, 147.49 KB)
-│   ├── Sports News_CE_2010 (20, 66.42 KB)
-│   ├── TED_EC_2017-2020 (330, 5.24 MB)
-│   └── Xi's Speech_CE_2021 (1, 53.01 KB)
-├── Stopwords (28, 88.09 KB)
-└── Terminology (1, 2.20 MB)
-
-...
-
-
-Author: Pan Guisheng, a PhD student at the Graduate Institute of Interpretation and Translation of Shanghai International Studies University
-E-mail: 895284504@qq.com

File without changes

File without changes

File without changes