PgsFile 0.2.3-py3-none-any.whl → 0.2.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic.
- PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/HK-Press releases of the Financial Secretary Office (2007-2019).tsv +7348 -0
- PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/Hong Kong bilingual court decisions (1997-2017).tsv +20000 -0
- PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/HongKong-Legislation.tsv +20000 -0
- PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/Offering documents of financial products (updated as of October 2018).tsv +20000 -0
- PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/Speeches delivered by SFC Executives (2006-2019).tsv +4680 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2006.txt +46 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2008.txt +48 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2009.txt +42 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2010.txt +42 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2011.txt +38 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2012.txt +28 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2013.txt +42 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2014.txt +68 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2015.txt +106 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2016.txt +82 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2017.txt +90 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2018.txt +136 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2019.txt +112 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2020.txt +124 -0
- PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2021.txt +94 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100201_000150_en.txt +6 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100201_000150_zh.txt +6 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100213_000135_en.txt +17 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100213_000135_zh.txt +17 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100215_000445_en.txt +10 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100215_000445_zh.txt +10 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000135_en.txt +12 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000135_zh.txt +12 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000205_en.txt +5 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000205_zh.txt +5 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000548_en.txt +9 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000548_zh.txt +9 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100225_001011_en.txt +8 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100225_001011_zh.txt +8 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100227_000129_en.txt +8 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100227_000129_zh.txt +8 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100227_000649_en.txt +13 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100227_000649_zh.txt +13 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100301_000549_en.txt +8 -0
- PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100301_000549_zh.txt +8 -0
- PgsFile/Corpora/Corpora/Parallel/Xi's Speech_CE_2021/Speech at a Ceremony Marking the Centenary of the CPC.txt +144 -0
- PgsFile/PgsFile.py +516 -33
- PgsFile/__init__.py +13 -3
- PgsFile/models/NLPIR.user +0 -0
- PgsFile/models/fonts/DejaVuSans.ttf +0 -0
- PgsFile/models/fonts/书体坊赵九江钢笔行书体.ttf +0 -0
- PgsFile/models/fonts/全新硬笔楷书简.ttf +0 -0
- PgsFile/models/fonts/全新硬笔行书简.ttf +0 -0
- PgsFile/models/fonts/博洋行书3500.TTF +0 -0
- PgsFile/models/fonts/陆柬之行书字体.ttf +0 -0
- PgsFile/models/model_reviews2.2.bin +0 -0
- PgsFile/models/model_reviews_ReadMe.txt +134 -0
- PgsFile-0.2.5.dist-info/METADATA +41 -0
- {PgsFile-0.2.3.dist-info → PgsFile-0.2.5.dist-info}/RECORD +57 -7
- PgsFile-0.2.3.dist-info/METADATA +0 -79
- {PgsFile-0.2.3.dist-info → PgsFile-0.2.5.dist-info}/LICENSE +0 -0
- {PgsFile-0.2.3.dist-info → PgsFile-0.2.5.dist-info}/WHEEL +0 -0
- {PgsFile-0.2.3.dist-info → PgsFile-0.2.5.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
@@ -103,7 +103,7 @@ def get_data_text(path):
     else:
         return None
 
-def get_data_lines(path):
+def get_data_lines(path, no_line_breaks=False):
     '''
     Parameters
     ----------
@@ -133,7 +133,10 @@ def get_data_lines(path):
     # Read the entire file using the detected encoding
     if encoding:
         with open(path, 'r', encoding=encoding, errors="ignore") as f:
-            lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
+            if no_line_breaks is False:
+                lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
+            else:
+                lines = f.readlines()
         return lines
     else:
         return None
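Note: a minimal usage sketch of the new no_line_breaks switch (the sample path is hypothetical, and this assumes get_data_lines is still re-exported at package level as in earlier releases):

    from PgsFile import get_data_lines

    # Default behaviour: strip line breaks and drop empty lines.
    lines = get_data_lines("corpus/2021.txt")

    # New in 0.2.5: keep the raw lines, including trailing newlines and blanks.
    raw_lines = get_data_lines("corpus/2021.txt", no_line_breaks=True)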
@@ -197,13 +200,51 @@ def get_data_excel(excel_path,column_id,sheet_name=None):
     inter=df.iloc[0:,column_id] #提取第二列所有行
     return list(inter)
 
-def write_to_excel(excel_path,
+def write_to_excel(excel_path, data, sheet_name=None, index=None):
     '''
     Parameters
     ----------
     excel_path : TYPE
         DESCRIPTION. results.xlsx
 
+    data : TYPE, dict
+        DESCRIPTION. data = {'翻译': 24, '教学': 8, '数智': 6, '时代': 6, '财经': 6, '新': 4}
+
+    sheet_name : TYPE, optional
+        DESCRIPTION. The default is None.
+
+    index : TYPE, optional
+        DESCRIPTION. The default is None.
+
+    Returns
+    -------
+    None.
+
+    '''
+    import pandas as pd
+    if sheet_name is None:
+        sheet_name="sheet1"
+    else:
+        sheet_name=sheet_name
+    if index is None:
+        index=False
+    else:
+        index=True
+
+    col = list(data.keys())
+    freq = list(data.values())
+    dic_of_list={"items": col, "counts": freq}
+
+    df=pd.DataFrame(dic_of_list)
+    df.style.to_excel(excel_path, sheet_name=sheet_name,startcol=0, index=index)
+
+def write_to_excel_normal(excel_path,dic_of_list,sheet_name=None,index=None):
+    '''
+    Parameters
+    ----------
+    excel_path : TYPE
+        DESCRIPTION. D:\results.xlsx
+
     dic_of_list : TYPE
         DESCRIPTION. {"col":["a","b","c","d"],"freq":[1,2,3,4]}
 
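Note: a sketch of how the reworked Excel writers might be called, following the docstring examples above (output paths are hypothetical). write_to_excel now takes a plain frequency dict and writes "items"/"counts" columns, while write_to_excel_normal keeps the old dict-of-lists layout:

    from PgsFile import write_to_excel, write_to_excel_normal

    freq = {'翻译': 24, '教学': 8, '数智': 6}                  # token -> count
    write_to_excel("results.xlsx", freq)

    table = {"col": ["a", "b", "c", "d"], "freq": [1, 2, 3, 4]}
    write_to_excel_normal("results_normal.xlsx", table)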
@@ -471,6 +512,18 @@ def get_directory_tree_with_meta(start_path, indent='', show_meta=False, max_dir
             print(f"{indent}└── ... (and {remaining_directories} more directories)")
     # current_level=-1 will show all folders' info.
 
+def get_full_path(*path_components):
+    """
+    Combines multiple path components into a single, full path using os.path.join.
+
+    Args:
+        *path_components: Variable number of path components (strings).
+
+    Returns:
+        str: The combined full path.
+    """
+    return os.path.join(*path_components)
+
 def get_subfolder_path(parent_folder, subfolder_name):
     import os
     subfolder_name=subfolder_name.strip()
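Note: the new get_full_path helper is a thin wrapper over os.path.join; a quick sketch (paths are made up):

    from PgsFile import get_full_path

    path = get_full_path("D:/corpora", "Parallel", "2021.txt")
    # equivalent to os.path.join("D:/corpora", "Parallel", "2021.txt")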
@@ -553,7 +606,6 @@ def batch_word_list(input_root):
     sorted_words=sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
     return sorted_words
 
-
 def clean_list(meta):
     """
     Parameters
@@ -576,7 +628,6 @@ def clean_list(meta):
 
     yhd=["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36','Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)','Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)','Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+','Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0','Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)','Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1','Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13','Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6','NOKIA5700/ UCWEB7.0.2.37/28/999','Openwave/ UCWEB7.0.2.37/28/999','Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10','Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11','Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11','UCWEB7.0.2.37/28/999']
 
-
 def source_path(relative_path):
     import sys,os
     if getattr(sys, 'frozen', False):
@@ -590,7 +641,6 @@ def next_folder_names(folder):
     folder_namelist=next(os.walk(folder))[1]
     return folder_namelist
 
-
 def remove_empty_txts(folder_path):
     import os
     files=FilePath(folder_path)
@@ -797,7 +847,6 @@ def cs1(text):
     sentences=sentences
     return sentences
 
-
 def word_tokenize(text, pos_tagged=False):
     '''
     Parameters
@@ -838,7 +887,7 @@ def word_tokenize(text, pos_tagged=False):
     else:
         try:
             if "Can not open" in str(err):
-                user_folder=get_library_location("PgsFile")+"/PgsFile/
+                user_folder=get_library_location("PgsFile")+"/PgsFile/models"
                 destination_folder=get_library_location("nlpir-python")+"/nlpir/Data"
                 source_file=find_user_files_in_upper_folder(user_folder, "NLPIR")[0]
                 copy_file(source_file, destination_folder)
@@ -850,6 +899,296 @@ def word_tokenize(text, pos_tagged=False):
 
     return words
 
+import re
+from abc import ABC, abstractmethod
+from typing import Iterator, List, Tuple
+class TokenizerI(ABC):
+    """
+    A processing interface for tokenizing a string.
+    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
+    """
+
+    @abstractmethod
+    def tokenize(self, s: str) -> List[str]:
+        """
+        Return a tokenized copy of *s*.
+
+        :rtype: List[str]
+        """
+        if overridden(self.tokenize_sents):
+            return self.tokenize_sents([s])[0]
+
+    def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
+        """
+        Identify the tokens using integer offsets ``(start_i, end_i)``,
+        where ``s[start_i:end_i]`` is the corresponding token.
+
+        :rtype: Iterator[Tuple[int, int]]
+        """
+        raise NotImplementedError()
+
+    def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
+        """
+        Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
+
+            return [self.tokenize(s) for s in strings]
+
+        :rtype: List[List[str]]
+        """
+        return [self.tokenize(s) for s in strings]
+
+    def span_tokenize_sents(
+        self, strings: List[str]
+    ) -> Iterator[List[Tuple[int, int]]]:
+        """
+        Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
+
+            return [self.span_tokenize(s) for s in strings]
+
+        :yield: List[Tuple[int, int]]
+        """
+        for s in strings:
+            yield list(self.span_tokenize(s))
+
+class MacIntyreContractions:
+    """
+    List of contractions adapted from Robert MacIntyre's tokenizer.
+    """
+
+    CONTRACTIONS2 = [
+        r"(?i)\b(can)(?#X)(not)\b",
+        r"(?i)\b(d)(?#X)('ye)\b",
+        r"(?i)\b(gim)(?#X)(me)\b",
+        r"(?i)\b(gon)(?#X)(na)\b",
+        r"(?i)\b(got)(?#X)(ta)\b",
+        r"(?i)\b(lem)(?#X)(me)\b",
+        r"(?i)\b(more)(?#X)('n)\b",
+        r"(?i)\b(wan)(?#X)(na)(?=\s)",
+    ]
+    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
+
+class NLTKWordTokenizer(TokenizerI):
+    """
+    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
+
+    This is the method that is invoked by ``word_tokenize()``. It assumes that the
+    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
+
+    The tokenizer is "destructive" such that the regexes applied will munge the
+    input string to a state beyond re-construction. It is possible to apply
+    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
+    `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
+    revert to the original string.
+    """
+
+    # Starting quotes.
+    STARTING_QUOTES = [
+        (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
+        (re.compile(r"^\""), r"``"),
+        (re.compile(r"(``)"), r" \1 "),
+        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
+        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
+    ]
+
+    # Ending quotes.
+    ENDING_QUOTES = [
+        (re.compile("([»”’])", re.U), r" \1 "),
+        (re.compile(r"''"), " '' "),
+        (re.compile(r'"'), " '' "),
+        (re.compile(r"\s+"), " "),
+        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
+        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
+    ]
+
+    # For improvements for starting/closing quotes from TreebankWordTokenizer,
+    # see discussion on https://github.com/nltk/nltk/pull/1437
+    # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
+    # - chevron quotes u'\xab' and u'\xbb'
+    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+    # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
+    # Also, behavior of splitting on clitics now follows Stanford CoreNLP
+    # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
+
+    # Punctuation.
+    PUNCTUATION = [
+        (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
+        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
+        (re.compile(r"([:,])$"), r" \1 "),
+        (
+            re.compile(r"\.{2,}", re.U),
+            r" \g<0> ",
+        ), # See https://github.com/nltk/nltk/pull/2322
+        (re.compile(r"[;@#$%&]"), r" \g<0> "),
+        (
+            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
+            r"\1 \2\3 ",
+        ), # Handles the final period.
+        (re.compile(r"[?!]"), r" \g<0> "),
+        (re.compile(r"([^'])' "), r"\1 ' "),
+        (
+            re.compile(r"[*]", re.U),
+            r" \g<0> ",
+        ), # See https://github.com/nltk/nltk/pull/2322
+    ]
+
+    # Pads parentheses
+    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
+
+    # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
+    CONVERT_PARENTHESES = [
+        (re.compile(r"\("), "-LRB-"),
+        (re.compile(r"\)"), "-RRB-"),
+        (re.compile(r"\["), "-LSB-"),
+        (re.compile(r"\]"), "-RSB-"),
+        (re.compile(r"\{"), "-LCB-"),
+        (re.compile(r"\}"), "-RCB-"),
+    ]
+
+    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
+
+    # List of contractions adapted from Robert MacIntyre's tokenizer.
+    _contractions = MacIntyreContractions()
+    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
+    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
+
+    def tokenize(
+        self, text: str, convert_parentheses: bool = False, return_str: bool = False
+    ) -> List[str]:
+        r"""Return a tokenized copy of `text`.
+
+        >>> from nltk.tokenize import NLTKWordTokenizer
+        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
+        >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
+        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
+        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
+        'of', 'them.', 'Thanks', '.']
+        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
+        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
+        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
+        'of', 'them.', 'Thanks', '.']
+
+
+        :param text: A string with a sentence or sentences.
+        :type text: str
+        :param convert_parentheses: if True, replace parentheses to PTB symbols,
+            e.g. `(` to `-LRB-`. Defaults to False.
+        :type convert_parentheses: bool, optional
+        :param return_str: If True, return tokens as space-separated string,
+            defaults to False.
+        :type return_str: bool, optional
+        :return: List of tokens from `text`.
+        :rtype: List[str]
+        """
+        if return_str:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
+        for regexp, substitution in self.STARTING_QUOTES:
+            text = regexp.sub(substitution, text)
+
+        for regexp, substitution in self.PUNCTUATION:
+            text = regexp.sub(substitution, text)
+
+        # Handles parentheses.
+        regexp, substitution = self.PARENS_BRACKETS
+        text = regexp.sub(substitution, text)
+        # Optionally convert parentheses
+        if convert_parentheses:
+            for regexp, substitution in self.CONVERT_PARENTHESES:
+                text = regexp.sub(substitution, text)
+
+        # Handles double dash.
+        regexp, substitution = self.DOUBLE_DASHES
+        text = regexp.sub(substitution, text)
+
+        # add extra space to make things easier
+        text = " " + text + " "
+
+        for regexp, substitution in self.ENDING_QUOTES:
+            text = regexp.sub(substitution, text)
+
+        for regexp in self.CONTRACTIONS2:
+            text = regexp.sub(r" \1 \2 ", text)
+        for regexp in self.CONTRACTIONS3:
+            text = regexp.sub(r" \1 \2 ", text)
+
+        # We are not using CONTRACTIONS4 since
+        # they are also commented out in the SED scripts
+        # for regexp in self._contractions.CONTRACTIONS4:
+        #     text = regexp.sub(r' \1 \2 \3 ', text)
+
+        return text.split()
+
+    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
+        r"""
+        Returns the spans of the tokens in ``text``.
+        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.
+
+        >>> from nltk.tokenize import NLTKWordTokenizer
+        >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
+        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
+        >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
+        True
+        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
+        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
+        >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
+        True
+
+        :param text: A string with a sentence or sentences.
+        :type text: str
+        :yield: Tuple[int, int]
+        """
+        raw_tokens = self.tokenize(text)
+
+        # Convert converted quotes back to original double quotes
+        # Do this only if original text contains double quote(s) or double
+        # single-quotes (because '' might be transformed to `` if it is
+        # treated as starting quotes).
+        if ('"' in text) or ("''" in text):
+            # Find double quotes and converted quotes
+            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
+
+            # Replace converted quotes back to double quotes
+            tokens = [
+                matched.pop(0) if tok in ['"', "``", "''"] else tok
+                for tok in raw_tokens
+            ]
+        else:
+            tokens = raw_tokens
+
+        yield from align_tokens(tokens, text)
+
+# Standard word tokenizer.
+_treebank_word_tokenizer = NLTKWordTokenizer()
+def word_tokenize2(text, preserve_line=False):
+    """
+    Return a tokenized copy of *text*,
+    using NLTK's recommended word tokenizer
+    (currently an improved :class:`.TreebankWordTokenizer`
+    along with :class:`.PunktSentenceTokenizer`
+    for the specified language).
+
+    :param text: text to split into words
+    :type text: str
+    :param language: the model name in the Punkt corpus
+    :type language: str
+    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
+    :type preserve_line: bool
+    """
+    sentences = [text] if preserve_line else sent_tokenize(text)
+    return [
+        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
+    ]
+
 def pad_sequence(
     sequence,
     n,
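Note: the block above vendors NLTK's NLTKWordTokenizer and exposes it through the new word_tokenize2 helper; it still relies on sent_tokenize, align_tokens, warnings and overridden being defined elsewhere in the module. A hedged usage sketch (expected output follows the doctest above):

    from PgsFile import word_tokenize2

    s = "Good muffins cost $3.88 in New York."
    # preserve_line=True skips sentence splitting, so no Punkt model is needed.
    print(word_tokenize2(s, preserve_line=True))
    # e.g. ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']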
@@ -889,9 +1228,7 @@ def pad_sequence(
     sequence=chain(sequence, (right_pad_symbol,) * (n - 1))
     return sequence
 
-
 # add a flag to pad the sequence so we get peripheral ngrams?
-
 def ngrams(
     sequence,
     n,
@@ -954,7 +1291,6 @@ def ngrams(
         yield tuple(history)
         del history[0]
 
-
 def bigrams(sequence, **kwargs):
     """
     Return the bigrams generated from a sequence of items, as an iterator.
@@ -974,7 +1310,6 @@ def bigrams(sequence, **kwargs):
     for item in ngrams(sequence, 2, **kwargs):
         yield item
 
-
 def trigrams(sequence, **kwargs):
     """
     Return the trigrams generated from a sequence of items, as an iterator.
@@ -994,7 +1329,6 @@ def trigrams(sequence, **kwargs):
     for item in ngrams(sequence, 3, **kwargs):
         yield item
 
-
 def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
     """
     Returns all possible ngrams generated from a sequence of items, as an iterator.
@@ -1148,6 +1482,18 @@ def uninstall_package(package_name: str):
     import pip
     pip.main(['uninstall', package_name, '-y'])
 
+# A list of conda configuration commands.
+conda_mirror_commands=[
+    "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple", # Windows recommended
+    "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/", # MacOS recommended
+    "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
+    "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/",
+    "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/fastai/",
+    "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/",
+    "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/",
+    "pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/"
+]
+
 def DirList(root_dir: str) -> tuple:
     """
     List the contents of a directory and return two lists containing the names of the directories and files in the directory.
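Note: conda_mirror_commands is plain data; nothing in this release runs it automatically. One way it might be applied, assuming the module's existing run_command helper is used to execute each entry:

    from PgsFile import conda_mirror_commands, run_command

    for cmd in conda_mirror_commands:
        # switches pip/conda to the TUNA or Aliyun mirrors
        print(run_command(cmd))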
@@ -1251,10 +1597,8 @@ def sort_strings_with_embedded_numbers(strings: list) -> list:
     """
     # Sort the strings using the extract_numbers() function as the key
     sorted_strings=sorted(strings, key=extract_numbers)
-
     return sorted_strings
 
-
 def run_command(command: str) -> str:
     """
     Run a command and return its output as a string.
@@ -1444,7 +1788,6 @@ class PGScraper(object):
         return all_want_list
 
 
-
 
 # -*- coding: utf-8 -*-
 """
@@ -1490,7 +1833,6 @@ class ProgressData(object):
             self.size, self.unit, progress, speed, self.unit))
         print('%50s'%('/'*int((1-progress)*50)))
 
-
 def levenshtein_distance(s, t):
     m, n=len(s), len(t)
     if m < n:
@@ -1516,7 +1858,7 @@ pgs_abbres_words=['A.B.','A.D.','A.G.','A.I.','A.M.','A.P.','A.V.','AFP.','Ala.'
 def clean_text(text): #清洗除了句号以外的其他标点符号问题
     # 在标点符号右边邻接单词前添加空格
     import re
-    text=replace_chinese_punctuation_with_english(text)
+    # text=replace_chinese_punctuation_with_english(text)
     text=re.sub(r'(?<=[\?\!\,\;\:\)\]\}])\s*(?=\w)', ' ', text)
     # 删除标点符号与左边单词之间的空格
     text=re.sub(r'\s*([\?\!\,\;\:\)\]\}\>])', r'\1', text)
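Note: with the call to replace_chinese_punctuation_with_english commented out, clean_text now only normalises spacing around Western punctuation and leaves Chinese punctuation untouched. A small sketch (output is approximate, since the rest of the function is not shown in this hunk):

    from PgsFile import clean_text

    print(clean_text("trade rose ;exports fell ,imports grew"))
    # roughly: "trade rose; exports fell, imports grew"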
@@ -1532,24 +1874,45 @@ def clean_text(text): #清洗除了句号以外的其他标点符号问题
 
 def clean_text_with_abbreviations(text):
     import re
-
-
-
+
+    # 按行分割文本
+    lines = text.splitlines()
+
+    # 清洗每一行
+    cleaned_lines = []
+    for line in lines:
+        cleaned_line = clean_line_with_abbreviations(line)
+        cleaned_lines.append(cleaned_line)
+
+    # 将清洗后的行重新组合成文本
+    cleaned_text = '\n'.join(cleaned_lines)
+    return cleaned_text
+
+def clean_line_with_abbreviations(line):
+    import re
+
+    # 清洗除了句号以外的其他标点符号问题
+    line = clean_text(line)
+
+    matches = []
+    for seg in line.split():
         if "." in seg:
-            if seg.endswith(".")
+            if not seg.endswith("."):
                 matches.append(seg)
             elif seg.endswith("..") and "..." not in seg:
-
-
+                line = line.replace("..", ".")
+
     for match in matches:
         if any(word in match for word in pgs_abbres_words):
-            inter=match.split(".")
-            new_match="".join([w+"." for w in inter[0:-1]])+" "+inter[-1]
-
+            inter = match.split(".")
+            new_match = "".join([w + "." for w in inter[0:-1]]) + " " + inter[-1]
+            line = line.replace(match, new_match)
         else:
-
-
-
+            line = line.replace(match, match.replace(".", ". "))
+
+    line = re.sub(r'\s+\.', '.', line)
+    return line
+
 
 import shutil
 def move_file(source_file, destination_folder, new_file_name=None):
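Note: the rewritten clean_text_with_abbreviations now works line by line via the new clean_line_with_abbreviations, protecting entries from pgs_abbres_words while re-spacing periods fused to the following word. A hedged sketch using an abbreviation that appears in the list above:

    from PgsFile import clean_text_with_abbreviations

    print(clean_text_with_abbreviations("Built in 450 A.D.it still stands."))
    # roughly: "Built in 450 A.D. it still stands."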
@@ -1597,7 +1960,6 @@ def copy_file(source_file, destination_folder, new_file_name=None):
 
     # Copy the file to the destination folder
     shutil.copy2(source_file, destination_file)
-
 
 def check_empty_cells(file_path):
     """
@@ -1636,7 +1998,6 @@ def makefile(file_path):
     else:
         write_to_txt(file_path, "")
 
-
 def save_dict_to_excel(data, output_file, headers=None):
     """
     Save Python dictionary data into an Excel .xlsx file with custom headers.
@@ -1845,4 +2206,126 @@ def get_stopwords(language=None):
         return en_stopwords
     else:
         lang_stopwords=get_data_lines(find_txt_files_with_keyword(stopwords_path, language)[0])
-        return lang_stopwords
+        return lang_stopwords
+
+from PIL import Image
+def replace_white_with_transparency(input_path, output_path):
+    """
+    This function opens an image, replaces all white pixels with transparent pixels.
+
+    Parameters:
+    input_path (str): The path to the input image file.
+    output_path (str): The path to save the output image file.
+    """
+    # 从RGB(24位)模式转成RGBA(32位)模式
+    img = Image.open(input_path).convert('RGBA')
+    W, L = img.size
+    white_pixel = (0, 0, 0, 0) # white
+    for h in range(W):
+        for i in range(L):
+            if img.getpixel((h, i)) == white_pixel:
+                img.putpixel((h, i), (255, 255, 255, 0)) # make it transparent
+    img.save(output_path)
+
+def get_font_path(font_name=None):
+    '''
+    Retrieves the file path of a specified font.
+
+    Parameters
+    ----------
+    font_name : str, optional
+        The name of the font file (must end with ".ttf"). If provided, it should match one of the available fonts in the library, such as:
+        - 'DejaVuSans.ttf'
+        - '书体坊赵九江钢笔行书体.ttf'
+        - '全新硬笔楷书简.ttf'
+        - '全新硬笔行书简.ttf'
+        - '博洋行书3500.TTF'
+        - '陆柬之行书字体.ttf'
+        The default is None, which will return the path for 'DejaVuSans.ttf'.
+
+    Returns
+    -------
+    font_path : str
+        The full file path of the specified font. If no font name is provided, the default path for 'DejaVuSans.ttf' will be returned.
+        Example: "C:/Windows/Fonts/simhei.ttf"
+    '''
+
+    font_folder = get_library_location("PgsFile") + "/PgsFile/models/fonts"
+    if font_name is None:
+        font_path = get_full_path(font_folder, "DejaVuSans.ttf")
+    else:
+        font_path = get_full_path(font_folder, font_name)
+    return font_path
+
+simhei_default_font_path_MacOS_Windows=["/System/Library/Fonts/STHeiti Medium.ttc",
+                                        r"C:\Windows\Fonts\simhei.ttf", # Use a font that supports Chinese characters
+                                        ]
+
+
+def get_env_variable(variable_name):
+    # Get the value of the specified environment variable
+    value = os.getenv(variable_name)
+
+    # Check if the environment variable is set
+    if value is not None:
+        print(f"{variable_name} is set to: {value}")
+    else:
+        print(f"{variable_name} is not set.")
+
+import subprocess
+def set_permanent_environment_variable(variable_name, variable_value, system_wide=False):
+    """
+    Sets a permanent environment variable on Windows using the `setx` command.
+
+    Args:
+        variable_name (str): The name of the environment variable.
+        variable_value (str): The value to set for the environment variable.
+        system_wide (bool): If True, sets the variable system-wide (requires admin privileges).
+                            If False, sets the variable for the current user only.
+    """
+    try:
+        # Construct the setx command
+        command = ['setx', variable_name, variable_value]
+        if system_wide:
+            command.append('/M') # Add /M flag for system-wide variables
+
+        # Run the command
+        subprocess.run(command, shell=True, check=True)
+
+        print(f'Permanent environment variable {variable_name} set to {variable_value} '
+              f'({"system-wide" if system_wide else "user-level"}).')
+    except subprocess.CalledProcessError as e:
+        print(f'Failed to set environment variable: {e}')
+    except Exception as e:
+        print(f'An error occurred: {e}')
+
+def delete_permanent_environment_variable(variable_name, system_wide=False):
+    """
+    Deletes a permanent environment variable on Windows using the `reg` command.
+
+    Args:
+        variable_name (str): The name of the environment variable to delete.
+        system_wide (bool): If True, deletes the variable system-wide (requires admin privileges).
+                            If False, deletes the variable for the current user only.
+    """
+    try:
+        # Determine the registry key based on the scope
+        if system_wide:
+            reg_key = r'HKLM\SYSTEM\CurrentControlSet\Control\Session Manager\Environment'
+        else:
+            reg_key = r'HKCU\Environment'
+
+        # Run the `reg delete` command to remove the variable
+        subprocess.run(
+            ['reg', 'delete', reg_key, '/v', variable_name, '/f'],
+            shell=True,
+            check=True
+        )
+
+        print(f'Permanent environment variable {variable_name} deleted '
+              f'({"system-wide" if system_wide else "user-level"}).')
+    except subprocess.CalledProcessError as e:
+        print(f'Failed to delete environment variable: {e}')
+    except Exception as e:
+        print(f'An error occurred: {e}')
+