PgsFile 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of PgsFile might be problematic; see the release's advisory on the registry page for more details.

Files changed (58)
  1. PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/HK-Press releases of the Financial Secretary Office (2007-2019).tsv +7348 -0
  2. PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/Hong Kong bilingual court decisions (1997-2017).tsv +20000 -0
  3. PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/HongKong-Legislation.tsv +20000 -0
  4. PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/Offering documents of financial products (updated as of October 2018).tsv +20000 -0
  5. PgsFile/Corpora/Corpora/Parallel/HK Financial and Legal EC Parallel Corpora/Speeches delivered by SFC Executives (2006-2019).tsv +4680 -0
  6. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2006.txt +46 -0
  7. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2008.txt +48 -0
  8. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2009.txt +42 -0
  9. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2010.txt +42 -0
  10. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2011.txt +38 -0
  11. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2012.txt +28 -0
  12. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2013.txt +42 -0
  13. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2014.txt +68 -0
  14. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2015.txt +106 -0
  15. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2016.txt +82 -0
  16. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2017.txt +90 -0
  17. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2018.txt +136 -0
  18. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2019.txt +112 -0
  19. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2020.txt +124 -0
  20. PgsFile/Corpora/Corpora/Parallel/New Year Address_CE_2006-2021/2021.txt +94 -0
  21. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100201_000150_en.txt +6 -0
  22. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100201_000150_zh.txt +6 -0
  23. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100213_000135_en.txt +17 -0
  24. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100213_000135_zh.txt +17 -0
  25. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100215_000445_en.txt +10 -0
  26. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100215_000445_zh.txt +10 -0
  27. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000135_en.txt +12 -0
  28. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000135_zh.txt +12 -0
  29. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000205_en.txt +5 -0
  30. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000205_zh.txt +5 -0
  31. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000548_en.txt +9 -0
  32. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100222_000548_zh.txt +9 -0
  33. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100225_001011_en.txt +8 -0
  34. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100225_001011_zh.txt +8 -0
  35. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100227_000129_en.txt +8 -0
  36. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100227_000129_zh.txt +8 -0
  37. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100227_000649_en.txt +13 -0
  38. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100227_000649_zh.txt +13 -0
  39. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100301_000549_en.txt +8 -0
  40. PgsFile/Corpora/Corpora/Parallel/Sports News_CE_2010/20100301_000549_zh.txt +8 -0
  41. PgsFile/Corpora/Corpora/Parallel/Xi's Speech_CE_2021/Speech at a Ceremony Marking the Centenary of the CPC.txt +144 -0
  42. PgsFile/PgsFile.py +516 -33
  43. PgsFile/__init__.py +13 -3
  44. PgsFile/models/NLPIR.user +0 -0
  45. PgsFile/models/fonts/DejaVuSans.ttf +0 -0
  46. PgsFile/models/fonts/书体坊赵九江钢笔行书体.ttf +0 -0
  47. PgsFile/models/fonts/全新硬笔楷书简.ttf +0 -0
  48. PgsFile/models/fonts/全新硬笔行书简.ttf +0 -0
  49. PgsFile/models/fonts/博洋行书3500.TTF +0 -0
  50. PgsFile/models/fonts/陆柬之行书字体.ttf +0 -0
  51. PgsFile/models/model_reviews2.2.bin +0 -0
  52. PgsFile/models/model_reviews_ReadMe.txt +134 -0
  53. PgsFile-0.2.5.dist-info/METADATA +41 -0
  54. {PgsFile-0.2.3.dist-info → PgsFile-0.2.5.dist-info}/RECORD +57 -7
  55. PgsFile-0.2.3.dist-info/METADATA +0 -79
  56. {PgsFile-0.2.3.dist-info → PgsFile-0.2.5.dist-info}/LICENSE +0 -0
  57. {PgsFile-0.2.3.dist-info → PgsFile-0.2.5.dist-info}/WHEEL +0 -0
  58. {PgsFile-0.2.3.dist-info → PgsFile-0.2.5.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py CHANGED
@@ -103,7 +103,7 @@ def get_data_text(path):
  else:
  return None
 
- def get_data_lines(path):
+ def get_data_lines(path, no_line_breaks=False):
  '''
  Parameters
  ----------
@@ -133,7 +133,10 @@ def get_data_lines(path):
  # Read the entire file using the detected encoding
  if encoding:
  with open(path, 'r', encoding=encoding, errors="ignore") as f:
- lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
+ if no_line_breaks is False:
+ lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
+ else:
+ lines = f.readlines()
  return lines
  else:
  return None
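
A minimal usage sketch of the new no_line_breaks flag (assuming get_data_lines is re-exported from the top-level PgsFile package as in earlier releases; the file path is hypothetical):

    from PgsFile import get_data_lines

    # Default behaviour: strip whitespace and drop empty lines.
    clean_lines = get_data_lines("notes.txt")

    # New in 0.2.5: pass no_line_breaks=True to get f.readlines() unmodified,
    # i.e. raw lines with their trailing newlines and blank lines kept.
    raw_lines = get_data_lines("notes.txt", no_line_breaks=True)
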
@@ -197,13 +200,51 @@ def get_data_excel(excel_path,column_id,sheet_name=None):
  inter=df.iloc[0:,column_id] # extract all rows of the given column
  return list(inter)
 
- def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
+ def write_to_excel(excel_path, data, sheet_name=None, index=None):
  '''
  Parameters
  ----------
  excel_path : TYPE
  DESCRIPTION. results.xlsx
 
+ data : TYPE, dict
+ DESCRIPTION. data = {'翻译': 24, '教学': 8, '数智': 6, '时代': 6, '财经': 6, '新': 4}
+
+ sheet_name : TYPE, optional
+ DESCRIPTION. The default is None.
+
+ index : TYPE, optional
+ DESCRIPTION. The default is None.
+
+ Returns
+ -------
+ None.
+
+ '''
+ import pandas as pd
+ if sheet_name is None:
+ sheet_name="sheet1"
+ else:
+ sheet_name=sheet_name
+ if index is None:
+ index=False
+ else:
+ index=True
+
+ col = list(data.keys())
+ freq = list(data.values())
+ dic_of_list={"items": col, "counts": freq}
+
+ df=pd.DataFrame(dic_of_list)
+ df.style.to_excel(excel_path, sheet_name=sheet_name,startcol=0, index=index)
+
+ def write_to_excel_normal(excel_path,dic_of_list,sheet_name=None,index=None):
+ '''
+ Parameters
+ ----------
+ excel_path : TYPE
+ DESCRIPTION. D:\results.xlsx
+
  dic_of_list : TYPE
  DESCRIPTION. {"col":["a","b","c","d"],"freq":[1,2,3,4]}
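
A short sketch of how the reworked write_to_excel and the new write_to_excel_normal might be called, based only on the docstrings above (file names are hypothetical, and both functions are assumed to be exported from the PgsFile package):

    from PgsFile import write_to_excel, write_to_excel_normal

    # New signature: a frequency dict is written as two columns, "items" and "counts".
    write_to_excel("results.xlsx", {"翻译": 24, "教学": 8, "数智": 6})

    # The previous dict-of-lists behaviour is kept under write_to_excel_normal.
    write_to_excel_normal("table.xlsx", {"col": ["a", "b", "c", "d"], "freq": [1, 2, 3, 4]})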
 
@@ -471,6 +512,18 @@ def get_directory_tree_with_meta(start_path, indent='', show_meta=False, max_dir
  print(f"{indent}└── ... (and {remaining_directories} more directories)")
  # current_level=-1 will show all folders' info.
 
+ def get_full_path(*path_components):
+ """
+ Combines multiple path components into a single, full path using os.path.join.
+
+ Args:
+ *path_components: Variable number of path components (strings).
+
+ Returns:
+ str: The combined full path.
+ """
+ return os.path.join(*path_components)
+
  def get_subfolder_path(parent_folder, subfolder_name):
  import os
  subfolder_name=subfolder_name.strip()
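
The added get_full_path helper is a thin wrapper over os.path.join; a one-line sketch (the path components below are hypothetical):

    from PgsFile import get_full_path

    font_path = get_full_path("PgsFile", "models", "fonts", "DejaVuSans.ttf")
    # equivalent to os.path.join("PgsFile", "models", "fonts", "DejaVuSans.ttf")
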
@@ -553,7 +606,6 @@ def batch_word_list(input_root):
  sorted_words=sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
  return sorted_words
 
-
  def clean_list(meta):
  """
  Parameters
@@ -576,7 +628,6 @@ def clean_list(meta):
 
  yhd=["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36','Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)','Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)','Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+','Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0','Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)','Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1','Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13','Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6','NOKIA5700/ UCWEB7.0.2.37/28/999','Openwave/ UCWEB7.0.2.37/28/999','Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10','Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11','Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11','UCWEB7.0.2.37/28/999']
 
-
  def source_path(relative_path):
  import sys,os
  if getattr(sys, 'frozen', False):
@@ -590,7 +641,6 @@ def next_folder_names(folder):
  folder_namelist=next(os.walk(folder))[1]
  return folder_namelist
 
-
  def remove_empty_txts(folder_path):
  import os
  files=FilePath(folder_path)
@@ -797,7 +847,6 @@ def cs1(text):
  sentences=sentences
  return sentences
 
-
  def word_tokenize(text, pos_tagged=False):
  '''
  Parameters
@@ -838,7 +887,7 @@ def word_tokenize(text, pos_tagged=False):
  else:
  try:
  if "Can not open" in str(err):
- user_folder=get_library_location("PgsFile")+"/PgsFile/Corpora/Stopwords"
+ user_folder=get_library_location("PgsFile")+"/PgsFile/models"
  destination_folder=get_library_location("nlpir-python")+"/nlpir/Data"
  source_file=find_user_files_in_upper_folder(user_folder, "NLPIR")[0]
  copy_file(source_file, destination_folder)
@@ -850,6 +899,296 @@ def word_tokenize(text, pos_tagged=False):
 
  return words
 
+ import re
+ from abc import ABC, abstractmethod
+ from typing import Iterator, List, Tuple
+ class TokenizerI(ABC):
+ """
+ A processing interface for tokenizing a string.
+ Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
+ """
+
+ @abstractmethod
+ def tokenize(self, s: str) -> List[str]:
+ """
+ Return a tokenized copy of *s*.
+
+ :rtype: List[str]
+ """
+ if overridden(self.tokenize_sents):
+ return self.tokenize_sents([s])[0]
+
+ def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
+ """
+ Identify the tokens using integer offsets ``(start_i, end_i)``,
+ where ``s[start_i:end_i]`` is the corresponding token.
+
+ :rtype: Iterator[Tuple[int, int]]
+ """
+ raise NotImplementedError()
+
+ def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
+ """
+ Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
+
+ return [self.tokenize(s) for s in strings]
+
+ :rtype: List[List[str]]
+ """
+ return [self.tokenize(s) for s in strings]
+
+ def span_tokenize_sents(
+ self, strings: List[str]
+ ) -> Iterator[List[Tuple[int, int]]]:
+ """
+ Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
+
+ return [self.span_tokenize(s) for s in strings]
+
+ :yield: List[Tuple[int, int]]
+ """
+ for s in strings:
+ yield list(self.span_tokenize(s))
+
+ class MacIntyreContractions:
+ """
+ List of contractions adapted from Robert MacIntyre's tokenizer.
+ """
+
+ CONTRACTIONS2 = [
+ r"(?i)\b(can)(?#X)(not)\b",
+ r"(?i)\b(d)(?#X)('ye)\b",
+ r"(?i)\b(gim)(?#X)(me)\b",
+ r"(?i)\b(gon)(?#X)(na)\b",
+ r"(?i)\b(got)(?#X)(ta)\b",
+ r"(?i)\b(lem)(?#X)(me)\b",
+ r"(?i)\b(more)(?#X)('n)\b",
+ r"(?i)\b(wan)(?#X)(na)(?=\s)",
+ ]
+ CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+ CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
+
+ class NLTKWordTokenizer(TokenizerI):
+ """
+ The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
+
+ This is the method that is invoked by ``word_tokenize()``. It assumes that the
+ text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
+
+ The tokenizer is "destructive" such that the regexes applied will munge the
+ input string to a state beyond re-construction. It is possible to apply
+ `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
+ `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
+ revert to the original string.
+ """
+
+ # Starting quotes.
+ STARTING_QUOTES = [
+ (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
+ (re.compile(r"^\""), r"``"),
+ (re.compile(r"(``)"), r" \1 "),
+ (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
+ (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
+ ]
+
+ # Ending quotes.
+ ENDING_QUOTES = [
+ (re.compile("([»”’])", re.U), r" \1 "),
+ (re.compile(r"''"), " '' "),
+ (re.compile(r'"'), " '' "),
+ (re.compile(r"\s+"), " "),
+ (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
+ (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
+ ]
+
+ # For improvements for starting/closing quotes from TreebankWordTokenizer,
+ # see discussion on https://github.com/nltk/nltk/pull/1437
+ # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
+ # - chevron quotes u'\xab' and u'\xbb'
+ # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+ # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
+ # Also, behavior of splitting on clitics now follows Stanford CoreNLP
+ # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
+
+ # Punctuation.
+ PUNCTUATION = [
+ (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
+ (re.compile(r"([:,])([^\d])"), r" \1 \2"),
+ (re.compile(r"([:,])$"), r" \1 "),
+ (
+ re.compile(r"\.{2,}", re.U),
+ r" \g<0> ",
+ ), # See https://github.com/nltk/nltk/pull/2322
+ (re.compile(r"[;@#$%&]"), r" \g<0> "),
+ (
+ re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
+ r"\1 \2\3 ",
+ ), # Handles the final period.
+ (re.compile(r"[?!]"), r" \g<0> "),
+ (re.compile(r"([^'])' "), r"\1 ' "),
+ (
+ re.compile(r"[*]", re.U),
+ r" \g<0> ",
+ ), # See https://github.com/nltk/nltk/pull/2322
+ ]
+
+ # Pads parentheses
+ PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
+
+ # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
+ CONVERT_PARENTHESES = [
+ (re.compile(r"\("), "-LRB-"),
+ (re.compile(r"\)"), "-RRB-"),
+ (re.compile(r"\["), "-LSB-"),
+ (re.compile(r"\]"), "-RSB-"),
+ (re.compile(r"\{"), "-LCB-"),
+ (re.compile(r"\}"), "-RCB-"),
+ ]
+
+ DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
+
+ # List of contractions adapted from Robert MacIntyre's tokenizer.
+ _contractions = MacIntyreContractions()
+ CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
+ CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
+
+ def tokenize(
+ self, text: str, convert_parentheses: bool = False, return_str: bool = False
+ ) -> List[str]:
+ r"""Return a tokenized copy of `text`.
+
+ >>> from nltk.tokenize import NLTKWordTokenizer
+ >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
+ >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
+ ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
+ 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
+ 'of', 'them.', 'Thanks', '.']
+ >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
+ ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
+ 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
+ 'of', 'them.', 'Thanks', '.']
+
+
+ :param text: A string with a sentence or sentences.
+ :type text: str
+ :param convert_parentheses: if True, replace parentheses to PTB symbols,
+ e.g. `(` to `-LRB-`. Defaults to False.
+ :type convert_parentheses: bool, optional
+ :param return_str: If True, return tokens as space-separated string,
+ defaults to False.
+ :type return_str: bool, optional
+ :return: List of tokens from `text`.
+ :rtype: List[str]
+ """
+ if return_str:
+ warnings.warn(
+ "Parameter 'return_str' has been deprecated and should no "
+ "longer be used.",
+ category=DeprecationWarning,
+ stacklevel=2,
+ )
+
+ for regexp, substitution in self.STARTING_QUOTES:
+ text = regexp.sub(substitution, text)
+
+ for regexp, substitution in self.PUNCTUATION:
+ text = regexp.sub(substitution, text)
+
+ # Handles parentheses.
+ regexp, substitution = self.PARENS_BRACKETS
+ text = regexp.sub(substitution, text)
+ # Optionally convert parentheses
+ if convert_parentheses:
+ for regexp, substitution in self.CONVERT_PARENTHESES:
+ text = regexp.sub(substitution, text)
+
+ # Handles double dash.
+ regexp, substitution = self.DOUBLE_DASHES
+ text = regexp.sub(substitution, text)
+
+ # add extra space to make things easier
+ text = " " + text + " "
+
+ for regexp, substitution in self.ENDING_QUOTES:
+ text = regexp.sub(substitution, text)
+
+ for regexp in self.CONTRACTIONS2:
+ text = regexp.sub(r" \1 \2 ", text)
+ for regexp in self.CONTRACTIONS3:
+ text = regexp.sub(r" \1 \2 ", text)
+
+ # We are not using CONTRACTIONS4 since
+ # they are also commented out in the SED scripts
+ # for regexp in self._contractions.CONTRACTIONS4:
+ # text = regexp.sub(r' \1 \2 \3 ', text)
+
+ return text.split()
+
+ def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
+ r"""
+ Returns the spans of the tokens in ``text``.
+ Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.
+
+ >>> from nltk.tokenize import NLTKWordTokenizer
+ >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
+ >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+ ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+ ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+ ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
+ >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
+ True
+ >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+ ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
+ ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
+ >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
+ True
+
+ :param text: A string with a sentence or sentences.
+ :type text: str
+ :yield: Tuple[int, int]
+ """
+ raw_tokens = self.tokenize(text)
+
+ # Convert converted quotes back to original double quotes
+ # Do this only if original text contains double quote(s) or double
+ # single-quotes (because '' might be transformed to `` if it is
+ # treated as starting quotes).
+ if ('"' in text) or ("''" in text):
+ # Find double quotes and converted quotes
+ matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
+
+ # Replace converted quotes back to double quotes
+ tokens = [
+ matched.pop(0) if tok in ['"', "``", "''"] else tok
+ for tok in raw_tokens
+ ]
+ else:
+ tokens = raw_tokens
+
+ yield from align_tokens(tokens, text)
+
+ # Standard word tokenizer.
+ _treebank_word_tokenizer = NLTKWordTokenizer()
+ def word_tokenize2(text, preserve_line=False):
+ """
+ Return a tokenized copy of *text*,
+ using NLTK's recommended word tokenizer
+ (currently an improved :class:`.TreebankWordTokenizer`
+ along with :class:`.PunktSentenceTokenizer`
+ for the specified language).
+
+ :param text: text to split into words
+ :type text: str
+ :param language: the model name in the Punkt corpus
+ :type language: str
+ :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
+ :type preserve_line: bool
+ """
+ sentences = [text] if preserve_line else sent_tokenize(text)
+ return [
+ token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
+ ]
+
  def pad_sequence(
  sequence,
  n,
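
A hedged sketch of the tokenizer entry point vendored in this hunk; word_tokenize2 relies on sent_tokenize and align_tokens being available elsewhere in the module at runtime (they are referenced above but not defined in this hunk):

    text = "Good muffins cost $3.88 in New York. Please buy me two of them."

    tokens = word_tokenize2(text)                        # sentence-split first, then tokenize
    one_line = word_tokenize2(text, preserve_line=True)  # skip sentence splitting

    # The vendored class can also be used directly on a single sentence.
    print(NLTKWordTokenizer().tokenize("They'll say it's a (small) test."))
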
@@ -889,9 +1228,7 @@ def pad_sequence(
  sequence=chain(sequence, (right_pad_symbol,) * (n - 1))
  return sequence
 
-
  # add a flag to pad the sequence so we get peripheral ngrams?
-
  def ngrams(
  sequence,
  n,
@@ -954,7 +1291,6 @@ def ngrams(
  yield tuple(history)
  del history[0]
 
-
  def bigrams(sequence, **kwargs):
  """
  Return the bigrams generated from a sequence of items, as an iterator.
@@ -974,7 +1310,6 @@ def bigrams(sequence, **kwargs):
  for item in ngrams(sequence, 2, **kwargs):
  yield item
 
-
  def trigrams(sequence, **kwargs):
  """
  Return the trigrams generated from a sequence of items, as an iterator.
@@ -994,7 +1329,6 @@ def trigrams(sequence, **kwargs):
  for item in ngrams(sequence, 3, **kwargs):
  yield item
 
-
  def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
  """
  Returns all possible ngrams generated from a sequence of items, as an iterator.
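
For context, the n-gram helpers touched in this part of the module yield tuples lazily; with the NLTK-style defaults (no padding):

    words = ["this", "is", "a", "test"]
    list(bigrams(words))    # [('this', 'is'), ('is', 'a'), ('a', 'test')]
    list(trigrams(words))   # [('this', 'is', 'a'), ('is', 'a', 'test')]
    list(ngrams(words, 4))  # [('this', 'is', 'a', 'test')]
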
@@ -1148,6 +1482,18 @@ def uninstall_package(package_name: str):
  import pip
  pip.main(['uninstall', package_name, '-y'])
 
+ # A list of conda configuration commands.
+ conda_mirror_commands=[
+ "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple", # Windows recommended
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/", # MacOS recommended
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/",
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/fastai/",
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/",
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/",
+ "pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/"
+ ]
+
  def DirList(root_dir: str) -> tuple:
  """
  List the contents of a directory and return two lists containing the names of the directories and files in the directory.
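
The new conda_mirror_commands list is plain data; one hedged way to apply it is to feed each entry to the module's own run_command helper (doing so rewrites global pip/conda configuration, so this is illustrative only and assumes both names are exported from PgsFile):

    from PgsFile import conda_mirror_commands, run_command

    for cmd in conda_mirror_commands:
        print(run_command(cmd))  # each entry is an ordinary pip/conda CLI string
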
@@ -1251,10 +1597,8 @@ def sort_strings_with_embedded_numbers(strings: list) -> list:
  """
  # Sort the strings using the extract_numbers() function as the key
  sorted_strings=sorted(strings, key=extract_numbers)
-
  return sorted_strings
 
-
  def run_command(command: str) -> str:
  """
  Run a command and return its output as a string.
@@ -1444,7 +1788,6 @@ class PGScraper(object):
  return all_want_list
 
 
-
 
  # -*- coding: utf-8 -*-
  """
@@ -1490,7 +1833,6 @@ class ProgressData(object):
  self.size, self.unit, progress, speed, self.unit))
  print('%50s'%('/'*int((1-progress)*50)))
 
-
  def levenshtein_distance(s, t):
  m, n=len(s), len(t)
  if m < n:
@@ -1516,7 +1858,7 @@ pgs_abbres_words=['A.B.','A.D.','A.G.','A.I.','A.M.','A.P.','A.V.','AFP.','Ala.'
  def clean_text(text): # clean up punctuation problems other than the full stop
  # add a space before a word that immediately follows a punctuation mark
  import re
- text=replace_chinese_punctuation_with_english(text)
+ # text=replace_chinese_punctuation_with_english(text)
  text=re.sub(r'(?<=[\?\!\,\;\:\)\]\}])\s*(?=\w)', ' ', text)
  # remove the space between a punctuation mark and the word to its left
  text=re.sub(r'\s*([\?\!\,\;\:\)\]\}\>])', r'\1', text)
@@ -1532,24 +1874,45 @@ def clean_text(text): # clean up punctuation problems other than the full stop
 
  def clean_text_with_abbreviations(text):
  import re
- text=clean_text(text)
- matches=[]
- for seg in text.split():
+
+ # split the text into lines
+ lines = text.splitlines()
+
+ # clean each line
+ cleaned_lines = []
+ for line in lines:
+ cleaned_line = clean_line_with_abbreviations(line)
+ cleaned_lines.append(cleaned_line)
+
+ # re-join the cleaned lines into a single text
+ cleaned_text = '\n'.join(cleaned_lines)
+ return cleaned_text
+
+ def clean_line_with_abbreviations(line):
+ import re
+
+ # clean up punctuation problems other than the full stop
+ line = clean_text(line)
+
+ matches = []
+ for seg in line.split():
  if "." in seg:
- if seg.endswith(".") is False:
+ if not seg.endswith("."):
  matches.append(seg)
  elif seg.endswith("..") and "..." not in seg:
- text=text.replace("..", ".")
-
+ line = line.replace("..", ".")
+
  for match in matches:
  if any(word in match for word in pgs_abbres_words):
- inter=match.split(".")
- new_match="".join([w+"." for w in inter[0:-1]])+" "+inter[-1]
- text=text.replace(match, new_match)
+ inter = match.split(".")
+ new_match = "".join([w + "." for w in inter[0:-1]]) + " " + inter[-1]
+ line = line.replace(match, new_match)
  else:
- text=text.replace(match, match.replace(".",". "))
- text=re.sub(r'\s+\.', '.', text)
- return text
+ line = line.replace(match, match.replace(".", ". "))
+
+ line = re.sub(r'\s+\.', '.', line)
+ return line
+
 
  import shutil
  def move_file(source_file, destination_folder, new_file_name=None):
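
A small sketch of the reworked cleaning path: clean_text_with_abbreviations now processes the text line by line through the new clean_line_with_abbreviations helper, so line breaks survive cleaning (the sample text is hypothetical, and the function is assumed to be exported from PgsFile):

    from PgsFile import clean_text_with_abbreviations

    raw = "He cited the A.B.report ,and left.\nThe second line stays on its own line."
    print(clean_text_with_abbreviations(raw))
    # Each line is cleaned independently and the original line breaks are preserved.
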
@@ -1597,7 +1960,6 @@ def copy_file(source_file, destination_folder, new_file_name=None):
 
  # Copy the file to the destination folder
  shutil.copy2(source_file, destination_file)
-
 
  def check_empty_cells(file_path):
  """
@@ -1636,7 +1998,6 @@ def makefile(file_path):
  else:
  write_to_txt(file_path, "")
 
-
  def save_dict_to_excel(data, output_file, headers=None):
  """
  Save Python dictionary data into an Excel .xlsx file with custom headers.
@@ -1845,4 +2206,126 @@ def get_stopwords(language=None):
  return en_stopwords
  else:
  lang_stopwords=get_data_lines(find_txt_files_with_keyword(stopwords_path, language)[0])
- return lang_stopwords
+ return lang_stopwords
+
+ from PIL import Image
+ def replace_white_with_transparency(input_path, output_path):
+ """
+ This function opens an image, replaces all white pixels with transparent pixels.
+
+ Parameters:
+ input_path (str): The path to the input image file.
+ output_path (str): The path to save the output image file.
+ """
+ # convert from RGB (24-bit) mode to RGBA (32-bit) mode
+ img = Image.open(input_path).convert('RGBA')
+ W, L = img.size
+ white_pixel = (0, 0, 0, 0) # white
+ for h in range(W):
+ for i in range(L):
+ if img.getpixel((h, i)) == white_pixel:
+ img.putpixel((h, i), (255, 255, 255, 0)) # make it transparent
+ img.save(output_path)
+
+ def get_font_path(font_name=None):
+ '''
+ Retrieves the file path of a specified font.
+
+ Parameters
+ ----------
+ font_name : str, optional
+ The name of the font file (must end with ".ttf"). If provided, it should match one of the available fonts in the library, such as:
+ - 'DejaVuSans.ttf'
+ - '书体坊赵九江钢笔行书体.ttf'
+ - '全新硬笔楷书简.ttf'
+ - '全新硬笔行书简.ttf'
+ - '博洋行书3500.TTF'
+ - '陆柬之行书字体.ttf'
+ The default is None, which will return the path for 'DejaVuSans.ttf'.
+
+ Returns
+ -------
+ font_path : str
+ The full file path of the specified font. If no font name is provided, the default path for 'DejaVuSans.ttf' will be returned.
+ Example: "C:/Windows/Fonts/simhei.ttf"
+ '''
+
+ font_folder = get_library_location("PgsFile") + "/PgsFile/models/fonts"
+ if font_name is None:
+ font_path = get_full_path(font_folder, "DejaVuSans.ttf")
+ else:
+ font_path = get_full_path(font_folder, font_name)
+ return font_path
+
+ simhei_default_font_path_MacOS_Windows=["/System/Library/Fonts/STHeiti Medium.ttc",
+ r"C:\Windows\Fonts\simhei.ttf", # Use a font that supports Chinese characters
+ ]
+
+
+ def get_env_variable(variable_name):
+ # Get the value of the specified environment variable
+ value = os.getenv(variable_name)
+
+ # Check if the environment variable is set
+ if value is not None:
+ print(f"{variable_name} is set to: {value}")
+ else:
+ print(f"{variable_name} is not set.")
+
+ import subprocess
+ def set_permanent_environment_variable(variable_name, variable_value, system_wide=False):
+ """
+ Sets a permanent environment variable on Windows using the `setx` command.
+
+ Args:
+ variable_name (str): The name of the environment variable.
+ variable_value (str): The value to set for the environment variable.
+ system_wide (bool): If True, sets the variable system-wide (requires admin privileges).
+ If False, sets the variable for the current user only.
+ """
+ try:
+ # Construct the setx command
+ command = ['setx', variable_name, variable_value]
+ if system_wide:
+ command.append('/M') # Add /M flag for system-wide variables
+
+ # Run the command
+ subprocess.run(command, shell=True, check=True)
+
+ print(f'Permanent environment variable {variable_name} set to {variable_value} '
+ f'({"system-wide" if system_wide else "user-level"}).')
+ except subprocess.CalledProcessError as e:
+ print(f'Failed to set environment variable: {e}')
+ except Exception as e:
+ print(f'An error occurred: {e}')
+
+ def delete_permanent_environment_variable(variable_name, system_wide=False):
+ """
+ Deletes a permanent environment variable on Windows using the `reg` command.
+
+ Args:
+ variable_name (str): The name of the environment variable to delete.
+ system_wide (bool): If True, deletes the variable system-wide (requires admin privileges).
+ If False, deletes the variable for the current user only.
+ """
+ try:
+ # Determine the registry key based on the scope
+ if system_wide:
+ reg_key = r'HKLM\SYSTEM\CurrentControlSet\Control\Session Manager\Environment'
+ else:
+ reg_key = r'HKCU\Environment'
+
+ # Run the `reg delete` command to remove the variable
+ subprocess.run(
+ ['reg', 'delete', reg_key, '/v', variable_name, '/f'],
+ shell=True,
+ check=True
+ )
+
+ print(f'Permanent environment variable {variable_name} deleted '
+ f'({"system-wide" if system_wide else "user-level"}).')
+ except subprocess.CalledProcessError as e:
+ print(f'Failed to delete environment variable: {e}')
+ except Exception as e:
+ print(f'An error occurred: {e}')
+
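
A closing sketch of the remaining helpers added in this hunk: font lookup plus the Windows-only environment-variable utilities (the variable name and value below are hypothetical; setx and reg only exist on Windows, and all names are assumed to be exported from PgsFile):

    from PgsFile import (get_font_path, get_env_variable,
                         set_permanent_environment_variable,
                         delete_permanent_environment_variable)

    print(get_font_path())                    # bundled DejaVuSans.ttf
    print(get_font_path("博洋行书3500.TTF"))   # one of the bundled Chinese handwriting fonts

    get_env_variable("PGS_DATA_HOME")                                 # report current value
    set_permanent_environment_variable("PGS_DATA_HOME", r"D:\Data")   # user-level, via setx
    delete_permanent_environment_variable("PGS_DATA_HOME")            # user-level, via reg delete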