PgsFile-0.2.2-py3-none-any.whl → PgsFile-0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of PgsFile might be problematic.

Files changed (18)
  1. PgsFile/Corpora/Stopwords/NLPIR.user +0 -0
  2. PgsFile/PgsFile.py +465 -34
  3. PgsFile/__init__.py +10 -4
  4. PgsFile/models/NLPIR.user +0 -0
  5. PgsFile/models/fonts/DejaVuSans.ttf +0 -0
  6. PgsFile/models/fonts/书体坊赵九江钢笔行书体.ttf +0 -0
  7. PgsFile/models/fonts/全新硬笔楷书简.ttf +0 -0
  8. PgsFile/models/fonts/全新硬笔行书简.ttf +0 -0
  9. PgsFile/models/fonts/博洋行书3500.TTF +0 -0
  10. PgsFile/models/fonts/陆柬之行书字体.ttf +0 -0
  11. PgsFile/models/model_reviews2.2.bin +0 -0
  12. PgsFile/models/model_reviews_ReadMe.txt +134 -0
  13. PgsFile-0.2.4.dist-info/METADATA +41 -0
  14. {PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/RECORD +17 -7
  15. PgsFile-0.2.2.dist-info/METADATA +0 -79
  16. {PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/LICENSE +0 -0
  17. {PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/WHEEL +0 -0
  18. {PgsFile-0.2.2.dist-info → PgsFile-0.2.4.dist-info}/top_level.txt +0 -0
Binary file
PgsFile/PgsFile.py CHANGED
@@ -103,7 +103,7 @@ def get_data_text(path):
103
103
  else:
104
104
  return None
105
105
 
106
- def get_data_lines(path):
106
+ def get_data_lines(path, no_line_breaks=False):
107
107
  '''
108
108
  Parameters
109
109
  ----------
@@ -133,7 +133,10 @@ def get_data_lines(path):
133
133
  # Read the entire file using the detected encoding
134
134
  if encoding:
135
135
  with open(path, 'r', encoding=encoding, errors="ignore") as f:
136
- lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
136
+ if no_line_breaks is False:
137
+ lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
138
+ else:
139
+ lines = f.readlines()
137
140
  return lines
138
141
  else:
139
142
  return None
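
A minimal usage sketch for the new parameter (sample.txt is a hypothetical UTF-8 text file): with the default no_line_breaks=False the function returns stripped, non-empty lines, while no_line_breaks=True returns the raw lines as read, line breaks included.

from PgsFile import get_data_lines

# Default behaviour: whitespace is stripped and blank lines are dropped.
clean_lines = get_data_lines("sample.txt")

# New in 0.2.4: keep the raw lines, including trailing line breaks and blank lines.
raw_lines = get_data_lines("sample.txt", no_line_breaks=True)

print(len(clean_lines), len(raw_lines))
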
@@ -197,15 +200,15 @@ def get_data_excel(excel_path,column_id,sheet_name=None):
197
200
  inter=df.iloc[0:,column_id] #提取第二列所有行
198
201
  return list(inter)
199
202
 
200
- def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
203
+ def write_to_excel(excel_path, data, sheet_name=None, index=None):
201
204
  '''
202
205
  Parameters
203
206
  ----------
204
207
  excel_path : TYPE
205
208
  DESCRIPTION. results.xlsx
206
209
 
207
- dic_of_list : TYPE
208
- DESCRIPTION. {"col":["a","b","c","d"],"freq":[1,2,3,4]}
210
+ data : TYPE, dict
211
+ DESCRIPTION. data = {'翻译': 24, '教学': 8, '数智': 6, '时代': 6, '财经': 6, '新': 4}
209
212
 
210
213
  sheet_name : TYPE, optional
211
214
  DESCRIPTION. The default is None.
@@ -227,6 +230,10 @@ def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
227
230
  index=False
228
231
  else:
229
232
  index=True
233
+
234
+ col = list(data.keys())
235
+ freq = list(data.values())
236
+ dic_of_list={"items": col, "counts": freq}
230
237
 
231
238
  df=pd.DataFrame(dic_of_list)
232
239
  df.style.to_excel(excel_path, sheet_name=sheet_name,startcol=0, index=index)
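
Per the updated docstring, write_to_excel now accepts a flat frequency dictionary and converts it internally to an "items"/"counts" two-column layout. A hedged sketch (results.xlsx and the sheet name are illustrative):

from PgsFile import write_to_excel

freq = {'翻译': 24, '教学': 8, '数智': 6, '时代': 6, '财经': 6, '新': 4}

# Keys become the "items" column, values the "counts" column.
write_to_excel("results.xlsx", freq, sheet_name="freq")
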
@@ -471,6 +478,18 @@ def get_directory_tree_with_meta(start_path, indent='', show_meta=False, max_dir
471
478
  print(f"{indent}└── ... (and {remaining_directories} more directories)")
472
479
  # current_level=-1 will show all folders' info.
473
480
 
481
+ def get_full_path(*path_components):
482
+ """
483
+ Combines multiple path components into a single, full path using os.path.join.
484
+
485
+ Args:
486
+ *path_components: Variable number of path components (strings).
487
+
488
+ Returns:
489
+ str: The combined full path.
490
+ """
491
+ return os.path.join(*path_components)
492
+
474
493
  def get_subfolder_path(parent_folder, subfolder_name):
475
494
  import os
476
495
  subfolder_name=subfolder_name.strip()
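
get_full_path is a thin convenience wrapper around os.path.join, so any number of components can be combined in one call (the components below are illustrative):

from PgsFile import get_full_path

# Joins the components with the platform's separator, exactly like os.path.join.
path = get_full_path("data", "corpus", "sample.txt")
print(path)  # e.g. data/corpus/sample.txt on POSIX, data\corpus\sample.txt on Windows
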
@@ -553,7 +572,6 @@ def batch_word_list(input_root):
553
572
  sorted_words=sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
554
573
  return sorted_words
555
574
 
556
-
557
575
  def clean_list(meta):
558
576
  """
559
577
  Parameters
@@ -576,7 +594,6 @@ def clean_list(meta):
576
594
 
577
595
  yhd=["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36','Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)','Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)','Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+','Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0','Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)','Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1','Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13','Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6','NOKIA5700/ UCWEB7.0.2.37/28/999','Openwave/ UCWEB7.0.2.37/28/999','Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10','Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11','Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11','UCWEB7.0.2.37/28/999']
578
596
 
579
-
580
597
  def source_path(relative_path):
581
598
  import sys,os
582
599
  if getattr(sys, 'frozen', False):
@@ -590,7 +607,6 @@ def next_folder_names(folder):
590
607
  folder_namelist=next(os.walk(folder))[1]
591
608
  return folder_namelist
592
609
 
593
-
594
610
  def remove_empty_txts(folder_path):
595
611
  import os
596
612
  files=FilePath(folder_path)
@@ -700,6 +716,21 @@ def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
700
716
  matches.append(os.path.join(root, filename))
701
717
  return matches
702
718
 
719
+ import fnmatch
720
+ def find_user_files_in_upper_folder(directory, user_file_name):
721
+ # Get the direct upper folder path
722
+ upper_folder = os.path.dirname(os.path.abspath(directory))
723
+
724
+ # List to store matching file paths
725
+ matching_files = []
726
+
727
+ # Walk through the upper folder
728
+ for root, dirs, files in os.walk(upper_folder):
729
+ for filename in fnmatch.filter(files, f'{user_file_name}.user'):
730
+ matching_files.append(os.path.join(root, filename))
731
+
732
+ return matching_files
733
+
703
734
  # Standard sentence tokenizer.
704
735
  def sent_tokenize(text, lang=None):
705
736
  import pysbd
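
find_user_files_in_upper_folder walks the parent of the given directory and collects every file matching <user_file_name>.user; it underpins the NLPIR licence repair further down. A sketch, assuming the NLPIR.user file bundled under PgsFile/models (the function is not re-exported in __init__.py, hence the full module path):

from PgsFile import get_library_location
from PgsFile.PgsFile import find_user_files_in_upper_folder

# Search the folder above PgsFile/models for any NLPIR.user file.
models_dir = get_library_location("PgsFile") + "/PgsFile/models"
print(find_user_files_in_upper_folder(models_dir, "NLPIR"))
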
@@ -816,12 +847,314 @@ def word_tokenize(text, pos_tagged=False):
816
847
  try:
817
848
  from nlpir import tools
818
849
  tools.update_license()
850
+ print("\n\nThe user file is ready. Please restart your kernel and run the Python script!")
819
851
  except Exception as err2:
820
- print("You need a VPN to try this service!", err2)
852
+ print("\n*****SOLUTION WARNING! \nYOU MAY NEED A VPN TO TRY THIS SERVICE!*****\n\n", err2)
821
853
  else:
822
- print(err)
854
+ try:
855
+ if "Can not open" in str(err):
856
+ user_folder=get_library_location("PgsFile")+"/PgsFile/models"
857
+ destination_folder=get_library_location("nlpir-python")+"/nlpir/Data"
858
+ source_file=find_user_files_in_upper_folder(user_folder, "NLPIR")[0]
859
+ copy_file(source_file, destination_folder)
860
+ print("The user file is ready. Please restart your kernel and run the Python script!")
861
+ else:
862
+ print(err)
863
+ except Exception as rer:
864
+ print(rer)
865
+
823
866
  return words
824
867
 
868
+ import re
869
+ from abc import ABC, abstractmethod
870
+ from typing import Iterator, List, Tuple
871
+ class TokenizerI(ABC):
872
+ """
873
+ A processing interface for tokenizing a string.
874
+ Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
875
+ """
876
+
877
+ @abstractmethod
878
+ def tokenize(self, s: str) -> List[str]:
879
+ """
880
+ Return a tokenized copy of *s*.
881
+
882
+ :rtype: List[str]
883
+ """
884
+ if overridden(self.tokenize_sents):
885
+ return self.tokenize_sents([s])[0]
886
+
887
+ def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
888
+ """
889
+ Identify the tokens using integer offsets ``(start_i, end_i)``,
890
+ where ``s[start_i:end_i]`` is the corresponding token.
891
+
892
+ :rtype: Iterator[Tuple[int, int]]
893
+ """
894
+ raise NotImplementedError()
895
+
896
+ def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
897
+ """
898
+ Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
899
+
900
+ return [self.tokenize(s) for s in strings]
901
+
902
+ :rtype: List[List[str]]
903
+ """
904
+ return [self.tokenize(s) for s in strings]
905
+
906
+ def span_tokenize_sents(
907
+ self, strings: List[str]
908
+ ) -> Iterator[List[Tuple[int, int]]]:
909
+ """
910
+ Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
911
+
912
+ return [self.span_tokenize(s) for s in strings]
913
+
914
+ :yield: List[Tuple[int, int]]
915
+ """
916
+ for s in strings:
917
+ yield list(self.span_tokenize(s))
918
+
919
+ class MacIntyreContractions:
920
+ """
921
+ List of contractions adapted from Robert MacIntyre's tokenizer.
922
+ """
923
+
924
+ CONTRACTIONS2 = [
925
+ r"(?i)\b(can)(?#X)(not)\b",
926
+ r"(?i)\b(d)(?#X)('ye)\b",
927
+ r"(?i)\b(gim)(?#X)(me)\b",
928
+ r"(?i)\b(gon)(?#X)(na)\b",
929
+ r"(?i)\b(got)(?#X)(ta)\b",
930
+ r"(?i)\b(lem)(?#X)(me)\b",
931
+ r"(?i)\b(more)(?#X)('n)\b",
932
+ r"(?i)\b(wan)(?#X)(na)(?=\s)",
933
+ ]
934
+ CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
935
+ CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
936
+
937
+ class NLTKWordTokenizer(TokenizerI):
938
+ """
939
+ The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
940
+
941
+ This is the method that is invoked by ``word_tokenize()``. It assumes that the
942
+ text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
943
+
944
+ The tokenizer is "destructive" such that the regexes applied will munge the
945
+ input string to a state beyond re-construction. It is possible to apply
946
+ `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
947
+ `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
948
+ revert to the original string.
949
+ """
950
+
951
+ # Starting quotes.
952
+ STARTING_QUOTES = [
953
+ (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
954
+ (re.compile(r"^\""), r"``"),
955
+ (re.compile(r"(``)"), r" \1 "),
956
+ (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
957
+ (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
958
+ ]
959
+
960
+ # Ending quotes.
961
+ ENDING_QUOTES = [
962
+ (re.compile("([»”’])", re.U), r" \1 "),
963
+ (re.compile(r"''"), " '' "),
964
+ (re.compile(r'"'), " '' "),
965
+ (re.compile(r"\s+"), " "),
966
+ (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
967
+ (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
968
+ ]
969
+
970
+ # For improvements for starting/closing quotes from TreebankWordTokenizer,
971
+ # see discussion on https://github.com/nltk/nltk/pull/1437
972
+ # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
973
+ # - chevron quotes u'\xab' and u'\xbb'
974
+ # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
975
+ # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
976
+ # Also, behavior of splitting on clitics now follows Stanford CoreNLP
977
+ # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
978
+
979
+ # Punctuation.
980
+ PUNCTUATION = [
981
+ (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
982
+ (re.compile(r"([:,])([^\d])"), r" \1 \2"),
983
+ (re.compile(r"([:,])$"), r" \1 "),
984
+ (
985
+ re.compile(r"\.{2,}", re.U),
986
+ r" \g<0> ",
987
+ ), # See https://github.com/nltk/nltk/pull/2322
988
+ (re.compile(r"[;@#$%&]"), r" \g<0> "),
989
+ (
990
+ re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
991
+ r"\1 \2\3 ",
992
+ ), # Handles the final period.
993
+ (re.compile(r"[?!]"), r" \g<0> "),
994
+ (re.compile(r"([^'])' "), r"\1 ' "),
995
+ (
996
+ re.compile(r"[*]", re.U),
997
+ r" \g<0> ",
998
+ ), # See https://github.com/nltk/nltk/pull/2322
999
+ ]
1000
+
1001
+ # Pads parentheses
1002
+ PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
1003
+
1004
+ # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
1005
+ CONVERT_PARENTHESES = [
1006
+ (re.compile(r"\("), "-LRB-"),
1007
+ (re.compile(r"\)"), "-RRB-"),
1008
+ (re.compile(r"\["), "-LSB-"),
1009
+ (re.compile(r"\]"), "-RSB-"),
1010
+ (re.compile(r"\{"), "-LCB-"),
1011
+ (re.compile(r"\}"), "-RCB-"),
1012
+ ]
1013
+
1014
+ DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
1015
+
1016
+ # List of contractions adapted from Robert MacIntyre's tokenizer.
1017
+ _contractions = MacIntyreContractions()
1018
+ CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
1019
+ CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
1020
+
1021
+ def tokenize(
1022
+ self, text: str, convert_parentheses: bool = False, return_str: bool = False
1023
+ ) -> List[str]:
1024
+ r"""Return a tokenized copy of `text`.
1025
+
1026
+ >>> from nltk.tokenize import NLTKWordTokenizer
1027
+ >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
1028
+ >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
1029
+ ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
1030
+ 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
1031
+ 'of', 'them.', 'Thanks', '.']
1032
+ >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
1033
+ ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
1034
+ 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
1035
+ 'of', 'them.', 'Thanks', '.']
1036
+
1037
+
1038
+ :param text: A string with a sentence or sentences.
1039
+ :type text: str
1040
+ :param convert_parentheses: if True, replace parentheses to PTB symbols,
1041
+ e.g. `(` to `-LRB-`. Defaults to False.
1042
+ :type convert_parentheses: bool, optional
1043
+ :param return_str: If True, return tokens as space-separated string,
1044
+ defaults to False.
1045
+ :type return_str: bool, optional
1046
+ :return: List of tokens from `text`.
1047
+ :rtype: List[str]
1048
+ """
1049
+ if return_str:
1050
+ warnings.warn(
1051
+ "Parameter 'return_str' has been deprecated and should no "
1052
+ "longer be used.",
1053
+ category=DeprecationWarning,
1054
+ stacklevel=2,
1055
+ )
1056
+
1057
+ for regexp, substitution in self.STARTING_QUOTES:
1058
+ text = regexp.sub(substitution, text)
1059
+
1060
+ for regexp, substitution in self.PUNCTUATION:
1061
+ text = regexp.sub(substitution, text)
1062
+
1063
+ # Handles parentheses.
1064
+ regexp, substitution = self.PARENS_BRACKETS
1065
+ text = regexp.sub(substitution, text)
1066
+ # Optionally convert parentheses
1067
+ if convert_parentheses:
1068
+ for regexp, substitution in self.CONVERT_PARENTHESES:
1069
+ text = regexp.sub(substitution, text)
1070
+
1071
+ # Handles double dash.
1072
+ regexp, substitution = self.DOUBLE_DASHES
1073
+ text = regexp.sub(substitution, text)
1074
+
1075
+ # add extra space to make things easier
1076
+ text = " " + text + " "
1077
+
1078
+ for regexp, substitution in self.ENDING_QUOTES:
1079
+ text = regexp.sub(substitution, text)
1080
+
1081
+ for regexp in self.CONTRACTIONS2:
1082
+ text = regexp.sub(r" \1 \2 ", text)
1083
+ for regexp in self.CONTRACTIONS3:
1084
+ text = regexp.sub(r" \1 \2 ", text)
1085
+
1086
+ # We are not using CONTRACTIONS4 since
1087
+ # they are also commented out in the SED scripts
1088
+ # for regexp in self._contractions.CONTRACTIONS4:
1089
+ # text = regexp.sub(r' \1 \2 \3 ', text)
1090
+
1091
+ return text.split()
1092
+
1093
+ def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
1094
+ r"""
1095
+ Returns the spans of the tokens in ``text``.
1096
+ Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.
1097
+
1098
+ >>> from nltk.tokenize import NLTKWordTokenizer
1099
+ >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
1100
+ >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
1101
+ ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
1102
+ ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
1103
+ ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
1104
+ >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
1105
+ True
1106
+ >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
1107
+ ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
1108
+ ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
1109
+ >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
1110
+ True
1111
+
1112
+ :param text: A string with a sentence or sentences.
1113
+ :type text: str
1114
+ :yield: Tuple[int, int]
1115
+ """
1116
+ raw_tokens = self.tokenize(text)
1117
+
1118
+ # Convert converted quotes back to original double quotes
1119
+ # Do this only if original text contains double quote(s) or double
1120
+ # single-quotes (because '' might be transformed to `` if it is
1121
+ # treated as starting quotes).
1122
+ if ('"' in text) or ("''" in text):
1123
+ # Find double quotes and converted quotes
1124
+ matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
1125
+
1126
+ # Replace converted quotes back to double quotes
1127
+ tokens = [
1128
+ matched.pop(0) if tok in ['"', "``", "''"] else tok
1129
+ for tok in raw_tokens
1130
+ ]
1131
+ else:
1132
+ tokens = raw_tokens
1133
+
1134
+ yield from align_tokens(tokens, text)
1135
+
1136
+ # Standard word tokenizer.
1137
+ _treebank_word_tokenizer = NLTKWordTokenizer()
1138
+ def word_tokenize2(text, preserve_line=False):
1139
+ """
1140
+ Return a tokenized copy of *text*,
1141
+ using NLTK's recommended word tokenizer
1142
+ (currently an improved :class:`.TreebankWordTokenizer`
1143
+ along with :class:`.PunktSentenceTokenizer`
1144
+ for the specified language).
1145
+
1146
+ :param text: text to split into words
1147
+ :type text: str
1148
+ :param language: the model name in the Punkt corpus
1149
+ :type language: str
1150
+ :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
1151
+ :type preserve_line: bool
1152
+ """
1153
+ sentences = [text] if preserve_line else sent_tokenize(text)
1154
+ return [
1155
+ token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
1156
+ ]
1157
+
825
1158
  def pad_sequence(
826
1159
  sequence,
827
1160
  n,
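
The vendored NLTKWordTokenizer backs the new word_tokenize2 helper: the text is first split with sent_tokenize (unless preserve_line=True) and each sentence is then tokenized. A small sketch; the exact token list depends on the sentence splitter:

from PgsFile import word_tokenize2

text = "Good muffins cost $3.88 in New York. Please buy me two of them."
print(word_tokenize2(text))
# roughly: ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', ...]

# Treat the whole string as one line and skip sentence splitting.
print(word_tokenize2(text, preserve_line=True))
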
@@ -861,9 +1194,7 @@ def pad_sequence(
861
1194
  sequence=chain(sequence, (right_pad_symbol,) * (n - 1))
862
1195
  return sequence
863
1196
 
864
-
865
1197
  # add a flag to pad the sequence so we get peripheral ngrams?
866
-
867
1198
  def ngrams(
868
1199
  sequence,
869
1200
  n,
@@ -926,7 +1257,6 @@ def ngrams(
926
1257
  yield tuple(history)
927
1258
  del history[0]
928
1259
 
929
-
930
1260
  def bigrams(sequence, **kwargs):
931
1261
  """
932
1262
  Return the bigrams generated from a sequence of items, as an iterator.
@@ -946,7 +1276,6 @@ def bigrams(sequence, **kwargs):
946
1276
  for item in ngrams(sequence, 2, **kwargs):
947
1277
  yield item
948
1278
 
949
-
950
1279
  def trigrams(sequence, **kwargs):
951
1280
  """
952
1281
  Return the trigrams generated from a sequence of items, as an iterator.
@@ -966,7 +1295,6 @@ def trigrams(sequence, **kwargs):
966
1295
  for item in ngrams(sequence, 3, **kwargs):
967
1296
  yield item
968
1297
 
969
-
970
1298
  def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
971
1299
  """
972
1300
  Returns all possible ngrams generated from a sequence of items, as an iterator.
@@ -1120,6 +1448,18 @@ def uninstall_package(package_name: str):
1120
1448
  import pip
1121
1449
  pip.main(['uninstall', package_name, '-y'])
1122
1450
 
1451
+ # A list of conda configuration commands.
1452
+ conda_mirror_commands=[
1453
+ "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple", # Windows recommended
1454
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/", # MacOS recommended
1455
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
1456
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/",
1457
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/fastai/",
1458
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/",
1459
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/",
1460
+ "pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/"
1461
+ ]
1462
+
1123
1463
  def DirList(root_dir: str) -> tuple:
1124
1464
  """
1125
1465
  List the contents of a directory and return two lists containing the names of the directories and files in the directory.
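
conda_mirror_commands is simply a list of shell command strings for pointing pip and conda at domestic mirrors; it pairs naturally with the package's existing run_command helper. For example, applying only the first entry (the Tsinghua pip mirror):

from PgsFile import conda_mirror_commands, run_command

# conda_mirror_commands[0] sets pip's global index-url; the other entries add conda channels.
print(run_command(conda_mirror_commands[0]))
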
@@ -1223,10 +1563,8 @@ def sort_strings_with_embedded_numbers(strings: list) -> list:
1223
1563
  """
1224
1564
  # Sort the strings using the extract_numbers() function as the key
1225
1565
  sorted_strings=sorted(strings, key=extract_numbers)
1226
-
1227
1566
  return sorted_strings
1228
1567
 
1229
-
1230
1568
  def run_command(command: str) -> str:
1231
1569
  """
1232
1570
  Run a command and return its output as a string.
@@ -1416,7 +1754,6 @@ class PGScraper(object):
1416
1754
  return all_want_list
1417
1755
 
1418
1756
 
1419
-
1420
1757
 
1421
1758
  # -*- coding: utf-8 -*-
1422
1759
  """
@@ -1462,7 +1799,6 @@ class ProgressData(object):
1462
1799
  self.size, self.unit, progress, speed, self.unit))
1463
1800
  print('%50s'%('/'*int((1-progress)*50)))
1464
1801
 
1465
-
1466
1802
  def levenshtein_distance(s, t):
1467
1803
  m, n=len(s), len(t)
1468
1804
  if m < n:
@@ -1488,7 +1824,7 @@ pgs_abbres_words=['A.B.','A.D.','A.G.','A.I.','A.M.','A.P.','A.V.','AFP.','Ala.'
1488
1824
  def clean_text(text): #清洗除了句号以外的其他标点符号问题
1489
1825
  # 在标点符号右边邻接单词前添加空格
1490
1826
  import re
1491
- text=replace_chinese_punctuation_with_english(text)
1827
+ # text=replace_chinese_punctuation_with_english(text)
1492
1828
  text=re.sub(r'(?<=[\?\!\,\;\:\)\]\}])\s*(?=\w)', ' ', text)
1493
1829
  # 删除标点符号与左边单词之间的空格
1494
1830
  text=re.sub(r'\s*([\?\!\,\;\:\)\]\}\>])', r'\1', text)
@@ -1504,24 +1840,45 @@ def clean_text(text): #清洗除了句号以外的其他标点符号问题
1504
1840
 
1505
1841
  def clean_text_with_abbreviations(text):
1506
1842
  import re
1507
- text=clean_text(text)
1508
- matches=[]
1509
- for seg in text.split():
1843
+
1844
+ # 按行分割文本
1845
+ lines = text.splitlines()
1846
+
1847
+ # 清洗每一行
1848
+ cleaned_lines = []
1849
+ for line in lines:
1850
+ cleaned_line = clean_line_with_abbreviations(line)
1851
+ cleaned_lines.append(cleaned_line)
1852
+
1853
+ # 将清洗后的行重新组合成文本
1854
+ cleaned_text = '\n'.join(cleaned_lines)
1855
+ return cleaned_text
1856
+
1857
+ def clean_line_with_abbreviations(line):
1858
+ import re
1859
+
1860
+ # 清洗除了句号以外的其他标点符号问题
1861
+ line = clean_text(line)
1862
+
1863
+ matches = []
1864
+ for seg in line.split():
1510
1865
  if "." in seg:
1511
- if seg.endswith(".") is False:
1866
+ if not seg.endswith("."):
1512
1867
  matches.append(seg)
1513
1868
  elif seg.endswith("..") and "..." not in seg:
1514
- text=text.replace("..", ".")
1515
-
1869
+ line = line.replace("..", ".")
1870
+
1516
1871
  for match in matches:
1517
1872
  if any(word in match for word in pgs_abbres_words):
1518
- inter=match.split(".")
1519
- new_match="".join([w+"." for w in inter[0:-1]])+" "+inter[-1]
1520
- text=text.replace(match, new_match)
1873
+ inter = match.split(".")
1874
+ new_match = "".join([w + "." for w in inter[0:-1]]) + " " + inter[-1]
1875
+ line = line.replace(match, new_match)
1521
1876
  else:
1522
- text=text.replace(match, match.replace(".",". "))
1523
- text=re.sub(r'\s+\.', '.', text)
1524
- return text
1877
+ line = line.replace(match, match.replace(".", ". "))
1878
+
1879
+ line = re.sub(r'\s+\.', '.', line)
1880
+ return line
1881
+
1525
1882
 
1526
1883
  import shutil
1527
1884
  def move_file(source_file, destination_folder, new_file_name=None):
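
clean_text_with_abbreviations now processes the text line by line through the new clean_line_with_abbreviations, so original line breaks survive the abbreviation fix-up. A rough sketch (the sample sentences are illustrative; exact spacing depends on pgs_abbres_words):

from PgsFile import clean_text_with_abbreviations

text = "The A.P. reported it on Jan.5.\nProf.Smith disagreed the next day."
print(clean_text_with_abbreviations(text))
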
@@ -1547,6 +1904,28 @@ def move_file(source_file, destination_folder, new_file_name=None):
1547
1904
  shutil.move(source_file, destination_file)
1548
1905
 
1549
1906
  print(f"File moved from {source_file} to {destination_file}")
1907
+
1908
+ def copy_file(source_file, destination_folder, new_file_name=None):
1909
+ """
1910
+ Copy a file to another folder.
1911
+
1912
+ Parameters:
1913
+ source_file (str): The path to the source file.
1914
+ destination_folder (str): The path to the destination folder.
1915
+ new_file_name (str, optional): The new name for the file in the destination folder. Defaults to None.
1916
+ """
1917
+ # Ensure the destination folder exists
1918
+ if not os.path.exists(destination_folder):
1919
+ os.makedirs(destination_folder)
1920
+
1921
+ # Construct the destination file path
1922
+ if new_file_name:
1923
+ destination_file = os.path.join(destination_folder, new_file_name)
1924
+ else:
1925
+ destination_file = os.path.join(destination_folder, os.path.basename(source_file))
1926
+
1927
+ # Copy the file to the destination folder
1928
+ shutil.copy2(source_file, destination_file)
1550
1929
 
1551
1930
  def check_empty_cells(file_path):
1552
1931
  """
@@ -1585,7 +1964,6 @@ def makefile(file_path):
1585
1964
  else:
1586
1965
  write_to_txt(file_path, "")
1587
1966
 
1588
-
1589
1967
  def save_dict_to_excel(data, output_file, headers=None):
1590
1968
  """
1591
1969
  Save Python dictionary data into an Excel .xlsx file with custom headers.
@@ -1794,4 +2172,57 @@ def get_stopwords(language=None):
1794
2172
  return en_stopwords
1795
2173
  else:
1796
2174
  lang_stopwords=get_data_lines(find_txt_files_with_keyword(stopwords_path, language)[0])
1797
- return lang_stopwords
2175
+ return lang_stopwords
2176
+
2177
+ from PIL import Image
2178
+ def replace_white_with_transparency(input_path, output_path):
2179
+ """
2180
+ This function opens an image, replaces all white pixels with transparent pixels.
2181
+
2182
+ Parameters:
2183
+ input_path (str): The path to the input image file.
2184
+ output_path (str): The path to save the output image file.
2185
+ """
2186
+ # 从RGB(24位)模式转成RGBA(32位)模式
2187
+ img = Image.open(input_path).convert('RGBA')
2188
+ W, L = img.size
2189
+ white_pixel = (0, 0, 0, 0) # white
2190
+ for h in range(W):
2191
+ for i in range(L):
2192
+ if img.getpixel((h, i)) == white_pixel:
2193
+ img.putpixel((h, i), (255, 255, 255, 0)) # make it transparent
2194
+ img.save(output_path)
2195
+
2196
+ def get_font_path(font_name=None):
2197
+ '''
2198
+ Retrieves the file path of a specified font.
2199
+
2200
+ Parameters
2201
+ ----------
2202
+ font_name : str, optional
2203
+ The name of the font file (must end with ".ttf"). If provided, it should match one of the available fonts in the library, such as:
2204
+ - 'DejaVuSans.ttf'
2205
+ - '书体坊赵九江钢笔行书体.ttf'
2206
+ - '全新硬笔楷书简.ttf'
2207
+ - '全新硬笔行书简.ttf'
2208
+ - '博洋行书3500.TTF'
2209
+ - '陆柬之行书字体.ttf'
2210
+ The default is None, which will return the path for 'DejaVuSans.ttf'.
2211
+
2212
+ Returns
2213
+ -------
2214
+ font_path : str
2215
+ The full file path of the specified font. If no font name is provided, the default path for 'DejaVuSans.ttf' will be returned.
2216
+ Example: "C:/Windows/Fonts/simhei.ttf"
2217
+ '''
2218
+
2219
+ font_folder = get_library_location("PgsFile") + "/PgsFile/models/fonts"
2220
+ if font_name is None:
2221
+ font_path = get_full_path(font_folder, "DejaVuSans.ttf")
2222
+ else:
2223
+ font_path = get_full_path(font_folder, font_name)
2224
+ return font_path
2225
+
2226
+ simhei_default_font_path_MacOS_Windows=["/System/Library/Fonts/STHeiti Medium.ttc",
2227
+ r"C:\Windows\Fonts\simhei.ttf", # Use a font that supports Chinese characters
2228
+ ]
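
get_font_path resolves one of the .ttf files bundled under PgsFile/models/fonts, which pairs naturally with Pillow (now a declared dependency). A minimal sketch that renders Chinese text with a shipped font and then runs the new transparency helper (output file names are illustrative):

from PIL import Image, ImageDraw, ImageFont
from PgsFile import get_font_path, replace_white_with_transparency

font = ImageFont.truetype(get_font_path("全新硬笔楷书简.ttf"), size=48)

img = Image.new("RGB", (400, 120), "white")
ImageDraw.Draw(img).text((10, 30), "数智时代", font=font, fill="black")
img.save("banner.png")

# Note: as written in 0.2.4 the helper compares pixels against (0, 0, 0, 0),
# so fully white pixels (255, 255, 255, 255) may in fact be left untouched.
replace_white_with_transparency("banner.png", "banner_transparent.png")
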
PgsFile/__init__.py CHANGED
@@ -7,6 +7,7 @@ from .PgsFile import headers, encode_chinese_keyword_for_url
7
7
  from .PgsFile import install_package, uninstall_package
8
8
  from .PgsFile import run_script, run_command
9
9
  from .PgsFile import get_library_location
10
+ from .PgsFile import conda_mirror_commands
10
11
 
11
12
  # 3. Text data retrieval
12
13
  from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
@@ -19,10 +20,10 @@ from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_
19
20
 
20
21
  # 5. File/folder process
21
22
  from .PgsFile import FilePath, FileName, DirList
22
- from .PgsFile import get_subfolder_path
23
+ from .PgsFile import get_subfolder_path, get_full_path
23
24
  from .PgsFile import makedirec, makefile
24
25
  from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
25
- from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
26
+ from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file, copy_file
26
27
  from .PgsFile import concatenate_excel_files
27
28
 
28
29
  # 6. Data cleaning
@@ -32,18 +33,23 @@ from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, L
32
33
  from .PgsFile import check_contain_chinese, check_contain_number
33
34
  from .PgsFile import replace_chinese_punctuation_with_english
34
35
  from .PgsFile import replace_english_punctuation_with_chinese
35
- from .PgsFile import clean_list, clean_text_with_abbreviations
36
+ from .PgsFile import clean_list, clean_text, clean_text_with_abbreviations, clean_line_with_abbreviations
36
37
  from .PgsFile import extract_chinese_punctuation, generate_password, sort_strings_with_embedded_numbers
37
38
 
38
39
  # 7. NLP (natural language processing)
39
40
  from .PgsFile import strQ2B_raw, strQ2B_words
40
41
  from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
41
42
  from .PgsFile import word_list, batch_word_list
42
- from .PgsFile import cs, cs1, sent_tokenize, word_tokenize
43
+ from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2
43
44
 
44
45
  # 8. Maths
45
46
  from .PgsFile import len_rows, check_empty_cells
46
47
  from .PgsFile import format_float, decimal_to_percent, Percentage
47
48
  from .PgsFile import get_text_length_kb, extract_numbers
48
49
 
50
+ # 9. Visualization
51
+ from .PgsFile import replace_white_with_transparency
52
+ from .PgsFile import simhei_default_font_path_MacOS_Windows
53
+ from .PgsFile import get_font_path
54
+
49
55
  name = "PgsFile"
Binary file
Binary file
Binary file
@@ -0,0 +1,134 @@
1
+ model_1.0.bin ['samples: 30', 'precision: 0.7666666666666667', 'recall: 0.696969696969697', 'F1: 0.7301587301587302']
2
+ model_1.2.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
3
+ model_1.4.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
4
+ model_1.5.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
5
+ model_1.6.bin ['samples: 30', 'precision: 0.9', 'recall: 0.8181818181818182', 'F1: 0.8571428571428572']
6
+ model_1.7.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.7878787878787878', 'F1: 0.8253968253968254']
7
+ model_1.8.bin ['samples: 30', 'precision: 0.8', 'recall: 0.7272727272727273', 'F1: 0.761904761904762']
8
+ model_1.9.bin ['samples: 30', 'precision: 0.8', 'recall: 0.7272727272727273', 'F1: 0.761904761904762']
9
+ model_2.0.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
10
+ model_2.1.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.7878787878787878', 'F1: 0.8253968253968254']
11
+
12
+
13
+ model_1.0.bin ['samples: 292', 'precision: 0.5787671232876712', 'recall: 0.48011363636363635', 'F1: 0.5248447204968945']
14
+ model_1.2.bin ['samples: 292', 'precision: 0.636986301369863', 'recall: 0.5284090909090909', 'F1: 0.577639751552795']
15
+ model_1.4.bin ['samples: 292', 'precision: 0.7191780821917808', 'recall: 0.5965909090909091', 'F1: 0.6521739130434782']
16
+ model_1.5.bin ['samples: 292', 'precision: 0.6815068493150684', 'recall: 0.5653409090909091', 'F1: 0.6180124223602484']
17
+ model_1.6.bin ['samples: 292', 'precision: 0.726027397260274', 'recall: 0.6022727272727273', 'F1: 0.6583850931677019']
18
+ model_1.7.bin ['samples: 292', 'precision: 0.7363013698630136', 'recall: 0.6107954545454546', 'F1: 0.6677018633540373']
19
+ model_1.8.bin ['samples: 292', 'precision: 0.7431506849315068', 'recall: 0.6164772727272727', 'F1: 0.6739130434782609']
20
+ model_1.9.bin ['samples: 292', 'precision: 0.7773972602739726', 'recall: 0.6448863636363636', 'F1: 0.7049689440993789']
21
+ model_2.0.bin ['samples: 292', 'precision: 0.7636986301369864', 'recall: 0.6335227272727273', 'F1: 0.6925465838509317']
22
+ model_2.1.bin ['samples: 292', 'precision: 0.7671232876712328', 'recall: 0.6363636363636364', 'F1: 0.6956521739130435']
23
+
24
+
25
+ model_1.0.bin ['samples: 322', 'precision: 0.5962732919254659', 'recall: 0.4987012987012987', 'F1: 0.5431400282885432']
26
+ model_1.2.bin ['samples: 322', 'precision: 0.65527950310559', 'recall: 0.548051948051948', 'F1: 0.5968882602545968']
27
+ model_1.4.bin ['samples: 322', 'precision: 0.7267080745341615', 'recall: 0.6077922077922078', 'F1: 0.6619519094766619']
28
+ model_1.5.bin ['samples: 322', 'precision: 0.6956521739130435', 'recall: 0.5818181818181818', 'F1: 0.6336633663366337']
29
+ model_1.6.bin ['samples: 322', 'precision: 0.7422360248447205', 'recall: 0.6207792207792208', 'F1: 0.6760961810466761']
30
+ model_1.7.bin ['samples: 322', 'precision: 0.7484472049689441', 'recall: 0.625974025974026', 'F1: 0.6817538896746819']
31
+ model_1.8.bin ['samples: 322', 'precision: 0.7484472049689441', 'recall: 0.625974025974026', 'F1: 0.6817538896746819']
32
+ model_1.9.bin ['samples: 322', 'precision: 0.7795031055900621', 'recall: 0.6519480519480519', 'F1: 0.71004243281471']
33
+ model_2.0.bin ['samples: 322', 'precision: 0.7701863354037267', 'recall: 0.6441558441558441', 'F1: 0.7015558698727016']
34
+ model_2.1.bin ['samples: 322', 'precision: 0.7763975155279503', 'recall: 0.6493506493506493', 'F1: 0.7072135785007072']
35
+
36
+
37
+ =========================================================非重复验证集==================================================
38
+
39
+ model_1.2.bin ['samples: 303', 'precision: 0.6435643564356436', 'recall: 0.5342465753424658', 'F1: 0.5838323353293414']
40
+ model_1.4.bin ['samples: 303', 'precision: 0.7161716171617162', 'recall: 0.5945205479452055', 'F1: 0.6497005988023953']
41
+ model_1.5.bin ['samples: 303', 'precision: 0.6864686468646864', 'recall: 0.5698630136986301', 'F1: 0.6227544910179641']
42
+ model_1.6.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.6082191780821918', 'F1: 0.6646706586826348']
43
+ model_1.7.bin ['samples: 303', 'precision: 0.7425742574257426', 'recall: 0.6164383561643836', 'F1: 0.6736526946107784']
44
+ model_1.8.bin ['samples: 303', 'precision: 0.7392739273927392', 'recall: 0.6136986301369863', 'F1: 0.6706586826347306']
45
+ model_1.9.bin ['samples: 303', 'precision: 0.7722772277227723', 'recall: 0.6410958904109589', 'F1: 0.7005988023952096']
46
+ model_2.0.bin ['samples: 303', 'precision: 0.759075907590759', 'recall: 0.6301369863013698', 'F1: 0.688622754491018']
47
+ model_2.1.bin ['samples: 303', 'precision: 0.7623762376237624', 'recall: 0.6328767123287671', 'F1: 0.6916167664670658']
48
+ model_2.2.bin ['samples: 303', 'precision: 0.7458745874587459', 'recall: 0.6191780821917808', 'F1: 0.6766467065868264']
49
+
50
+ =================================================非重复验证集+5分标签==================================================
51
+
52
+ model_1.2.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
53
+ model_1.4.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.8125', 'F1: 0.8387096774193549']
54
+ model_1.5.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
55
+ model_1.6.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
56
+ model_1.7.bin ['samples: 30', 'precision: 0.8', 'recall: 0.75', 'F1: 0.7741935483870969']
57
+ model_1.8.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
58
+ model_1.9.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
59
+ model_2.0.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
60
+ model_2.1.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
61
+ model_2.2.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
62
+
63
+
64
+ model_1.2.bin ['samples: 302', 'precision: 0.6721854304635762', 'recall: 0.6444444444444445', 'F1: 0.6580226904376014']
65
+ model_1.4.bin ['samples: 302', 'precision: 0.7019867549668874', 'recall: 0.6730158730158731', 'F1: 0.6871961102106969']
66
+ model_1.5.bin ['samples: 302', 'precision: 0.7185430463576159', 'recall: 0.6888888888888889', 'F1: 0.7034035656401946']
67
+ model_1.6.bin ['samples: 302', 'precision: 0.7086092715231788', 'recall: 0.6793650793650794', 'F1: 0.6936790923824959']
68
+ model_1.7.bin ['samples: 302', 'precision: 0.7052980132450332', 'recall: 0.6761904761904762', 'F1: 0.6904376012965965']
69
+ model_1.8.bin ['samples: 302', 'precision: 0.7317880794701986', 'recall: 0.7015873015873015', 'F1: 0.7163695299837927']
70
+ model_1.9.bin ['samples: 302', 'precision: 0.7317880794701986', 'recall: 0.7015873015873015', 'F1: 0.7163695299837927']
71
+ model_2.0.bin ['samples: 302', 'precision: 0.7417218543046358', 'recall: 0.7111111111111111', 'F1: 0.7260940032414911']
72
+ model_2.1.bin ['samples: 302', 'precision: 0.7516556291390728', 'recall: 0.7206349206349206', 'F1: 0.7358184764991895']
73
+ model_2.2.bin ['samples: 302', 'precision: 0.7582781456953642', 'recall: 0.726984126984127', 'F1: 0.7423014586709886']
74
+
75
+
76
+ model_1.2.bin ['samples: 303', 'precision: 0.6732673267326733', 'recall: 0.6455696202531646', 'F1: 0.6591276252019386']
77
+ model_1.4.bin ['samples: 303', 'precision: 0.7029702970297029', 'recall: 0.6740506329113924', 'F1: 0.6882067851373183']
78
+ model_1.5.bin ['samples: 303', 'precision: 0.7194719471947195', 'recall: 0.689873417721519', 'F1: 0.7043618739903069']
79
+ model_1.6.bin ['samples: 303', 'precision: 0.7095709570957096', 'recall: 0.680379746835443', 'F1: 0.6946688206785137']
80
+ model_1.7.bin ['samples: 303', 'precision: 0.7062706270627063', 'recall: 0.6772151898734177', 'F1: 0.6914378029079159']
81
+ model_1.8.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.7025316455696202', 'F1: 0.7172859450726979']
82
+ model_1.9.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.7025316455696202', 'F1: 0.7172859450726979']
83
+ model_2.0.bin ['samples: 303', 'precision: 0.7425742574257426', 'recall: 0.7120253164556962', 'F1: 0.7269789983844911']
84
+ model_2.1.bin ['samples: 303', 'precision: 0.7524752475247525', 'recall: 0.7215189873417721', 'F1: 0.7366720516962842']
85
+ model_2.2.bin ['samples: 303', 'precision: 0.759075907590759', 'recall: 0.7278481012658228', 'F1: 0.7431340872374799']
86
+
87
+
88
+ model_1.2.bin ['samples: 425', 'precision: 0.6470588235294118', 'recall: 0.5456349206349206', 'F1: 0.5920344456404736']
89
+ model_1.2.bin ['samples: 425', 'precision: 0.691764705882353', 'recall: 0.6621621621621622', 'F1: 0.6766398158803222']
90
+ model_1.4.bin ['samples: 425', 'precision: 0.7129411764705882', 'recall: 0.6824324324324325', 'F1: 0.6973532796317606']
91
+ model_1.5.bin ['samples: 425', 'precision: 0.7294117647058823', 'recall: 0.6981981981981982', 'F1: 0.713463751438435']
92
+ model_1.6.bin ['samples: 425', 'precision: 0.7129411764705882', 'recall: 0.6824324324324325', 'F1: 0.6973532796317606']
93
+ model_1.7.bin ['samples: 425', 'precision: 0.7105882352941176', 'recall: 0.6801801801801802', 'F1: 0.6950517836593786']
94
+ model_1.8.bin ['samples: 425', 'precision: 0.7505882352941177', 'recall: 0.7184684684684685', 'F1: 0.7341772151898734']
95
+ model_1.9.bin ['samples: 425', 'precision: 0.7529411764705882', 'recall: 0.7207207207207207', 'F1: 0.7364787111622554']
96
+ model_2.0.bin ['samples: 425', 'precision: 0.7670588235294118', 'recall: 0.7342342342342343', 'F1: 0.7502876869965478']
97
+ model_2.1.bin ['samples: 425', 'precision: 0.7717647058823529', 'recall: 0.7387387387387387', 'F1: 0.7548906789413118']
98
+ model_2.2.bin ['samples: 425', 'precision: 0.7764705882352941', 'recall: 0.7432432432432432', 'F1: 0.7594936708860759']
99
+
100
+ model_1.2.bin ['samples: 447', 'precision: 0.6935123042505593', 'recall: 0.6623931623931624', 'F1: 0.6775956284153005']
101
+ model_1.4.bin ['samples: 447', 'precision: 0.7158836689038032', 'recall: 0.6837606837606838', 'F1: 0.6994535519125684']
102
+ model_1.5.bin ['samples: 447', 'precision: 0.7337807606263982', 'recall: 0.7008547008547008', 'F1: 0.7169398907103826']
103
+ model_1.6.bin ['samples: 447', 'precision: 0.7203579418344519', 'recall: 0.688034188034188', 'F1: 0.7038251366120218']
104
+ model_1.7.bin ['samples: 447', 'precision: 0.7158836689038032', 'recall: 0.6837606837606838', 'F1: 0.6994535519125684']
105
+ model_1.8.bin ['samples: 447', 'precision: 0.7539149888143176', 'recall: 0.7200854700854701', 'F1: 0.7366120218579234']
106
+ model_1.9.bin ['samples: 447', 'precision: 0.7539149888143176', 'recall: 0.7200854700854701', 'F1: 0.7366120218579234']
107
+ model_2.0.bin ['samples: 447', 'precision: 0.7695749440715883', 'recall: 0.7350427350427351', 'F1: 0.7519125683060108']
108
+ model_2.1.bin ['samples: 447', 'precision: 0.7718120805369127', 'recall: 0.7371794871794872', 'F1: 0.7540983606557377']
109
+ model_2.2.bin ['samples: 447', 'precision: 0.7785234899328859', 'recall: 0.7435897435897436', 'F1: 0.760655737704918']
110
+
111
+ model_1.2.bin
112
+ model_1.4.bin
113
+ model_1.5.bin
114
+ model_1.6.bin
115
+ model_1.7.bin
116
+ model_1.8.bin
117
+ model_1.9.bin
118
+ model_2.0.bin
119
+ model_2.1.bin
120
+ model_2.2.bin
121
+
122
+ model_1.2.bin
123
+ model_1.4.bin
124
+ model_1.5.bin
125
+ model_1.6.bin
126
+ model_1.7.bin
127
+ model_1.8.bin
128
+ model_1.9.bin
129
+ model_2.0.bin
130
+ model_2.1.bin
131
+ model_2.2.bin
132
+
133
+
134
+
@@ -0,0 +1,41 @@
1
+ Metadata-Version: 2.1
2
+ Name: PgsFile
3
+ Version: 0.2.4
4
+ Summary: This module streamlines Python package management, script execution, file handling, web scraping, multimedia downloads, data cleaning, and NLP tasks such as word tokenization and POS tagging. It also assists with generating word lists and plotting data, making these tasks more accessible and convenient for literary students. Whether you need to scrape data from websites, clean text, or analyze language, this module provides user-friendly tools to simplify your workflow.
5
+ Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
6
+ Author: Pan Guisheng
7
+ Author-email: 895284504@qq.com
8
+ License: Educational free
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: Free For Educational Use
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: chardet
16
+ Requires-Dist: pandas
17
+ Requires-Dist: python-docx
18
+ Requires-Dist: pip
19
+ Requires-Dist: requests
20
+ Requires-Dist: fake-useragent
21
+ Requires-Dist: lxml
22
+ Requires-Dist: pimht
23
+ Requires-Dist: pysbd
24
+ Requires-Dist: nlpir-python
25
+ Requires-Dist: pillow
26
+
27
+ Purpose: This module is designed to make complex tasks accessible and convenient, even for beginners. By providing a unified set of tools, it simplifies the workflow for data collection, processing, and analysis. Whether you're scraping data from the web, cleaning text, or performing NLP tasks, this module ensures you can focus on your research without getting bogged down by technical challenges.
28
+
29
+ Key Features:
30
+ 1. Web Scraping: Easily scrape data from websites and download multimedia content.
31
+ 2. Package Management: Install, uninstall, and manage Python packages with simple commands.
32
+ 3. Data Retrieval: Extract data from various file formats like text, JSON, TSV, Excel, and HTML (both online and offline).
33
+ 4. Data Storage: Write and append data to text files, Excel, JSON, and JSON lines.
34
+ 5. File and Folder Processing: Manage file paths, create directories, move or copy files, and search for files with specific keywords.
35
+ 6. Data Cleaning: Clean text, handle punctuation, remove stopwords, and prepare data for analysis.
36
+ 7. NLP: Perform tokenization, generate n-grams, and create word lists for text analysis.
37
+ 8. Math Operations: Format numbers, convert decimals to percentages, and validate data.
38
+ 9. Visualization: Process images (e.g., make white pixels transparent) and manage fonts for rendering text.
39
+
40
+ Author: Pan Guisheng, a PhD student at the Graduate Institute of Interpretation and Translation of Shanghai International Studies University
41
+ E-mail: 895284504@qq.com
@@ -1,5 +1,5 @@
1
- PgsFile/PgsFile.py,sha256=BQNZBrBYgyB_4TVxD0CJ6cMpiaaDzL9b7c7kcYtxmwQ,83682
2
- PgsFile/__init__.py,sha256=-Vy1SIh-BYopiEan-EjBtwqZsNteNrOqkws7hUj1d2w,2378
1
+ PgsFile/PgsFile.py,sha256=V_Pnn5hljeR9xYQ8hyUAmf92140N4ORoQOe-cdBHJos,101212
2
+ PgsFile/__init__.py,sha256=C-uX4tN3J3L5Zr6r8qQx0zwNaG0UXTowxK_K0pd0Jt4,2680
3
3
  PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
4
4
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
5
5
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2569,6 +2569,7 @@ PgsFile/Corpora/Parallel/TED_EC_2017-2020/YvetteAlberdingkThijm_2017X公民影
2569
2569
  PgsFile/Corpora/Parallel/TED_EC_2017-2020/ZacharyRWood_2018为什么持有不同意见的人值得被聆听..txt,sha256=4SFYMhlFSHP2aEVvNS1CBeogq0D2lPTE5VhFsZjlZnM,9546
2570
2570
  PgsFile/Corpora/Parallel/TED_EC_2017-2020/ZeynepTufekci_2017G为了让人们点击广告_我们正在建造一个反乌托邦..txt,sha256=S3BSXKsNAX0ugVqBPhmJyaRF8MYAHapDMR12DoBYZgc,32353
2571
2571
  PgsFile/Corpora/Parallel/Xi's Speech_CE_2021/Speech at a Ceremony Marking the Centenary of the CPC.txt,sha256=3suCjs2LF2_Endg2i_hc3GX1N8lTBORlqpMWEKsXFeM,54282
2572
+ PgsFile/Corpora/Stopwords/NLPIR.user,sha256=DykLJdr8_cVHrdCnDJES1O5dgmnYqfaSO1_dtAVKYJk,3356
2572
2573
  PgsFile/Corpora/Stopwords/arabic.txt,sha256=yL9id0vdNF20WEvM0buRnRt1ByEeRGJuGDiY3jE7tlQ,1287
2573
2574
  PgsFile/Corpora/Stopwords/bulgarian.txt,sha256=eiIwYk1TU8YcYYPbMPjUzZSZlgd7gl5o7d0LIthzqHQ,2409
2574
2575
  PgsFile/Corpora/Stopwords/catalan.txt,sha256=8OyAOBHfWsEvKuLEphCfdiWhuxyVg1sOWV5gi2DJLwY,699
@@ -2599,6 +2600,7 @@ PgsFile/Corpora/Stopwords/turkish.txt,sha256=uGUvjEm2GR8PuVY_JeHNxhD7cWlNlF7vc3V
2599
2600
  PgsFile/Corpora/Stopwords/ukrainian.txt,sha256=fEzWLTwnWJriILkO-5jSfE2SpqY-GPf_kR4zid3MFUI,4131
2600
2601
  PgsFile/Corpora/Stopwords/vietnamese.txt,sha256=88yRtVMaRSFqas1iGGa6kOGDCZTgtzRPmR3q9dHshdc,20485
2601
2602
  PgsFile/Corpora/Terminology/Chinese_Thought.json,sha256=CdkuF2wLaDC5V3sRefcU1RZwXm4-wTZ-Qfk8r7gsu8I,2301866
2603
+ PgsFile/models/NLPIR.user,sha256=DykLJdr8_cVHrdCnDJES1O5dgmnYqfaSO1_dtAVKYJk,3356
2602
2604
  PgsFile/models/czech.pickle,sha256=W6c9KTx9eVOVa88C82lexcHw1Sfyo8OAl_VZM5T6FpA,1265552
2603
2605
  PgsFile/models/danish.pickle,sha256=6il2CgqRl_UspZ54rq_FpvVdBSWPr32xcJsrnrMh7yA,1264725
2604
2606
  PgsFile/models/dutch.pickle,sha256=So4ms9aMRcOOWU0Z4tVndEe_3KpjbTsees_tDpJy1zw,742624
@@ -2610,6 +2612,8 @@ PgsFile/models/german.pickle,sha256=6rSX-ghUExMMj9D7E7kpEokwr-L2om6ocVyV33CI6Xw,
2610
2612
  PgsFile/models/greek.pickle,sha256=IXUqZ2L61c_kb7XEX62ahUhKDo6Bxn5q9vuXPPwn1nw,1953106
2611
2613
  PgsFile/models/italian.pickle,sha256=3LJxfXvl8m6GCpLgWs9psRI6X0UnzXommpq56eZoyAU,658331
2612
2614
  PgsFile/models/malayalam.pickle,sha256=H4z1isvbf0cqxAr_wTZjvkLa-0fBUDDBGt4ERMng5T0,221207
2615
+ PgsFile/models/model_reviews2.2.bin,sha256=D6uL8KZIxD0rfWjH0kYEb7z_HE4aTJXpj82HzsCOpuk,1943196
2616
+ PgsFile/models/model_reviews_ReadMe.txt,sha256=Q9uLJwudMmsTKfd11l1tOcIP8lwsemIwnAVJG_3SYjU,11433
2613
2617
  PgsFile/models/norwegian.pickle,sha256=5Kl_j5oDoDON10a8yJoK4PVK5DuDX6N9g-J54cp5T68,1259779
2614
2618
  PgsFile/models/polish.pickle,sha256=FhJ7bRCTNCej6Q-yDpvlPh-zcf95pzDBAwc07YC5DJI,2042451
2615
2619
  PgsFile/models/portuguese.pickle,sha256=uwG_fHmk6twheLvSCWZROaDks48tHET-8Jfek5VRQOA,649051
@@ -2618,8 +2622,14 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
2618
2622
  PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
2619
2623
  PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
2620
2624
  PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
2621
- PgsFile-0.2.2.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
- PgsFile-0.2.2.dist-info/METADATA,sha256=1fm2uh-uYKgDe26DvUGCmj2LbMcjwDum113nbmW-MIA,5070
2623
- PgsFile-0.2.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
- PgsFile-0.2.2.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
- PgsFile-0.2.2.dist-info/RECORD,,
2625
+ PgsFile/models/fonts/DejaVuSans.ttf,sha256=faGVp0xVvvmI0NSPlQi9XYSUJcF3Dbpde_xs6e2EiVQ,757076
2626
+ PgsFile/models/fonts/书体坊赵九江钢笔行书体.ttf,sha256=fTOv4FFMnYtN1zCZghJ6-P1pzznA5qqoujwpDFY63Ek,3140656
2627
+ PgsFile/models/fonts/全新硬笔楷书简.ttf,sha256=mPemGYMpgQxvFL1pFjjnyUMIprHzcoOaw8oeZQ4k1x0,2397296
2628
+ PgsFile/models/fonts/全新硬笔行书简.ttf,sha256=bUtbl71eK_ellp1z0tCmmR_P-JhqVFIpzeuRlrEBo9g,2611516
2629
+ PgsFile/models/fonts/博洋行书3500.TTF,sha256=VrgeHr8cgOL6JD05QyuD9ZSyw4J2aIVxKxW8zSajq6Q,4410732
2630
+ PgsFile/models/fonts/陆柬之行书字体.ttf,sha256=Zpd4Z7E9w-Qy74yklXHk4vM7HOtHuQgllvygxZZ1Hvs,1247288
2631
+ PgsFile-0.2.4.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2632
+ PgsFile-0.2.4.dist-info/METADATA,sha256=JC1a8Xrh3tDt5-HNnCJY3V4tNYKstu83V2qo_FqkATY,2711
2633
+ PgsFile-0.2.4.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2634
+ PgsFile-0.2.4.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2635
+ PgsFile-0.2.4.dist-info/RECORD,,
@@ -1,79 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: PgsFile
3
- Version: 0.2.2
4
- Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, NLP tasks like Chinese word tokenization and POS tagging, and word list generation for literary students, making it more accessible and convenient to use.
5
- Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
6
- Author: Pan Guisheng
7
- Author-email: 895284504@qq.com
8
- License: Educational free
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: License :: Free For Educational Use
11
- Classifier: Operating System :: OS Independent
12
- Requires-Python: >=3.8
13
- Description-Content-Type: text/markdown
14
- License-File: LICENSE
15
- Requires-Dist: chardet
16
- Requires-Dist: pandas
17
- Requires-Dist: python-docx
18
- Requires-Dist: pip
19
- Requires-Dist: requests
20
- Requires-Dist: fake-useragent
21
- Requires-Dist: lxml
22
- Requires-Dist: pimht
23
- Requires-Dist: pysbd
24
- Requires-Dist: nlpir-python
25
-
26
- Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
27
-
28
-
29
- Function 1: Enables efficient data retrieval and storage in files with a single line of code.
30
-
31
- Function 2: Facilitates retrieval of all absolute file paths and file names in any folder (including sub-folders) with a single line of code using "FilePath" and "FileName" functions.
32
-
33
- Function 3: Simplifies creation of word lists and frequency sorting from a file or batch of files using "word_list" and "batch_word_list" functions in PgsFile.
34
-
35
- Function 4: Pgs-Corpora is a comprehensive language resource included in this library, featuring a monolingual corpus of native and translational Chinese and native and non-native English, as well as a bi-directional parallel corpus of Chinese and English texts covering financial, legal, political, academic, and sports news topics. Additionally, the library includes a collection of 8774 English idioms, stopwords for 28 languages, and a termbank of Chinese thought and culture.
36
-
37
- Function 5: This library provides support for common text cleaning tasks, such as removing empty text, empty lines, and folders containing empty text. It also offers functions for converting full-width characters to half-width characters and vice versa, as well as standardizing the format of Chinese and English punctuation. These features can help improve the quality and consistency of text data used in natural language processing tasks.
38
-
39
- Function 6: It also manages Python package installations and uninstallations, and allows running scripts and commands in Python interactive command lines instead of Windows command prompt.
40
-
41
- Function 7: Download audiovisual files like videos, images, and audio using audiovisual_downloader, which is extremely useful and efficient. Additionally, scrape newspaper data with PGScraper, a highly efficient tool for this purpose.
42
-
43
- Table 1: The directory and size of Pgs-Corpora
44
- ├── Idioms (1, 171.78 KB)
45
- ├── Monolingual (2197, 63.65 MB)
46
- │ ├── Chinese (456, 15.27 MB)
47
- │ │ ├── People's Daily 20130605 (396, 1.38 MB)
48
- │ │ │ ├── Raw (132, 261.73 KB)
49
- │ │ │ ├── Seg_only (132, 471.47 KB)
50
- │ │ │ └── Tagged (132, 675.30 KB)
51
- │ │ └── Translational Fictions (60, 13.89 MB)
52
- │ └── English (1741, 48.38 MB)
53
- │ ├── Native (65, 44.14 MB)
54
- │ │ ├── A Short Collection of British Fiction (27, 33.90 MB)
55
- │ │ └── Preschoolers- and Teenagers-oriented Texts in English (36, 10.24 MB)
56
- │ ├── Non-native (1675, 3.63 MB)
57
- │ │ └── Shanghai Daily (1675, 3.63 MB)
58
- │ │ └── Business_2019 (1675, 3.63 MB)
59
- │ │ ├── 2019-01-01 (1, 3.35 KB)
60
- │ │ ├── 2019-01-02 (1, 3.65 KB)
61
- │ │ ├── 2019-01-03 (7, 10.90 KB)
62
- │ │ ├── 2019-01-04 (5, 9.63 KB)
63
- │ │ └── 2019-01-07 (4, 9.50 KB)
64
- │ │ └── ... (and 245 more directories)
65
- │ └── Translational (1, 622.57 KB)
66
- ├── Parallel (371, 24.67 MB)
67
- │ ├── HK Financial and Legal EC Parallel Corpora (5, 19.17 MB)
68
- │ ├── New Year Address_CE_2006-2021 (15, 147.49 KB)
69
- │ ├── Sports News_CE_2010 (20, 66.42 KB)
70
- │ ├── TED_EC_2017-2020 (330, 5.24 MB)
71
- │ └── Xi's Speech_CE_2021 (1, 53.01 KB)
72
- ├── Stopwords (28, 88.09 KB)
73
- └── Terminology (1, 2.20 MB)
74
-
75
- ...
76
-
77
-
78
- Author: Pan Guisheng, a PhD student at the Graduate Institute of Interpretation and Translation of Shanghai International Studies University
79
- E-mail: 895284504@qq.com