PgsFile 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of PgsFile might be problematic.

Files changed (17)
  1. PgsFile/PgsFile.py +415 -35
  2. PgsFile/__init__.py +9 -3
  3. PgsFile/models/NLPIR.user +0 -0
  4. PgsFile/models/fonts/DejaVuSans.ttf +0 -0
  5. PgsFile/models/fonts/书体坊赵九江钢笔行书体.ttf +0 -0
  6. PgsFile/models/fonts/全新硬笔楷书简.ttf +0 -0
  7. PgsFile/models/fonts/全新硬笔行书简.ttf +0 -0
  8. PgsFile/models/fonts/博洋行书3500.TTF +0 -0
  9. PgsFile/models/fonts/陆柬之行书字体.ttf +0 -0
  10. PgsFile/models/model_reviews2.2.bin +0 -0
  11. PgsFile/models/model_reviews_ReadMe.txt +134 -0
  12. PgsFile-0.2.4.dist-info/METADATA +41 -0
  13. {PgsFile-0.2.3.dist-info → PgsFile-0.2.4.dist-info}/RECORD +16 -7
  14. PgsFile-0.2.3.dist-info/METADATA +0 -79
  15. {PgsFile-0.2.3.dist-info → PgsFile-0.2.4.dist-info}/LICENSE +0 -0
  16. {PgsFile-0.2.3.dist-info → PgsFile-0.2.4.dist-info}/WHEEL +0 -0
  17. {PgsFile-0.2.3.dist-info → PgsFile-0.2.4.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py CHANGED
@@ -103,7 +103,7 @@ def get_data_text(path):
  else:
  return None

- def get_data_lines(path):
+ def get_data_lines(path, no_line_breaks=False):
  '''
  Parameters
  ----------
@@ -133,7 +133,10 @@ def get_data_lines(path):
  # Read the entire file using the detected encoding
  if encoding:
  with open(path, 'r', encoding=encoding, errors="ignore") as f:
- lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
+ if no_line_breaks is False:
+ lines = [l.strip() for l in f.readlines() if len(l.strip()) != 0]
+ else:
+ lines = f.readlines()
  return lines
  else:
  return None
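Note: as the hunk above reads, the new no_line_breaks flag is slightly counter-intuitive: the default (False) keeps the old behaviour of stripping lines and dropping empty ones, while no_line_breaks=True returns f.readlines() unchanged, i.e. with trailing newlines kept. A minimal usage sketch, assuming a plain-text file at a hypothetical path data.txt:

    from PgsFile import get_data_lines

    clean_lines = get_data_lines("data.txt")                       # stripped, empty lines removed
    raw_lines = get_data_lines("data.txt", no_line_breaks=True)    # raw readlines(), "\n" preserved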
@@ -197,15 +200,15 @@ def get_data_excel(excel_path,column_id,sheet_name=None):
  inter=df.iloc[0:,column_id] #提取第二列所有行
  return list(inter)

- def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
+ def write_to_excel(excel_path, data, sheet_name=None, index=None):
  '''
  Parameters
  ----------
  excel_path : TYPE
  DESCRIPTION. results.xlsx

- dic_of_list : TYPE
- DESCRIPTION. {"col":["a","b","c","d"],"freq":[1,2,3,4]}
+ data : TYPE, dict
+ DESCRIPTION. data = {'翻译': 24, '教学': 8, '数智': 6, '时代': 6, '财经': 6, '新': 4}

  sheet_name : TYPE, optional
  DESCRIPTION. The default is None.
@@ -227,6 +230,10 @@ def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
  index=False
  else:
  index=True
+
+ col = list(data.keys())
+ freq = list(data.values())
+ dic_of_list={"items": col, "counts": freq}

  df=pd.DataFrame(dic_of_list)
  df.style.to_excel(excel_path, sheet_name=sheet_name,startcol=0, index=index)
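Note: write_to_excel() now expects a flat dict mapping items to counts rather than a dict of column lists; the function builds the "items"/"counts" columns itself before handing the frame to pandas. A minimal sketch of the new calling convention (results.xlsx is a hypothetical output path):

    from PgsFile import write_to_excel

    freq = {'翻译': 24, '教学': 8, '数智': 6}   # item -> count, as in the updated docstring
    write_to_excel("results.xlsx", freq)        # written as two columns, "items" and "counts"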
@@ -471,6 +478,18 @@ def get_directory_tree_with_meta(start_path, indent='', show_meta=False, max_dir
  print(f"{indent}└── ... (and {remaining_directories} more directories)")
  # current_level=-1 will show all folders' info.

+ def get_full_path(*path_components):
+ """
+ Combines multiple path components into a single, full path using os.path.join.
+
+ Args:
+ *path_components: Variable number of path components (strings).
+
+ Returns:
+ str: The combined full path.
+ """
+ return os.path.join(*path_components)
+
  def get_subfolder_path(parent_folder, subfolder_name):
  import os
  subfolder_name=subfolder_name.strip()
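Note: get_full_path() is a thin wrapper around os.path.join, exported so callers do not need to import os themselves. A small sketch (the directory names are hypothetical):

    from PgsFile import get_full_path

    font_file = get_full_path("C:/corpus", "fonts", "DejaVuSans.ttf")
    # equivalent to os.path.join("C:/corpus", "fonts", "DejaVuSans.ttf")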
@@ -553,7 +572,6 @@ def batch_word_list(input_root):
  sorted_words=sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
  return sorted_words

-
  def clean_list(meta):
  """
  Parameters
@@ -576,7 +594,6 @@ def clean_list(meta):

  yhd=["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36','Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)','Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)','Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+','Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0','Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)','Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5','Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1','Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13','Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6','NOKIA5700/ UCWEB7.0.2.37/28/999','Openwave/ UCWEB7.0.2.37/28/999','Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10','Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11','Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11','UCWEB7.0.2.37/28/999']

-
  def source_path(relative_path):
  import sys,os
  if getattr(sys, 'frozen', False):
@@ -590,7 +607,6 @@ def next_folder_names(folder):
  folder_namelist=next(os.walk(folder))[1]
  return folder_namelist

-
  def remove_empty_txts(folder_path):
  import os
  files=FilePath(folder_path)
@@ -797,7 +813,6 @@ def cs1(text):
  sentences=sentences
  return sentences

-
  def word_tokenize(text, pos_tagged=False):
  '''
  Parameters
@@ -838,7 +853,7 @@ def word_tokenize(text, pos_tagged=False):
  else:
  try:
  if "Can not open" in str(err):
- user_folder=get_library_location("PgsFile")+"/PgsFile/Corpora/Stopwords"
+ user_folder=get_library_location("PgsFile")+"/PgsFile/models"
  destination_folder=get_library_location("nlpir-python")+"/nlpir/Data"
  source_file=find_user_files_in_upper_folder(user_folder, "NLPIR")[0]
  copy_file(source_file, destination_folder)
@@ -850,6 +865,296 @@ def word_tokenize(text, pos_tagged=False):

  return words

+ import re
+ from abc import ABC, abstractmethod
+ from typing import Iterator, List, Tuple
+ class TokenizerI(ABC):
+ """
+ A processing interface for tokenizing a string.
+ Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
+ """
+
+ @abstractmethod
+ def tokenize(self, s: str) -> List[str]:
+ """
+ Return a tokenized copy of *s*.
+
+ :rtype: List[str]
+ """
+ if overridden(self.tokenize_sents):
+ return self.tokenize_sents([s])[0]
+
+ def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
+ """
+ Identify the tokens using integer offsets ``(start_i, end_i)``,
+ where ``s[start_i:end_i]`` is the corresponding token.
+
+ :rtype: Iterator[Tuple[int, int]]
+ """
+ raise NotImplementedError()
+
+ def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
+ """
+ Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
+
+ return [self.tokenize(s) for s in strings]
+
+ :rtype: List[List[str]]
+ """
+ return [self.tokenize(s) for s in strings]
+
+ def span_tokenize_sents(
+ self, strings: List[str]
+ ) -> Iterator[List[Tuple[int, int]]]:
+ """
+ Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
+
+ return [self.span_tokenize(s) for s in strings]
+
+ :yield: List[Tuple[int, int]]
+ """
+ for s in strings:
+ yield list(self.span_tokenize(s))
+
+ class MacIntyreContractions:
+ """
+ List of contractions adapted from Robert MacIntyre's tokenizer.
+ """
+
+ CONTRACTIONS2 = [
+ r"(?i)\b(can)(?#X)(not)\b",
+ r"(?i)\b(d)(?#X)('ye)\b",
+ r"(?i)\b(gim)(?#X)(me)\b",
+ r"(?i)\b(gon)(?#X)(na)\b",
+ r"(?i)\b(got)(?#X)(ta)\b",
+ r"(?i)\b(lem)(?#X)(me)\b",
+ r"(?i)\b(more)(?#X)('n)\b",
+ r"(?i)\b(wan)(?#X)(na)(?=\s)",
+ ]
+ CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+ CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
+
+ class NLTKWordTokenizer(TokenizerI):
+ """
+ The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
+
+ This is the method that is invoked by ``word_tokenize()``. It assumes that the
+ text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
+
+ The tokenizer is "destructive" such that the regexes applied will munge the
+ input string to a state beyond re-construction. It is possible to apply
+ `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
+ `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
+ revert to the original string.
+ """
+
+ # Starting quotes.
+ STARTING_QUOTES = [
+ (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
+ (re.compile(r"^\""), r"``"),
+ (re.compile(r"(``)"), r" \1 "),
+ (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
+ (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
+ ]
+
+ # Ending quotes.
+ ENDING_QUOTES = [
+ (re.compile("([»”’])", re.U), r" \1 "),
+ (re.compile(r"''"), " '' "),
+ (re.compile(r'"'), " '' "),
+ (re.compile(r"\s+"), " "),
+ (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
+ (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
+ ]
+
+ # For improvements for starting/closing quotes from TreebankWordTokenizer,
+ # see discussion on https://github.com/nltk/nltk/pull/1437
+ # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
+ # - chevron quotes u'\xab' and u'\xbb'
+ # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+ # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
+ # Also, behavior of splitting on clitics now follows Stanford CoreNLP
+ # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
+
+ # Punctuation.
+ PUNCTUATION = [
+ (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
+ (re.compile(r"([:,])([^\d])"), r" \1 \2"),
+ (re.compile(r"([:,])$"), r" \1 "),
+ (
+ re.compile(r"\.{2,}", re.U),
+ r" \g<0> ",
+ ), # See https://github.com/nltk/nltk/pull/2322
+ (re.compile(r"[;@#$%&]"), r" \g<0> "),
+ (
+ re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
+ r"\1 \2\3 ",
+ ), # Handles the final period.
+ (re.compile(r"[?!]"), r" \g<0> "),
+ (re.compile(r"([^'])' "), r"\1 ' "),
+ (
+ re.compile(r"[*]", re.U),
+ r" \g<0> ",
+ ), # See https://github.com/nltk/nltk/pull/2322
+ ]
+
+ # Pads parentheses
+ PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
+
+ # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
+ CONVERT_PARENTHESES = [
+ (re.compile(r"\("), "-LRB-"),
+ (re.compile(r"\)"), "-RRB-"),
+ (re.compile(r"\["), "-LSB-"),
+ (re.compile(r"\]"), "-RSB-"),
+ (re.compile(r"\{"), "-LCB-"),
+ (re.compile(r"\}"), "-RCB-"),
+ ]
+
+ DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
+
+ # List of contractions adapted from Robert MacIntyre's tokenizer.
+ _contractions = MacIntyreContractions()
+ CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
+ CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
+
+ def tokenize(
+ self, text: str, convert_parentheses: bool = False, return_str: bool = False
+ ) -> List[str]:
+ r"""Return a tokenized copy of `text`.
+
+ >>> from nltk.tokenize import NLTKWordTokenizer
+ >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
+ >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
+ ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
+ 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
+ 'of', 'them.', 'Thanks', '.']
+ >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
+ ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
+ 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
+ 'of', 'them.', 'Thanks', '.']
+
+
+ :param text: A string with a sentence or sentences.
+ :type text: str
+ :param convert_parentheses: if True, replace parentheses to PTB symbols,
+ e.g. `(` to `-LRB-`. Defaults to False.
+ :type convert_parentheses: bool, optional
+ :param return_str: If True, return tokens as space-separated string,
+ defaults to False.
+ :type return_str: bool, optional
+ :return: List of tokens from `text`.
+ :rtype: List[str]
+ """
+ if return_str:
+ warnings.warn(
+ "Parameter 'return_str' has been deprecated and should no "
+ "longer be used.",
+ category=DeprecationWarning,
+ stacklevel=2,
+ )
+
+ for regexp, substitution in self.STARTING_QUOTES:
+ text = regexp.sub(substitution, text)
+
+ for regexp, substitution in self.PUNCTUATION:
+ text = regexp.sub(substitution, text)
+
+ # Handles parentheses.
+ regexp, substitution = self.PARENS_BRACKETS
+ text = regexp.sub(substitution, text)
+ # Optionally convert parentheses
+ if convert_parentheses:
+ for regexp, substitution in self.CONVERT_PARENTHESES:
+ text = regexp.sub(substitution, text)
+
+ # Handles double dash.
+ regexp, substitution = self.DOUBLE_DASHES
+ text = regexp.sub(substitution, text)
+
+ # add extra space to make things easier
+ text = " " + text + " "
+
+ for regexp, substitution in self.ENDING_QUOTES:
+ text = regexp.sub(substitution, text)
+
+ for regexp in self.CONTRACTIONS2:
+ text = regexp.sub(r" \1 \2 ", text)
+ for regexp in self.CONTRACTIONS3:
+ text = regexp.sub(r" \1 \2 ", text)
+
+ # We are not using CONTRACTIONS4 since
+ # they are also commented out in the SED scripts
+ # for regexp in self._contractions.CONTRACTIONS4:
+ # text = regexp.sub(r' \1 \2 \3 ', text)
+
+ return text.split()
+
+ def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
+ r"""
+ Returns the spans of the tokens in ``text``.
+ Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.
+
+ >>> from nltk.tokenize import NLTKWordTokenizer
+ >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
+ >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+ ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+ ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+ ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
+ >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
+ True
+ >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+ ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
+ ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
+ >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
+ True
+
+ :param text: A string with a sentence or sentences.
+ :type text: str
+ :yield: Tuple[int, int]
+ """
+ raw_tokens = self.tokenize(text)
+
+ # Convert converted quotes back to original double quotes
+ # Do this only if original text contains double quote(s) or double
+ # single-quotes (because '' might be transformed to `` if it is
+ # treated as starting quotes).
+ if ('"' in text) or ("''" in text):
+ # Find double quotes and converted quotes
+ matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
+
+ # Replace converted quotes back to double quotes
+ tokens = [
+ matched.pop(0) if tok in ['"', "``", "''"] else tok
+ for tok in raw_tokens
+ ]
+ else:
+ tokens = raw_tokens
+
+ yield from align_tokens(tokens, text)
+
+ # Standard word tokenizer.
+ _treebank_word_tokenizer = NLTKWordTokenizer()
+ def word_tokenize2(text, preserve_line=False):
+ """
+ Return a tokenized copy of *text*,
+ using NLTK's recommended word tokenizer
+ (currently an improved :class:`.TreebankWordTokenizer`
+ along with :class:`.PunktSentenceTokenizer`
+ for the specified language).
+
+ :param text: text to split into words
+ :type text: str
+ :param language: the model name in the Punkt corpus
+ :type language: str
+ :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
+ :type preserve_line: bool
+ """
+ sentences = [text] if preserve_line else sent_tokenize(text)
+ return [
+ token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
+ ]
+
  def pad_sequence(
  sequence,
  n,
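Note: this hunk vendors NLTK's TokenizerI/NLTKWordTokenizer classes and wraps them in a new word_tokenize2() function, so English tokenization no longer depends on the NLPIR backend used by word_tokenize(). The copied code still references helpers such as overridden(), align_tokens(), warnings and sent_tokenize(), which are not added in this hunk and are assumed to exist elsewhere in the module. A minimal usage sketch of the new function:

    from PgsFile import word_tokenize2

    tokens = word_tokenize2("Good muffins cost $3.88 in New York. Please buy me two of them.")
    # tokens is a list of word and punctuation strings, e.g. ['Good', 'muffins', 'cost', '$', '3.88', ...]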
@@ -889,9 +1194,7 @@ def pad_sequence(
  sequence=chain(sequence, (right_pad_symbol,) * (n - 1))
  return sequence

-
  # add a flag to pad the sequence so we get peripheral ngrams?
-
  def ngrams(
  sequence,
  n,
@@ -954,7 +1257,6 @@ def ngrams(
  yield tuple(history)
  del history[0]

-
  def bigrams(sequence, **kwargs):
  """
  Return the bigrams generated from a sequence of items, as an iterator.
@@ -974,7 +1276,6 @@ def bigrams(sequence, **kwargs):
  for item in ngrams(sequence, 2, **kwargs):
  yield item

-
  def trigrams(sequence, **kwargs):
  """
  Return the trigrams generated from a sequence of items, as an iterator.
@@ -994,7 +1295,6 @@ def trigrams(sequence, **kwargs):
  for item in ngrams(sequence, 3, **kwargs):
  yield item

-
  def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
  """
  Returns all possible ngrams generated from a sequence of items, as an iterator.
@@ -1148,6 +1448,18 @@ def uninstall_package(package_name: str):
  import pip
  pip.main(['uninstall', package_name, '-y'])

+ # A list of conda configuration commands.
+ conda_mirror_commands=[
+ "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple", # Windows recommended
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/", # MacOS recommended
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
+ "conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/",
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/fastai/",
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/",
+ "conda config --append channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/",
+ "pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/"
+ ]
+
  def DirList(root_dir: str) -> tuple:
  """
  List the contents of a directory and return two lists containing the names of the directories and files in the directory.
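Note: conda_mirror_commands is a plain list of shell command strings for switching pip/conda to the TUNA and Aliyun mirrors; nothing is executed on import. A sketch of how it could be combined with the package's run_command() helper:

    from PgsFile import conda_mirror_commands, run_command

    for cmd in conda_mirror_commands:
        print(cmd)                          # inspect the mirror-configuration commands
    # run_command(conda_mirror_commands[0])   # optionally execute one of them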
@@ -1251,10 +1563,8 @@ def sort_strings_with_embedded_numbers(strings: list) -> list:
  """
  # Sort the strings using the extract_numbers() function as the key
  sorted_strings=sorted(strings, key=extract_numbers)
-
  return sorted_strings

-
  def run_command(command: str) -> str:
  """
  Run a command and return its output as a string.
@@ -1444,7 +1754,6 @@ class PGScraper(object):
  return all_want_list


-

  # -*- coding: utf-8 -*-
  """
@@ -1490,7 +1799,6 @@ class ProgressData(object):
  self.size, self.unit, progress, speed, self.unit))
  print('%50s'%('/'*int((1-progress)*50)))

-
  def levenshtein_distance(s, t):
  m, n=len(s), len(t)
  if m < n:
@@ -1516,7 +1824,7 @@ pgs_abbres_words=['A.B.','A.D.','A.G.','A.I.','A.M.','A.P.','A.V.','AFP.','Ala.'
  def clean_text(text): #清洗除了句号以外的其他标点符号问题
  # 在标点符号右边邻接单词前添加空格
  import re
- text=replace_chinese_punctuation_with_english(text)
+ # text=replace_chinese_punctuation_with_english(text)
  text=re.sub(r'(?<=[\?\!\,\;\:\)\]\}])\s*(?=\w)', ' ', text)
  # 删除标点符号与左边单词之间的空格
  text=re.sub(r'\s*([\?\!\,\;\:\)\]\}\>])', r'\1', text)
@@ -1532,24 +1840,45 @@ def clean_text(text): #清洗除了句号以外的其他标点符号问题

  def clean_text_with_abbreviations(text):
  import re
- text=clean_text(text)
- matches=[]
- for seg in text.split():
+
+ # 按行分割文本
+ lines = text.splitlines()
+
+ # 清洗每一行
+ cleaned_lines = []
+ for line in lines:
+ cleaned_line = clean_line_with_abbreviations(line)
+ cleaned_lines.append(cleaned_line)
+
+ # 将清洗后的行重新组合成文本
+ cleaned_text = '\n'.join(cleaned_lines)
+ return cleaned_text
+
+ def clean_line_with_abbreviations(line):
+ import re
+
+ # 清洗除了句号以外的其他标点符号问题
+ line = clean_text(line)
+
+ matches = []
+ for seg in line.split():
  if "." in seg:
- if seg.endswith(".") is False:
+ if not seg.endswith("."):
  matches.append(seg)
  elif seg.endswith("..") and "..." not in seg:
- text=text.replace("..", ".")
-
+ line = line.replace("..", ".")
+
  for match in matches:
  if any(word in match for word in pgs_abbres_words):
- inter=match.split(".")
- new_match="".join([w+"." for w in inter[0:-1]])+" "+inter[-1]
- text=text.replace(match, new_match)
+ inter = match.split(".")
+ new_match = "".join([w + "." for w in inter[0:-1]]) + " " + inter[-1]
+ line = line.replace(match, new_match)
  else:
- text=text.replace(match, match.replace(".",". "))
- text=re.sub(r'\s+\.', '.', text)
- return text
+ line = line.replace(match, match.replace(".", ". "))
+
+ line = re.sub(r'\s+\.', '.', line)
+ return line
+

  import shutil
  def move_file(source_file, destination_folder, new_file_name=None):
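Note: clean_text_with_abbreviations() is now a thin driver that splits the input on line breaks, cleans each line with the new clean_line_with_abbreviations(), and rejoins with '\n', so paragraph boundaries survive the cleaning. A short sketch of the call (the sample text is made up):

    from PgsFile import clean_text_with_abbreviations

    raw = "Mr.Smith met Dr.Brown at 5 p.m.\nThey talked for an hour."
    print(clean_text_with_abbreviations(raw))   # each line is cleaned independently; the line break is kept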
@@ -1597,7 +1926,6 @@ def copy_file(source_file, destination_folder, new_file_name=None):

  # Copy the file to the destination folder
  shutil.copy2(source_file, destination_file)
-

  def check_empty_cells(file_path):
  """
@@ -1636,7 +1964,6 @@ def makefile(file_path):
  else:
  write_to_txt(file_path, "")

-
  def save_dict_to_excel(data, output_file, headers=None):
  """
  Save Python dictionary data into an Excel .xlsx file with custom headers.
@@ -1845,4 +2172,57 @@ def get_stopwords(language=None):
  return en_stopwords
  else:
  lang_stopwords=get_data_lines(find_txt_files_with_keyword(stopwords_path, language)[0])
- return lang_stopwords
+ return lang_stopwords
+
+ from PIL import Image
+ def replace_white_with_transparency(input_path, output_path):
+ """
+ This function opens an image, replaces all white pixels with transparent pixels.
+
+ Parameters:
+ input_path (str): The path to the input image file.
+ output_path (str): The path to save the output image file.
+ """
+ # 从RGB(24位)模式转成RGBA(32位)模式
+ img = Image.open(input_path).convert('RGBA')
+ W, L = img.size
+ white_pixel = (0, 0, 0, 0) # white
+ for h in range(W):
+ for i in range(L):
+ if img.getpixel((h, i)) == white_pixel:
+ img.putpixel((h, i), (255, 255, 255, 0)) # make it transparent
+ img.save(output_path)
+
+ def get_font_path(font_name=None):
+ '''
+ Retrieves the file path of a specified font.
+
+ Parameters
+ ----------
+ font_name : str, optional
+ The name of the font file (must end with ".ttf"). If provided, it should match one of the available fonts in the library, such as:
+ - 'DejaVuSans.ttf'
+ - '书体坊赵九江钢笔行书体.ttf'
+ - '全新硬笔楷书简.ttf'
+ - '全新硬笔行书简.ttf'
+ - '博洋行书3500.TTF'
+ - '陆柬之行书字体.ttf'
+ The default is None, which will return the path for 'DejaVuSans.ttf'.
+
+ Returns
+ -------
+ font_path : str
+ The full file path of the specified font. If no font name is provided, the default path for 'DejaVuSans.ttf' will be returned.
+ Example: "C:/Windows/Fonts/simhei.ttf"
+ '''
+
+ font_folder = get_library_location("PgsFile") + "/PgsFile/models/fonts"
+ if font_name is None:
+ font_path = get_full_path(font_folder, "DejaVuSans.ttf")
+ else:
+ font_path = get_full_path(font_folder, font_name)
+ return font_path
+
+ simhei_default_font_path_MacOS_Windows=["/System/Library/Fonts/STHeiti Medium.ttc",
+ r"C:\Windows\Fonts\simhei.ttf", # Use a font that supports Chinese characters
+ ]
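Note: the new visualization helpers resolve bundled font files under PgsFile/models/fonts and post-process images with Pillow. A small usage sketch (logo.png / logo_t.png are hypothetical files):

    from PgsFile import get_font_path, replace_white_with_transparency

    default_font = get_font_path()                      # .../PgsFile/models/fonts/DejaVuSans.ttf
    brush_font = get_font_path("陆柬之行书字体.ttf")      # one of the bundled Chinese handwriting fonts
    replace_white_with_transparency("logo.png", "logo_t.png")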
PgsFile/__init__.py CHANGED
@@ -7,6 +7,7 @@ from .PgsFile import headers, encode_chinese_keyword_for_url
  from .PgsFile import install_package, uninstall_package
  from .PgsFile import run_script, run_command
  from .PgsFile import get_library_location
+ from .PgsFile import conda_mirror_commands

  # 3. Text data retrieval
  from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
@@ -19,7 +20,7 @@ from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_

  # 5. File/folder process
  from .PgsFile import FilePath, FileName, DirList
- from .PgsFile import get_subfolder_path
+ from .PgsFile import get_subfolder_path, get_full_path
  from .PgsFile import makedirec, makefile
  from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
  from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file, copy_file
@@ -32,18 +33,23 @@ from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, L
  from .PgsFile import check_contain_chinese, check_contain_number
  from .PgsFile import replace_chinese_punctuation_with_english
  from .PgsFile import replace_english_punctuation_with_chinese
- from .PgsFile import clean_list, clean_text_with_abbreviations
+ from .PgsFile import clean_list, clean_text, clean_text_with_abbreviations, clean_line_with_abbreviations
  from .PgsFile import extract_chinese_punctuation, generate_password, sort_strings_with_embedded_numbers

  # 7. NLP (natural language processing)
  from .PgsFile import strQ2B_raw, strQ2B_words
  from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
  from .PgsFile import word_list, batch_word_list
- from .PgsFile import cs, cs1, sent_tokenize, word_tokenize
+ from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2

  # 8. Maths
  from .PgsFile import len_rows, check_empty_cells
  from .PgsFile import format_float, decimal_to_percent, Percentage
  from .PgsFile import get_text_length_kb, extract_numbers

+ # 9. Visualization
+ from .PgsFile import replace_white_with_transparency
+ from .PgsFile import simhei_default_font_path_MacOS_Windows
+ from .PgsFile import get_font_path
+
  name = "PgsFile"
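Note: with the __init__.py changes above, the new helpers are re-exported at package level, so after upgrading they can be imported directly (a sketch):

    from PgsFile import (get_full_path, word_tokenize2, conda_mirror_commands,
                         clean_line_with_abbreviations, get_font_path,
                         replace_white_with_transparency, simhei_default_font_path_MacOS_Windows)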
Binary file
Binary file
Binary file
PgsFile/models/model_reviews_ReadMe.txt ADDED
@@ -0,0 +1,134 @@
+ model_1.0.bin ['samples: 30', 'precision: 0.7666666666666667', 'recall: 0.696969696969697', 'F1: 0.7301587301587302']
+ model_1.2.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
+ model_1.4.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
+ model_1.5.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
+ model_1.6.bin ['samples: 30', 'precision: 0.9', 'recall: 0.8181818181818182', 'F1: 0.8571428571428572']
+ model_1.7.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.7878787878787878', 'F1: 0.8253968253968254']
+ model_1.8.bin ['samples: 30', 'precision: 0.8', 'recall: 0.7272727272727273', 'F1: 0.761904761904762']
+ model_1.9.bin ['samples: 30', 'precision: 0.8', 'recall: 0.7272727272727273', 'F1: 0.761904761904762']
+ model_2.0.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.7575757575757576', 'F1: 0.7936507936507938']
+ model_2.1.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.7878787878787878', 'F1: 0.8253968253968254']
+
+
+ model_1.0.bin ['samples: 292', 'precision: 0.5787671232876712', 'recall: 0.48011363636363635', 'F1: 0.5248447204968945']
+ model_1.2.bin ['samples: 292', 'precision: 0.636986301369863', 'recall: 0.5284090909090909', 'F1: 0.577639751552795']
+ model_1.4.bin ['samples: 292', 'precision: 0.7191780821917808', 'recall: 0.5965909090909091', 'F1: 0.6521739130434782']
+ model_1.5.bin ['samples: 292', 'precision: 0.6815068493150684', 'recall: 0.5653409090909091', 'F1: 0.6180124223602484']
+ model_1.6.bin ['samples: 292', 'precision: 0.726027397260274', 'recall: 0.6022727272727273', 'F1: 0.6583850931677019']
+ model_1.7.bin ['samples: 292', 'precision: 0.7363013698630136', 'recall: 0.6107954545454546', 'F1: 0.6677018633540373']
+ model_1.8.bin ['samples: 292', 'precision: 0.7431506849315068', 'recall: 0.6164772727272727', 'F1: 0.6739130434782609']
+ model_1.9.bin ['samples: 292', 'precision: 0.7773972602739726', 'recall: 0.6448863636363636', 'F1: 0.7049689440993789']
+ model_2.0.bin ['samples: 292', 'precision: 0.7636986301369864', 'recall: 0.6335227272727273', 'F1: 0.6925465838509317']
+ model_2.1.bin ['samples: 292', 'precision: 0.7671232876712328', 'recall: 0.6363636363636364', 'F1: 0.6956521739130435']
+
+
+ model_1.0.bin ['samples: 322', 'precision: 0.5962732919254659', 'recall: 0.4987012987012987', 'F1: 0.5431400282885432']
+ model_1.2.bin ['samples: 322', 'precision: 0.65527950310559', 'recall: 0.548051948051948', 'F1: 0.5968882602545968']
+ model_1.4.bin ['samples: 322', 'precision: 0.7267080745341615', 'recall: 0.6077922077922078', 'F1: 0.6619519094766619']
+ model_1.5.bin ['samples: 322', 'precision: 0.6956521739130435', 'recall: 0.5818181818181818', 'F1: 0.6336633663366337']
+ model_1.6.bin ['samples: 322', 'precision: 0.7422360248447205', 'recall: 0.6207792207792208', 'F1: 0.6760961810466761']
+ model_1.7.bin ['samples: 322', 'precision: 0.7484472049689441', 'recall: 0.625974025974026', 'F1: 0.6817538896746819']
+ model_1.8.bin ['samples: 322', 'precision: 0.7484472049689441', 'recall: 0.625974025974026', 'F1: 0.6817538896746819']
+ model_1.9.bin ['samples: 322', 'precision: 0.7795031055900621', 'recall: 0.6519480519480519', 'F1: 0.71004243281471']
+ model_2.0.bin ['samples: 322', 'precision: 0.7701863354037267', 'recall: 0.6441558441558441', 'F1: 0.7015558698727016']
+ model_2.1.bin ['samples: 322', 'precision: 0.7763975155279503', 'recall: 0.6493506493506493', 'F1: 0.7072135785007072']
+
+
+ =========================================================非重复验证集==================================================
+
+ model_1.2.bin ['samples: 303', 'precision: 0.6435643564356436', 'recall: 0.5342465753424658', 'F1: 0.5838323353293414']
+ model_1.4.bin ['samples: 303', 'precision: 0.7161716171617162', 'recall: 0.5945205479452055', 'F1: 0.6497005988023953']
+ model_1.5.bin ['samples: 303', 'precision: 0.6864686468646864', 'recall: 0.5698630136986301', 'F1: 0.6227544910179641']
+ model_1.6.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.6082191780821918', 'F1: 0.6646706586826348']
+ model_1.7.bin ['samples: 303', 'precision: 0.7425742574257426', 'recall: 0.6164383561643836', 'F1: 0.6736526946107784']
+ model_1.8.bin ['samples: 303', 'precision: 0.7392739273927392', 'recall: 0.6136986301369863', 'F1: 0.6706586826347306']
+ model_1.9.bin ['samples: 303', 'precision: 0.7722772277227723', 'recall: 0.6410958904109589', 'F1: 0.7005988023952096']
+ model_2.0.bin ['samples: 303', 'precision: 0.759075907590759', 'recall: 0.6301369863013698', 'F1: 0.688622754491018']
+ model_2.1.bin ['samples: 303', 'precision: 0.7623762376237624', 'recall: 0.6328767123287671', 'F1: 0.6916167664670658']
+ model_2.2.bin ['samples: 303', 'precision: 0.7458745874587459', 'recall: 0.6191780821917808', 'F1: 0.6766467065868264']
+
+ =================================================非重复验证集+5分标签==================================================
+
+ model_1.2.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
+ model_1.4.bin ['samples: 30', 'precision: 0.8666666666666667', 'recall: 0.8125', 'F1: 0.8387096774193549']
+ model_1.5.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
+ model_1.6.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
+ model_1.7.bin ['samples: 30', 'precision: 0.8', 'recall: 0.75', 'F1: 0.7741935483870969']
+ model_1.8.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
+ model_1.9.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
+ model_2.0.bin ['samples: 30', 'precision: 0.8333333333333334', 'recall: 0.78125', 'F1: 0.8064516129032259']
+ model_2.1.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
+ model_2.2.bin ['samples: 30', 'precision: 0.9', 'recall: 0.84375', 'F1: 0.870967741935484']
+
+
+ model_1.2.bin ['samples: 302', 'precision: 0.6721854304635762', 'recall: 0.6444444444444445', 'F1: 0.6580226904376014']
+ model_1.4.bin ['samples: 302', 'precision: 0.7019867549668874', 'recall: 0.6730158730158731', 'F1: 0.6871961102106969']
+ model_1.5.bin ['samples: 302', 'precision: 0.7185430463576159', 'recall: 0.6888888888888889', 'F1: 0.7034035656401946']
+ model_1.6.bin ['samples: 302', 'precision: 0.7086092715231788', 'recall: 0.6793650793650794', 'F1: 0.6936790923824959']
+ model_1.7.bin ['samples: 302', 'precision: 0.7052980132450332', 'recall: 0.6761904761904762', 'F1: 0.6904376012965965']
+ model_1.8.bin ['samples: 302', 'precision: 0.7317880794701986', 'recall: 0.7015873015873015', 'F1: 0.7163695299837927']
+ model_1.9.bin ['samples: 302', 'precision: 0.7317880794701986', 'recall: 0.7015873015873015', 'F1: 0.7163695299837927']
+ model_2.0.bin ['samples: 302', 'precision: 0.7417218543046358', 'recall: 0.7111111111111111', 'F1: 0.7260940032414911']
+ model_2.1.bin ['samples: 302', 'precision: 0.7516556291390728', 'recall: 0.7206349206349206', 'F1: 0.7358184764991895']
+ model_2.2.bin ['samples: 302', 'precision: 0.7582781456953642', 'recall: 0.726984126984127', 'F1: 0.7423014586709886']
+
+
+ model_1.2.bin ['samples: 303', 'precision: 0.6732673267326733', 'recall: 0.6455696202531646', 'F1: 0.6591276252019386']
+ model_1.4.bin ['samples: 303', 'precision: 0.7029702970297029', 'recall: 0.6740506329113924', 'F1: 0.6882067851373183']
+ model_1.5.bin ['samples: 303', 'precision: 0.7194719471947195', 'recall: 0.689873417721519', 'F1: 0.7043618739903069']
+ model_1.6.bin ['samples: 303', 'precision: 0.7095709570957096', 'recall: 0.680379746835443', 'F1: 0.6946688206785137']
+ model_1.7.bin ['samples: 303', 'precision: 0.7062706270627063', 'recall: 0.6772151898734177', 'F1: 0.6914378029079159']
+ model_1.8.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.7025316455696202', 'F1: 0.7172859450726979']
+ model_1.9.bin ['samples: 303', 'precision: 0.7326732673267327', 'recall: 0.7025316455696202', 'F1: 0.7172859450726979']
+ model_2.0.bin ['samples: 303', 'precision: 0.7425742574257426', 'recall: 0.7120253164556962', 'F1: 0.7269789983844911']
+ model_2.1.bin ['samples: 303', 'precision: 0.7524752475247525', 'recall: 0.7215189873417721', 'F1: 0.7366720516962842']
+ model_2.2.bin ['samples: 303', 'precision: 0.759075907590759', 'recall: 0.7278481012658228', 'F1: 0.7431340872374799']
+
+
+ model_1.2.bin ['samples: 425', 'precision: 0.6470588235294118', 'recall: 0.5456349206349206', 'F1: 0.5920344456404736']
+ model_1.2.bin ['samples: 425', 'precision: 0.691764705882353', 'recall: 0.6621621621621622', 'F1: 0.6766398158803222']
+ model_1.4.bin ['samples: 425', 'precision: 0.7129411764705882', 'recall: 0.6824324324324325', 'F1: 0.6973532796317606']
+ model_1.5.bin ['samples: 425', 'precision: 0.7294117647058823', 'recall: 0.6981981981981982', 'F1: 0.713463751438435']
+ model_1.6.bin ['samples: 425', 'precision: 0.7129411764705882', 'recall: 0.6824324324324325', 'F1: 0.6973532796317606']
+ model_1.7.bin ['samples: 425', 'precision: 0.7105882352941176', 'recall: 0.6801801801801802', 'F1: 0.6950517836593786']
+ model_1.8.bin ['samples: 425', 'precision: 0.7505882352941177', 'recall: 0.7184684684684685', 'F1: 0.7341772151898734']
+ model_1.9.bin ['samples: 425', 'precision: 0.7529411764705882', 'recall: 0.7207207207207207', 'F1: 0.7364787111622554']
+ model_2.0.bin ['samples: 425', 'precision: 0.7670588235294118', 'recall: 0.7342342342342343', 'F1: 0.7502876869965478']
+ model_2.1.bin ['samples: 425', 'precision: 0.7717647058823529', 'recall: 0.7387387387387387', 'F1: 0.7548906789413118']
+ model_2.2.bin ['samples: 425', 'precision: 0.7764705882352941', 'recall: 0.7432432432432432', 'F1: 0.7594936708860759']
+
+ model_1.2.bin ['samples: 447', 'precision: 0.6935123042505593', 'recall: 0.6623931623931624', 'F1: 0.6775956284153005']
+ model_1.4.bin ['samples: 447', 'precision: 0.7158836689038032', 'recall: 0.6837606837606838', 'F1: 0.6994535519125684']
+ model_1.5.bin ['samples: 447', 'precision: 0.7337807606263982', 'recall: 0.7008547008547008', 'F1: 0.7169398907103826']
+ model_1.6.bin ['samples: 447', 'precision: 0.7203579418344519', 'recall: 0.688034188034188', 'F1: 0.7038251366120218']
+ model_1.7.bin ['samples: 447', 'precision: 0.7158836689038032', 'recall: 0.6837606837606838', 'F1: 0.6994535519125684']
+ model_1.8.bin ['samples: 447', 'precision: 0.7539149888143176', 'recall: 0.7200854700854701', 'F1: 0.7366120218579234']
+ model_1.9.bin ['samples: 447', 'precision: 0.7539149888143176', 'recall: 0.7200854700854701', 'F1: 0.7366120218579234']
+ model_2.0.bin ['samples: 447', 'precision: 0.7695749440715883', 'recall: 0.7350427350427351', 'F1: 0.7519125683060108']
+ model_2.1.bin ['samples: 447', 'precision: 0.7718120805369127', 'recall: 0.7371794871794872', 'F1: 0.7540983606557377']
+ model_2.2.bin ['samples: 447', 'precision: 0.7785234899328859', 'recall: 0.7435897435897436', 'F1: 0.760655737704918']
+
+ model_1.2.bin
+ model_1.4.bin
+ model_1.5.bin
+ model_1.6.bin
+ model_1.7.bin
+ model_1.8.bin
+ model_1.9.bin
+ model_2.0.bin
+ model_2.1.bin
+ model_2.2.bin
+
+ model_1.2.bin
+ model_1.4.bin
+ model_1.5.bin
+ model_1.6.bin
+ model_1.7.bin
+ model_1.8.bin
+ model_1.9.bin
+ model_2.0.bin
+ model_2.1.bin
+ model_2.2.bin
+
+
+
PgsFile-0.2.4.dist-info/METADATA ADDED
@@ -0,0 +1,41 @@
+ Metadata-Version: 2.1
+ Name: PgsFile
+ Version: 0.2.4
+ Summary: This module streamlines Python package management, script execution, file handling, web scraping, multimedia downloads, data cleaning, and NLP tasks such as word tokenization and POS tagging. It also assists with generating word lists and plotting data, making these tasks more accessible and convenient for literary students. Whether you need to scrape data from websites, clean text, or analyze language, this module provides user-friendly tools to simplify your workflow.
+ Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
+ Author: Pan Guisheng
+ Author-email: 895284504@qq.com
+ License: Educational free
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: Free For Educational Use
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: chardet
+ Requires-Dist: pandas
+ Requires-Dist: python-docx
+ Requires-Dist: pip
+ Requires-Dist: requests
+ Requires-Dist: fake-useragent
+ Requires-Dist: lxml
+ Requires-Dist: pimht
+ Requires-Dist: pysbd
+ Requires-Dist: nlpir-python
+ Requires-Dist: pillow
+
+ Purpose: This module is designed to make complex tasks accessible and convenient, even for beginners. By providing a unified set of tools, it simplifies the workflow for data collection, processing, and analysis. Whether you're scraping data from the web, cleaning text, or performing NLP tasks, this module ensures you can focus on your research without getting bogged down by technical challenges.
+
+ Key Features:
+ 1. Web Scraping: Easily scrape data from websites and download multimedia content.
+ 2. Package Management: Install, uninstall, and manage Python packages with simple commands.
+ 3. Data Retrieval: Extract data from various file formats like text, JSON, TSV, Excel, and HTML (both online and offline).
+ 4. Data Storage: Write and append data to text files, Excel, JSON, and JSON lines.
+ 5. File and Folder Processing: Manage file paths, create directories, move or copy files, and search for files with specific keywords.
+ 6. Data Cleaning: Clean text, handle punctuation, remove stopwords, and prepare data for analysis.
+ 7. NLP: Perform tokenization, generate n-grams, and create word lists for text analysis.
+ 8. Math Operations: Format numbers, convert decimals to percentages, and validate data.
+ 9. Visualization: Process images (e.g., make white pixels transparent) and manage fonts for rendering text.
+
+ Author: Pan Guisheng, a PhD student at the Graduate Institute of Interpretation and Translation of Shanghai International Studies University
+ E-mail: 895284504@qq.com
{PgsFile-0.2.3.dist-info → PgsFile-0.2.4.dist-info}/RECORD RENAMED
@@ -1,5 +1,5 @@
- PgsFile/PgsFile.py,sha256=MpXQK6MLMBh1JMAcBw5sRiRof--x4OyARcCsWwn7Z4A,85828
- PgsFile/__init__.py,sha256=E4VfPu1BxCBcZ5WXi5E6faPaNt_Shpvgh9LvBlg7eA0,2389
+ PgsFile/PgsFile.py,sha256=V_Pnn5hljeR9xYQ8hyUAmf92140N4ORoQOe-cdBHJos,101212
+ PgsFile/__init__.py,sha256=C-uX4tN3J3L5Zr6r8qQx0zwNaG0UXTowxK_K0pd0Jt4,2680
  PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2600,6 +2600,7 @@ PgsFile/Corpora/Stopwords/turkish.txt,sha256=uGUvjEm2GR8PuVY_JeHNxhD7cWlNlF7vc3V
  PgsFile/Corpora/Stopwords/ukrainian.txt,sha256=fEzWLTwnWJriILkO-5jSfE2SpqY-GPf_kR4zid3MFUI,4131
  PgsFile/Corpora/Stopwords/vietnamese.txt,sha256=88yRtVMaRSFqas1iGGa6kOGDCZTgtzRPmR3q9dHshdc,20485
  PgsFile/Corpora/Terminology/Chinese_Thought.json,sha256=CdkuF2wLaDC5V3sRefcU1RZwXm4-wTZ-Qfk8r7gsu8I,2301866
+ PgsFile/models/NLPIR.user,sha256=DykLJdr8_cVHrdCnDJES1O5dgmnYqfaSO1_dtAVKYJk,3356
  PgsFile/models/czech.pickle,sha256=W6c9KTx9eVOVa88C82lexcHw1Sfyo8OAl_VZM5T6FpA,1265552
  PgsFile/models/danish.pickle,sha256=6il2CgqRl_UspZ54rq_FpvVdBSWPr32xcJsrnrMh7yA,1264725
  PgsFile/models/dutch.pickle,sha256=So4ms9aMRcOOWU0Z4tVndEe_3KpjbTsees_tDpJy1zw,742624
@@ -2611,6 +2612,8 @@ PgsFile/models/german.pickle,sha256=6rSX-ghUExMMj9D7E7kpEokwr-L2om6ocVyV33CI6Xw,
  PgsFile/models/greek.pickle,sha256=IXUqZ2L61c_kb7XEX62ahUhKDo6Bxn5q9vuXPPwn1nw,1953106
  PgsFile/models/italian.pickle,sha256=3LJxfXvl8m6GCpLgWs9psRI6X0UnzXommpq56eZoyAU,658331
  PgsFile/models/malayalam.pickle,sha256=H4z1isvbf0cqxAr_wTZjvkLa-0fBUDDBGt4ERMng5T0,221207
+ PgsFile/models/model_reviews2.2.bin,sha256=D6uL8KZIxD0rfWjH0kYEb7z_HE4aTJXpj82HzsCOpuk,1943196
+ PgsFile/models/model_reviews_ReadMe.txt,sha256=Q9uLJwudMmsTKfd11l1tOcIP8lwsemIwnAVJG_3SYjU,11433
  PgsFile/models/norwegian.pickle,sha256=5Kl_j5oDoDON10a8yJoK4PVK5DuDX6N9g-J54cp5T68,1259779
  PgsFile/models/polish.pickle,sha256=FhJ7bRCTNCej6Q-yDpvlPh-zcf95pzDBAwc07YC5DJI,2042451
  PgsFile/models/portuguese.pickle,sha256=uwG_fHmk6twheLvSCWZROaDks48tHET-8Jfek5VRQOA,649051
@@ -2619,8 +2622,14 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
  PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
  PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
  PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
- PgsFile-0.2.3.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
- PgsFile-0.2.3.dist-info/METADATA,sha256=a9KMN6LpC2raZYhWwrFhWCXKl7nWneiXT7KtvA74ruY,5070
- PgsFile-0.2.3.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
- PgsFile-0.2.3.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
- PgsFile-0.2.3.dist-info/RECORD,,
+ PgsFile/models/fonts/DejaVuSans.ttf,sha256=faGVp0xVvvmI0NSPlQi9XYSUJcF3Dbpde_xs6e2EiVQ,757076
+ PgsFile/models/fonts/书体坊赵九江钢笔行书体.ttf,sha256=fTOv4FFMnYtN1zCZghJ6-P1pzznA5qqoujwpDFY63Ek,3140656
+ PgsFile/models/fonts/全新硬笔楷书简.ttf,sha256=mPemGYMpgQxvFL1pFjjnyUMIprHzcoOaw8oeZQ4k1x0,2397296
+ PgsFile/models/fonts/全新硬笔行书简.ttf,sha256=bUtbl71eK_ellp1z0tCmmR_P-JhqVFIpzeuRlrEBo9g,2611516
+ PgsFile/models/fonts/博洋行书3500.TTF,sha256=VrgeHr8cgOL6JD05QyuD9ZSyw4J2aIVxKxW8zSajq6Q,4410732
+ PgsFile/models/fonts/陆柬之行书字体.ttf,sha256=Zpd4Z7E9w-Qy74yklXHk4vM7HOtHuQgllvygxZZ1Hvs,1247288
+ PgsFile-0.2.4.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
+ PgsFile-0.2.4.dist-info/METADATA,sha256=JC1a8Xrh3tDt5-HNnCJY3V4tNYKstu83V2qo_FqkATY,2711
+ PgsFile-0.2.4.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+ PgsFile-0.2.4.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
+ PgsFile-0.2.4.dist-info/RECORD,,
PgsFile-0.2.3.dist-info/METADATA DELETED
@@ -1,79 +0,0 @@
- Metadata-Version: 2.1
- Name: PgsFile
- Version: 0.2.3
- Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, NLP tasks like Chinese word tokenization and POS tagging, and word list generation for literary students, making it more accessible and convenient to use.
- Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
- Author: Pan Guisheng
- Author-email: 895284504@qq.com
- License: Educational free
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: Free For Educational Use
- Classifier: Operating System :: OS Independent
- Requires-Python: >=3.8
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: chardet
- Requires-Dist: pandas
- Requires-Dist: python-docx
- Requires-Dist: pip
- Requires-Dist: requests
- Requires-Dist: fake-useragent
- Requires-Dist: lxml
- Requires-Dist: pimht
- Requires-Dist: pysbd
- Requires-Dist: nlpir-python
-
- Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
-
-
- Function 1: Enables efficient data retrieval and storage in files with a single line of code.
-
- Function 2: Facilitates retrieval of all absolute file paths and file names in any folder (including sub-folders) with a single line of code using "FilePath" and "FileName" functions.
-
- Function 3: Simplifies creation of word lists and frequency sorting from a file or batch of files using "word_list" and "batch_word_list" functions in PgsFile.
-
- Function 4: Pgs-Corpora is a comprehensive language resource included in this library, featuring a monolingual corpus of native and translational Chinese and native and non-native English, as well as a bi-directional parallel corpus of Chinese and English texts covering financial, legal, political, academic, and sports news topics. Additionally, the library includes a collection of 8774 English idioms, stopwords for 28 languages, and a termbank of Chinese thought and culture.
-
- Function 5: This library provides support for common text cleaning tasks, such as removing empty text, empty lines, and folders containing empty text. It also offers functions for converting full-width characters to half-width characters and vice versa, as well as standardizing the format of Chinese and English punctuation. These features can help improve the quality and consistency of text data used in natural language processing tasks.
-
- Function 6: It also manages Python package installations and uninstallations, and allows running scripts and commands in Python interactive command lines instead of Windows command prompt.
-
- Function 7: Download audiovisual files like videos, images, and audio using audiovisual_downloader, which is extremely useful and efficient. Additionally, scrape newspaper data with PGScraper, a highly efficient tool for this purpose.
-
- Table 1: The directory and size of Pgs-Corpora
- ├── Idioms (1, 171.78 KB)
- ├── Monolingual (2197, 63.65 MB)
- │   ├── Chinese (456, 15.27 MB)
- │   │   ├── People's Daily 20130605 (396, 1.38 MB)
- │   │   │   ├── Raw (132, 261.73 KB)
- │   │   │   ├── Seg_only (132, 471.47 KB)
- │   │   │   └── Tagged (132, 675.30 KB)
- │   │   └── Translational Fictions (60, 13.89 MB)
- │   └── English (1741, 48.38 MB)
- │       ├── Native (65, 44.14 MB)
- │       │   ├── A Short Collection of British Fiction (27, 33.90 MB)
- │       │   └── Preschoolers- and Teenagers-oriented Texts in English (36, 10.24 MB)
- │       ├── Non-native (1675, 3.63 MB)
- │       │   └── Shanghai Daily (1675, 3.63 MB)
- │       │       └── Business_2019 (1675, 3.63 MB)
- │       │           ├── 2019-01-01 (1, 3.35 KB)
- │       │           ├── 2019-01-02 (1, 3.65 KB)
- │       │           ├── 2019-01-03 (7, 10.90 KB)
- │       │           ├── 2019-01-04 (5, 9.63 KB)
- │       │           └── 2019-01-07 (4, 9.50 KB)
- │       │           └── ... (and 245 more directories)
- │       └── Translational (1, 622.57 KB)
- ├── Parallel (371, 24.67 MB)
- │   ├── HK Financial and Legal EC Parallel Corpora (5, 19.17 MB)
- │   ├── New Year Address_CE_2006-2021 (15, 147.49 KB)
- │   ├── Sports News_CE_2010 (20, 66.42 KB)
- │   ├── TED_EC_2017-2020 (330, 5.24 MB)
- │   └── Xi's Speech_CE_2021 (1, 53.01 KB)
- ├── Stopwords (28, 88.09 KB)
- └── Terminology (1, 2.20 MB)
-
- ...
-
-
- Author: Pan Guisheng, a PhD student at the Graduate Institute of Interpretation and Translation of Shanghai International Studies University
- E-mail: 895284504@qq.com