PgsFile 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PgsFile/PgsFile.py +380 -57
- PgsFile/__init__.py +27 -11
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/METADATA +5 -4
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/RECORD +7 -7
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/WHEEL +1 -1
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/LICENSE +0 -0
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
@@ -68,7 +68,7 @@ def get_data_text(path):
 ----------
 path : TYPE string
 DESCRIPTION.
-Using path to get data from a single txt file. eg.
+Using path to get data from a single txt file. eg. raw_text.txt
 Theoretically, it supports all the text encoding formats, like utf-8, unicode, ansi, gbk etc.
 
 Returns
@@ -95,7 +95,7 @@ def get_data_lines(path):
 ----------
 path : TYPE string
 DESCRIPTION.
-Using path to get data from a single txt file. eg.
+Using path to get data from a single txt file. eg. raw_text.txt
 Theoretically, it supports all the text encoding formats, like utf-8, unicode, ansi, gbk etc.
 
 Returns
@@ -152,7 +152,7 @@ def get_data_excel(excel_path,column_id,sheet_name=None):
 Parameters
 ----------
 excel_path : TYPE
-DESCRIPTION.
+DESCRIPTION. data_python.xlsx
 
 column_id : TYPE Int 0,1,2,3
 DESCRIPTION. 0 means the first column, 1 means the second.
@@ -180,7 +180,7 @@ def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
 Parameters
 ----------
 excel_path : TYPE
-DESCRIPTION.
+DESCRIPTION. results.xlsx
 
 dic_of_list : TYPE
 DESCRIPTION. {"col":["a","b","c","d"],"freq":[1,2,3,4]}
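A minimal call matching the documented shapes, assuming the signature shown in the hunk header (the file name echoes the docstring's results.xlsx example):

from PgsFile import write_to_excel

data = {"col": ["a", "b", "c", "d"], "freq": [1, 2, 3, 4]}  # one column per dict key
write_to_excel("results.xlsx", data)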
@@ -233,7 +233,7 @@ def get_tsv_lines(csv_path, delimiter=None):
 '''
 Parameters
 ----------
-get_tsv_lines : TYPE
+get_tsv_lines : TYPE data.tsv
 DESCRIPTION.
 
 Returns
@@ -261,7 +261,7 @@ def get_data_json(json_path):
 '''
 Parameters
 ----------
-json_path : TYPE
+json_path : TYPE data.json
 DESCRIPTION.
 
 Returns
@@ -285,7 +285,7 @@ def get_json_lines(json_path):
 '''
 Parameters
 ----------
-json_path : TYPE
+json_path : TYPE data.json
 DESCRIPTION.
 
 Returns
@@ -308,7 +308,7 @@ def write_to_json(json_path,my_dic):
 Parameters
 ----------
 json_path : TYPE string
-DESCRIPTION.
+DESCRIPTION. data.json
 
 my_dic : TYPE dict or list
 DESCRIPTION.
@@ -332,7 +332,7 @@ def write_to_json_lines(json_path,my_json_data):
 Parameters
 ----------
 json_path : TYPE string
-DESCRIPTION.
+DESCRIPTION. data.json
 
 my_json_data : TYPE dict or list
 DESCRIPTION.
@@ -449,7 +449,7 @@ def get_subfolder_path(parent_folder, subfolder_name):
 BigPunctuation="""!"#$&\'()*+,-/:;<=>?@[\\]^_`{|}.%~"#$%&'?。()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。``''""" #除去英文标点.%
 StopTags="""◆: 、/ 。/ ---/ -/ --/ -- :/ ;/ ?/ ??/ ?┖ @/ [/ ]/ ^/ ‘/ ’/ "/ "/ 〈/ 〉/ 《/ 》/ 【/ 】/ >/ ∶/ ■/ ●/ ·/ …/ !/ #/ %,/ %/ \'/ (/ )/ */ +/ ,/ -/ // np v n w m a x t q j ni ns d i f u p g nz c r id s k h o e / #?/ --/""" #用来停用词性标注
 Special="""∶ ■ ● ① ② ③ × ℃ Ⅲ ④ ⑤ ◆ ⑥ ± ⑦ ⑧ → ⑨ ▲ ⑩ ─ ÷ μ γ β Ⅱ Ⅰ ‰ □ 〇 ○ Ⅴ Ⅳ ★ ﹐ ° ※ ︰ α ― ≠ █ о θ ω ⒈ ⒉ ⒊ н ≤ ì ǎ ≥ р т с к й а и Ⅵ é è ﹢ ﹝ ﹞ ā ⒋ ù π ◇ Ω Ф ы Я п К в у м ǒ ü á ǔ ⒌ ⒍ 䦆 Ⅹ Ⅶ ← """
-ZhStopWords="""——— 》), )÷(1- ”, )、 =( : → ℃ & * 一一 ~~~~ ’ . 『 .一 ./ -- 』 =″ 【 [*] }> [⑤]] [①D] c] ng昉 * // [ ] [②e] [②g] ={ } ,也 ‘ A [①⑥] [②B] [①a] [④a] [①③] [③h] ③] 1. -- [②b] ’‘ ××× [①⑧] 0:2 =[ [⑤b] [②c] [④b] [②③] [③a] [④c] [①⑤] [①⑦] [①g] ∈[ [①⑨] [①④] [①c] [②f] [②⑧] [②①] [①C] [③c] [③g] [②⑤] [②②] 一. [①h] .数 [] [①B] 数/ [①i] [③e] [①①] [④d] [④e] [③b] [⑤a] [①A] [②⑧] [②⑦] [①d] [②j] 〕〔 ][ :// ′∈ [②④ [⑤e] 12% b] ... ................... …………………………………………………③ ZXFITL [③F] 」 [①o] ]∧′=[ ∪φ∈ ′| {- ②c } [③①] R.L. [①E] Ψ -[*]- ↑ .日 [②d] [② [②⑦] [②②] [③e] [①i] [①B] [①h] [①d] [①g] [①②] [②a] f] [⑩] a] [①e] [②h] [②⑥] [③d] [②⑩] e] 〉 】 元/吨 [②⑩] 2.3% 5:0 [①] :: [②] [③] [④] [⑤] [⑥] [⑦] [⑧] [⑨] …… —— ? 、 。 “ ” 《 》 ! , : ; ? . , . ' ? · ——— ── ? — < > ( ) 〔 〕 [ ] ( ) - + ~ × / / ① ② ③ ④ ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ Ⅲ В " ; # @ γ μ φ φ. × Δ ■ ▲ sub exp sup sub Lex # % & ' + +ξ ++ - -β < <± <Δ <λ <φ
+ZhStopWords="""——— 》), )÷(1- ”, )、 =( : → ℃ & * 一一 ~~~~ ’ . 『 .一 ./ -- 』 =″ 【 [*] }> [⑤]] [①D] c] ng昉 * // [ ] [②e] [②g] ={ } ,也 ‘ A [①⑥] [②B] [①a] [④a] [①③] [③h] ③] 1. -- [②b] ’‘ ××× [①⑧] 0:2 =[ [⑤b] [②c] [④b] [②③] [③a] [④c] [①⑤] [①⑦] [①g] ∈[ [①⑨] [①④] [①c] [②f] [②⑧] [②①] [①C] [③c] [③g] [②⑤] [②②] 一. [①h] .数 [] [①B] 数/ [①i] [③e] [①①] [④d] [④e] [③b] [⑤a] [①A] [②⑧] [②⑦] [①d] [②j] 〕〔 ][ :// ′∈ [②④ [⑤e] 12% b] ... ................... …………………………………………………③ ZXFITL [③F] 」 [①o] ]∧′=[ ∪φ∈ ′| {- ②c } [③①] R.L. [①E] Ψ -[*]- ↑ .日 [②d] [② [②⑦] [②②] [③e] [①i] [①B] [①h] [①d] [①g] [①②] [②a] f] [⑩] a] [①e] [②h] [②⑥] [③d] [②⑩] e] 〉 】 元/吨 [②⑩] 2.3% 5:0 [①] :: [②] [③] [④] [⑤] [⑥] [⑦] [⑧] [⑨] …… —— ? 、 。 “ ” 《 》 ! , : ; ? . , . ' ? · ——— ── ? — < > ( ) 〔 〕 [ ] ( ) - + ~ × / / ① ② ③ ④ ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ Ⅲ В " ; # @ γ μ φ φ. × Δ ■ ▲ sub exp sup sub Lex # % & ' + +ξ ++ - -β < <± <Δ <λ <φ <<== =☆ =- > >λ _ ~± ~+ [⑤f] [⑤d] [②i] ≈ [②G] [①f] LI ㈧ [- ...... 〉 [③⑩] 第二 一番 一直 一个 一些 许多 种 有的是 也就是说 末##末 啊 阿 哎 哎呀 哎哟 唉 俺 俺们 按 按照 吧 吧哒 把 罢了 被 本 本着 比 比方 比如 鄙人 彼 彼此 边 别 别的 别说 并 并且 不比 不成 不单 不但 不独 不管 不光 不过 不仅 不拘 不论 不怕 不然 不如 不特 不惟 不问 不只 朝 朝着 趁 趁着 乘 冲 除 除此之外 除非 除了 此 此间 此外 从 从而 打 待 但 但是 当 当着 到 得 的 的话 等 等等 地 第 叮咚 对 对于 多 多少 而 而况 而且 而是 而外 而言 而已 尔后 反过来 反过来说 反之 非但 非徒 否则 嘎 嘎登 该 赶 个 各 各个 各位 各种 各自 给 根据 跟 故 故此 固然 关于 管 归 果然 果真 过 哈 哈哈 呵 和 何 何处 何况 何时 嘿 哼 哼唷 呼哧 乎 哗 还是 还有 换句话说 换言之 或 或是 或者 极了 及 及其 及至 即 即便 即或 即令 即若 即使 几 几时 己 既 既然 既是 继而 加之 假如 假若 假使 鉴于 将 较 较之 叫 接着 结果 借 紧接着 进而 尽 尽管 经 经过 就 就是 就是说 据 具体地说 具体说来 开始 开外 靠 咳 可 可见 可是 可以 况且 啦 来 来着 离 例如 哩 连 连同 两者 了 临 另 另外 另一方面 论 嘛 吗 慢说 漫说 冒 么 每 每当 们 莫若 某 某个 某些 拿 哪 哪边 哪儿 哪个 哪里 哪年 哪怕 哪天 哪些 哪样 那 那边 那儿 那个 那会儿 那里 那么 那么些 那么样 那时 那些 那样 乃 乃至 呢 能 你 你们 您 宁 宁可 宁肯 宁愿 哦 呕 啪达 旁人 呸 凭 凭借 其 其次 其二 其他 其它 其一 其余 其中 起 起见 起见 岂但 恰恰相反 前后 前者 且 然而 然后 然则 让 人家 任 任何 任凭 如 如此 如果 如何 如其 如若 如上所述 若 若非 若是 啥 上下 尚且 设若 设使 甚而 甚么 甚至 省得 时候 什么 什么样 使得 是 是的 首先 谁 谁知 顺 顺着 似的 虽 虽然 虽说 虽则 随 随着 所 所以 他 他们 他人 它 它们 她 她们 倘 倘或 倘然 倘若 倘使 腾 替 通过 同 同时 哇 万一 往 望 为 为何 为了 为什么 为着 喂 嗡嗡 我 我们 呜 呜呼 乌乎 无论 无宁 毋宁 嘻 吓 相对而言 像 向 向着 嘘 呀 焉 沿 沿着 要 要不 要不然 要不是 要么 要是 也 也罢 也好 一 一般 一旦 一方面 一来 一切 一样 一则 依 依照 矣 以 以便 以及 以免 以至 以至于 以致 抑或 因 因此 因而 因为 哟 用 由 由此可见 由于 有 有的 有关 有些 又 于 于是 于是乎 与 与此同时 与否 与其 越是 云云 哉 再说 再者 在 在下 咱 咱们 则 怎 怎么 怎么办 怎么样 怎样 咋 照 照着 者 这 这边 这儿 这个 这会儿 这就是说 这里 这么 这么点儿 这么些 这么样 这时 这些 这样 正如 吱 之 之类 之所以 之一 只是 只限 只要 只有 至 至于 诸位 着 着呢 自 自从 自个儿 自各儿 自己 自家 自身 综上所述 总的来看 总的来说 总的说来 总而言之 总之 纵 纵令 纵然 纵使 遵照 作为 兮 呃 呗 咚 咦 喏 啐 喔唷 嗬 嗯 嗳"""
 EnPunctuation="""!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~"""
 nltk_en_tags={'CC': '并列连词', 'CD': '基数词', 'DT': '限定符', 'EX': '存在词', 'FW': '外来词', 'IN': '介词或从属连词', 'JJ': '形容词', 'JJR': '比较级的形容词', 'JJS': '最高级的形容词', 'LS': '列表项标记', 'MD': '情态动词', 'NN': '名词单数', 'NNS': '名词复数', 'NNP': '专有名词', 'NNPS': '专有名词复数', 'PDT': '前置限定词', 'POS': '所有格结尾', 'PRP': '人称代词', 'PRP$': '所有格代词', 'RB': '副词', 'RBR': '副词比较级', 'RBS': '副词最高级', 'RP': '小品词', 'SYM': '符号', 'UH': '感叹词', 'VB': '动词原型', 'VBD': '动词过去式', 'VBG': '动名词或现在分词', 'VBN': '动词过去分词', 'VBP': '非第三人称单数的现在时', 'VBZ': '第三人称单数的现在时', 'WDT': '以wh开头的限定词', 'WP': '以wh开头的代词', 'WP$': '以wh开头的所有格代词', 'WRB': '以wh开头的副词', 'TO': 'to'}
 nltk_tag_mapping={'NN': 'Noun', 'NNS': 'Noun', 'NNP': 'Noun', 'NNPS': 'Noun', 'VB': 'Verb', 'VBD': 'Verb', 'VBG': 'Verb', 'VBN': 'Verb', 'VBP': 'Verb', 'VBZ': 'Verb', 'JJ': 'Adjective', 'JJR': 'Adjective', 'JJS': 'Adjective', 'RB': 'Adverb', 'RBR': 'Adverb', 'RBS': 'Adverb', 'IN': 'Preposition', 'PRP': 'Pronoun', 'PRP$': 'Pronoun', 'DT': 'Determiner', 'CC': 'Conjunction', 'CD': 'Numeral', 'UH': 'Interjection', 'FW': 'Foreign Word', 'TO': 'Particle', 'EX': 'Existential "there"', 'MD': 'Modal Auxiliary', 'WDT': 'Wh-determiner', 'WP': 'Wh-pronoun', 'WP$': 'Possessive wh-pronoun', 'WRB': 'Wh-adverb', 'SYM': 'Symbol', 'RP': 'Particle', 'POS': 'Possessive ending', 'PDT': 'Predeterminer', 'LS': 'List item marker', 'NIL': 'Missing tag'}
@@ -457,6 +457,8 @@ nltk_tag_mapping={'NN': 'Noun', 'NNS': 'Noun', 'NNP': 'Noun', 'NNPS': 'Noun', 'V
 ICTCLAS2008={'a': '形容词', 'ad': '副形词', 'ag': '形容词性语素', 'al': '形容词性惯用语', 'an': '名形词', 'b': '区别词', 'bl': '区别词性惯用语', 'c': '连词', 'cc': '并列连词', 'd': '副词', 'dg': '副词性语素', 'dl': '副词性惯用语', 'e': '叹词', 'ew': '句末标点', 'f': '方位词', 'h': '前缀', 'k': '后缀', 'm': '数词', 'mg': '数词性语素', 'mq': '数量词', 'n': '名词', 'ng': '名词性语素', 'nl': '名词性惯用语', 'nr': '汉语人名', 'nr1': '汉语姓氏', 'nr2': '汉语名字', 'nrf': '音译人名', 'nrj': '日语人名', 'ns': '地名', 'nsf': '音译地名', 'nt': '机构团体名', 'nz': '其他专名', 'o': '拟声词', 'p': '介词', 'pba': '介词“把”', 'pbei': '介词“被”', 'q': '量词', 'qt': '时量词', 'qv': '动量词', 'r': '代词', 'rg': '代词性语素', 'rr': '人称代词', 'ry': '疑问代词', 'rys': '处所疑问代词', 'ryt': '时间疑问代词', 'ryv': '谓词性疑问代词', 'rz': '指示代词', 'rzs': '处所指示代词', 'rzt': '时间指示代词', 'rzv': '谓词性指示代词', 's': '处所词', 't': '时间词', 'tg': '时间词性语素', 'u': '助词', 'udel': '的、底', 'ude2': '地', 'ude3': '得', 'udeng': '等、等等、云云', 'udh': '......的话', 'uguo': '过', 'ule': '了', 'ulian': '连', 'uls': '来讲、来说;而言、说来', 'usuo': '所', 'uyy': '一样、一般;似的、般', 'uzhe': '着', 'uzhi': '之', 'v': '动词', 'vd': '副动词', 'vf': '趋向动词', 'vg': '动词性语素', 'vi': '不及物动词', 'vl': '动词性惯用语', 'vn': '名动词', 'vshi': '动词“是”', 'vx': '形式动词', 'vyou': '动词“有”', 'w': '标点符号', 'wd': '逗号', 'wky': '右括号', 'wkz': '左括号', 'wm': '冒号', 'wn': '顿号', 'wp': '破折号', 'ws': '省略号', 'wy': '引号', 'x': '字符串', 'y': '语气词', 'z': '状态词'}
 thulac_tags={'n': '名词', 'np': '人名', 'ns': '地名', 'ni': '机构名', 'nz': '其它专名', 'm': '数词', 'q': '量词', 'mq': '数量词', 't': '时间词', 'f': '方位词', 's': '处所词', 'v': '动词', 'a': '形容词', 'd': '副词', 'h': '前接成分', 'k': '后接成分', 'i': '习语', 'j': '简称', 'r': '代词', 'c': '连词', 'p': '介词', 'u': '助词', 'y': '语气助词', 'e': '叹词', 'o': '拟声词', 'g': '语素', 'w': '标点', 'x': '其它'}
 
+LangCodes={'AA': ['阿法尔语', 'Afar'], 'AB': ['阿布哈兹语', 'Abkhaz'], 'AE': ['阿维斯陀语', 'Avestan'], 'AF': ['阿非利堪斯语', 'Afrikaans'], 'AK': ['阿坎语', 'Akan, Twi-Fante'], 'AM': ['阿姆哈拉语', 'Amharic'], 'AN': ['阿拉贡语', 'Aragonese'], 'AR': ['阿拉伯语', 'Arabic'], 'AS': ['阿萨姆语', 'Assamese'], 'AV': ['阿瓦尔语', 'Avaric'], 'AY': ['艾马拉语', 'Aymara'], 'AZ': ['阿塞拜疆语', 'Azerbaijani'], 'BA': ['巴什基尔语', 'Bashkir'], 'BE': ['白俄罗斯语', 'Belarusian'], 'BG': ['保加利亚语', 'Bulgarian'], 'BH': ['比哈尔语', 'Bihari'], 'BI': ['比斯拉玛语', 'Bislama'], 'BM': ['班巴拉语', 'Bambara'], 'BN': ['孟加拉语', 'Bengali'], 'BO': ['藏语', 'Tibetan Standard, Central Tibetan'], 'BR': ['布列塔尼语', 'Breton'], 'BS': ['波斯尼亚语', 'Bosnian'], 'CA': ['加泰隆语', 'Catalan;\xa0Valencian'], 'CE': ['车臣语', 'Chechen'], 'CH': ['查莫罗语', 'Chamorro'], 'CO': ['科西嘉语', 'Corsican'], 'CR': ['克里语', 'Cree'], 'CS': ['捷克语', 'Czech'], 'CU': ['教会斯拉夫语', 'Old Church Slavonic, Church Slavic, Church Slavonic, Old Bulgarian, Old Slavonic'], 'CV': ['楚瓦什语', 'Chuvash'], 'CY': ['威尔士语', 'Welsh'], 'DA': ['丹麦语', 'Danish'], 'DE': ['德语', 'German'], 'DV': ['迪维希语', 'Divehi; Dhivehi; Maldivian;'], 'DZ': ['不丹语', 'Dzongkha'], 'EE': ['埃维语', 'Ewe'], 'EL': ['现代希腊语', 'Greek, Modern'], 'EN': ['英语', 'English'], 'EO': ['世界语', 'Esperanto'], 'ES': ['西班牙语', 'Spanish; Castilian'], 'ET': ['爱沙尼亚语', 'Estonian'], 'EU': ['巴斯克语', 'Basque'], 'FA': ['波斯语', 'Persian'], 'FF': ['富拉语', 'Fula; Fulah; Pulaar; Pular'], 'FI': ['芬兰语', 'Finnish'], 'FJ': ['斐济语', 'Fijian'], 'FO': ['法罗斯语', 'Faroese'], 'FR': ['法语', 'French'], 'FY': ['弗里西亚语', 'Western Frisian'], 'GA': ['爱尔兰语', 'Irish'], 'GD': ['盖尔语(苏格兰语)', 'Scottish Gaelic; Gaelic'], 'GL': ['加利西亚语', 'Galician'], 'GN': ['瓜拉尼语', 'Guaraní'], 'GU': ['古吉拉特语', 'Gujarati'], 'GV': ['马恩岛语', 'Manx'], 'HA': ['豪萨语', 'Hausa'], 'HE': ['希伯来语', 'Hebrew\xa0(modern)'], 'HI': ['印地语', 'Hindi'], 'HO': ['希里莫图语', 'Hiri Motu'], 'HR': ['克罗地亚语', 'Croatian'], 'HT': ['海地克里奥尔语', 'Haitian; Haitian Creole'], 'HU': ['匈牙利语', 'Hungarian'], 'HY': ['亚美尼亚语', 'Armenian'], 'HZ': ['赫雷罗语', 'Herero'], 'I.E.': ['国际语E', 'Interlingue'], 'IA': ['国际语A', 'Interlingua'], 'ID': ['印尼语', 'Indonesian'], 'IG': ['伊博语', 'Igbo'], 'II': ['四川彝语(诺苏语)', 'Nuosu'], 'IK': ['依努庇克语', 'Inupiaq'], 'IO': ['伊多语', 'Ido'], 'IS': ['冰岛语', 'Icelandic'], 'IT': ['意大利语', 'Italian'], 'IU': ['伊努伊特语', 'Inuktitut'], 'JA': ['日语', 'Japanese'], 'JV': ['爪哇语', 'Javanese'], 'KA': ['格鲁吉亚语', 'Georgian'], 'KG': ['刚果语', 'Kongo'], 'KI': ['基库尤语', 'Kikuyu, Gikuyu'], 'KJ': ['夸尼亚玛语', 'Kwanyama, Kuanyama'], 'KK': ['哈萨克语', 'Kazakh'], 'KL': ['格陵兰语', 'Kalaallisut, Greenlandic'], 'KM': ['高棉语', 'Khmer, Cambodian'], 'KN': ['坎纳达语', 'Kannada'], 'KO': ['朝鲜语', 'Korean'], 'KR': ['卡努里语', 'Kanuri'], 'KS': ['克什米尔语', 'Kashmiri'], 'KU': ['库尔德语', 'Kurdish'], 'KV': ['科米语', 'Komi'], 'KW': ['康沃尔语', 'Cornish'], 'KY': ['吉尔吉斯语', 'Kirghiz, Kyrgyz'], 'LA': ['拉丁语', 'Latin'], 'LB': ['卢森堡语', 'Luxembourgish, Letzeburgesch'], 'LG': ['干达语', 'Luganda'], 'LI': ['林堡语', 'Limburgish, Limburgan, Limburger'], 'LN': ['林加拉语', 'Lingala'], 'LO': ['老挝语', 'Lao'], 'LT': ['立陶宛语', 'Lithuanian'], 'LU': ['卢巴—加丹加语', 'Luba-Katanga'], 'LV': ['拉脱维亚语', 'Latvian'], 'MG': ['马达加斯加语', 'Malagasy'], 'MH': ['马绍尔语', 'Marshallese'], 'MI': ['毛利语', 'Māori'], 'MK': ['马其顿语', 'Macedonian'], 'ML': ['马拉亚拉姆语', 'Malayalam'], 'MN': ['蒙古语', 'Mongolian'], 'MR': ['马拉提语', 'Marathi (Marāṭhī)'], 'MS': ['马来语', 'Malay'], 'MT': ['马耳他语', 'Maltese'], 'MY': ['缅甸语', 'Burmese'], 'NA': ['瑙鲁语', 'Nauru'], 'NB': ['挪威布克摩尔语', 'Norwegian Bokmål'], 'ND': ['北恩德贝勒语', 'North Ndebele'], 'NE': ['尼泊尔语', 'Nepali'], 'NG': ['恩敦加语', 'Ndonga'], 'NL': ['荷兰语', 'Dutch'], 'NN': ['尼诺斯克挪威语', 'Norwegian Nynorsk'], 'NO': ['挪威语', 'Norwegian'], 'NR': ['南恩德贝勒语', 'South Ndebele'], 'NV': ['纳瓦霍语', 'Navajo, Navaho'], 'NY': ['尼扬贾语', 'Chichewa; Chewa; Nyanja'], 'OC': ['普罗旺斯语', 'Occitan'], 'OJ': ['奥吉布瓦语', 'Ojibwe, Ojibwa'], 'OM': ['阿芳•奥洛莫语', 'Oromo'], 'OR': ['奥利亚语', 'Oriya'], 'OS': ['奥塞梯语', 'Ossetian, Ossetic'], 'PA': ['旁遮普语', 'Panjabi, Punjabi'], 'PI': ['巴利语', 'Pāli'], 'PL': ['波兰语', 'Polish'], 'PS': ['普什图语', 'Pashto, Pushto'], 'PT': ['葡萄牙语', 'Portuguese'], 'QU': ['凯楚亚语', 'Quechua'], 'RM': ['罗曼语', 'Romansh'], 'RN': ['基隆迪语', 'Kirundi'], 'RO': ['罗马尼亚语', 'Romanian,\xa0Moldavian, Moldovan'], 'RU': ['俄语', 'Russian'], 'RW': ['基尼阿万达语', 'Kinyarwanda'], 'SA': ['梵语', 'Sanskrit (Saṁskṛta)'], 'SC': ['撒丁语', 'Sardinian'], 'SD': ['信德语', 'Sindhi'], 'SE': ['北萨摩斯语', 'Northern Sami'], 'SG': ['桑戈语', 'Sango'], 'SI': ['僧加罗语', 'Sinhala, Sinhalese'], 'SK': ['斯洛伐克语', 'Slovak'], 'SL': ['斯洛文尼亚语', 'Slovene'], 'SM': ['萨摩亚语', 'Samoan'], 'SN': ['绍纳语', 'Shona'], 'SO': ['索马里语', 'Somali'], 'SQ': ['阿尔巴尼亚语', 'Albanian'], 'SR': ['塞尔维亚语', 'Serbian'], 'SS': ['塞斯瓦特语', 'Swati'], 'ST': ['南索托语', 'Southern Sotho'], 'SU': ['巽他语', 'Sundanese'], 'SV': ['瑞典语', 'Swedish'], 'SW': ['斯瓦希里语', 'Swahili'], 'TA': ['泰米尔语', 'Tamil'], 'TE': ['泰卢固语', 'Telugu'], 'TG': ['塔吉克语', 'Tajik'], 'TH': ['泰语', 'Thai'], 'TI': ['提格里尼亚语', 'Tigrinya'], 'TK': ['土库曼语', 'Turkmen'], 'TL': ['他加禄语', 'Tagalog'], 'TN': ['塞茨瓦纳语', 'Tswana'], 'TO': ['汤加语', 'Tongan'], 'TR': ['土耳其语', 'Turkish'], 'TS': ['宗加语', 'Tsonga'], 'TT': ['塔塔尔语', 'Tatar'], 'TW': ['特威语', 'Twi'], 'TY': ['塔希提语', 'Tahitian'], 'UG': ['维吾尔语', 'Uighur, Uyghur'], 'UK': ['乌克兰语', 'Ukrainian'], 'UR': ['乌尔都语', 'Urdu'], 'UZ': ['乌兹别克语', 'Uzbek'], 'VE': ['文达语', 'Venda'], 'VI': ['越南语', 'Vietnamese'], 'VO': ['沃拉普克语', 'Volapük'], 'WA': ['瓦隆语', 'Walloon'], 'WO': ['沃洛夫语', 'Wolof'], 'XH': ['科萨语', 'Xhosa'], 'YI': ['依地语', 'Yiddish'], 'YO': ['约鲁巴语', 'Yoruba'], 'ZA': ['壮语', 'Zhuang, Chuang'], 'ZH': ['汉语(中文)', 'Chinese'], 'ZU': ['祖鲁语', 'Zulu']}
+
 def word_list(split_words):
 """
 Parameters
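The new LangCodes constant maps upper-cased two-letter language codes to a [Chinese name, English name] pair, so a lookup is plain dict indexing:

from PgsFile import LangCodes

print(LangCodes["ZH"])     # ['汉语(中文)', 'Chinese']
print(LangCodes["DE"][1])  # 'German'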
@@ -482,13 +484,8 @@ def batch_word_list(input_root):
 ----------
 input_root : TYPE string
 DESCRIPTION.
-It's a folder path like
-
-For example, the text of D:\seg_only\1.txt should be like:
-PgsFile is Python library to facilitate Python beginners ,
-especially instructors and students of foreign languages and literature,
-for the convenience of easily operating txt ,
-xlsx and json files as well as making word list .
+It's a folder path like seg_only.
+Based on tokenized text.
 
 Returns
 -------
@@ -505,7 +502,7 @@ def batch_word_list(input_root):
 ('literature', [1, 1]),]
 '''
 from PgsFile import get_data_text as gt, FilePath as fp, BigPunctuation as bp
-# input_root=r"
+# input_root=r"047_Scraping\seg_only"
 file_names=fp(input_root)
 
 from collections import defaultdict
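A hedged usage sketch for batch_word_list (the folder path is illustrative; per the docstring the folder should hold tokenized txt files, and entries come back in the form ('literature', [1, 1])):

from PgsFile import batch_word_list

records = batch_word_list(r"D:\seg_only")
print(records[:3])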
@@ -749,11 +746,10 @@ def cs(para):
 # import zhon
 # rst=re.findall(zhon.hanzi.sentence, para)
 # return rst #['我买了一辆车。', '妈妈做的菜,很好吃!']
-
-para=re.sub('(
-para=re.sub('(
-para=re.sub('(
-para=re.sub('([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
+para=re.sub(r'([。!?\?])([^”’])', r"\1\n\2", para) # 单字符断句符
+para=re.sub(r'(\.{6})([^”’])', r"\1\n\2", para) # 英文省略号
+para=re.sub(r'(\…{2})([^”’])', r"\1\n\2", para) # 中文省略号
+para=re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
 # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号
 para=para.rstrip() # 段尾如果有多余的\n就去掉它
 # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
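The rewritten rules switch to raw strings, which silences Python's invalid-escape-sequence warnings without changing the patterns, and they split after sentence-final punctuation, six-dot ellipses, and Chinese ellipses while keeping closing quotes attached to their sentence. Assuming cs, like the commented-out zhon variant, returns the sentence list:

from PgsFile import cs

print(cs("我买了一辆车。妈妈做的菜,很好吃!"))
# expected, per the comment kept in the source: ['我买了一辆车。', '妈妈做的菜,很好吃!']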
@@ -784,7 +780,7 @@ def cs2(text):
 alphabets="([A-Za-z])"
 prefixes="(Mr|St|Mrs|Ms|Dr)[.]"
 suffixes="(Inc|Ltd|Jr|Sr|Co)"
-starters="(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+starters=r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
 acronyms="([A-Z][.][A-Z][.](?:[A-Z][.])?)"
 websites="[.](com|net|org|io|gov)"
 digits="([0-9])"
@@ -796,7 +792,7 @@ def cs2(text):
 text=re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
 if "..." in text: text=text.replace("...","<prd><prd><prd>")
 if "Ph.D" in text: text=text.replace("Ph.D.","Ph<prd>D<prd>")
-text=re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
+text=re.sub(r"\s" + alphabets + "[.] "," \\1<prd> ",text)
 text=re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
 text=re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
 text=re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
@@ -1050,7 +1046,7 @@ def replace_english_punctuation_with_chinese(text):
 # 定义英文标点和对应的中文标点的映射关系
 punctuation_mapping={
 ',': ',',
-'.': '。',
+# '.': '。', # 去掉!
 '?': '?',
 '!': '!',
 ';': ';',
@@ -1073,8 +1069,8 @@ def extract_misspelled_words_from_docx(file_path, mode=None):
 Parameters
 ----------
 file_path : TYPE string
-DESCRIPTION. r"
-mode : TYPE, optional
+DESCRIPTION. r"DocsMetrics for Translation Quality Assessment_A Case for Standardising Error Typologies.docx"
+mode : TYPE, optional string
 DESCRIPTION.
 1. The default is None, which means extracting all words with double underlines and wavy lines.
 2. The "spell" mode means extracting all words with wavy red lines.
@@ -1146,13 +1142,13 @@ def get_text_length_kb(text: str) -> str:
 Get the length of a text string in KB (kilobytes, eg.26.5 KB).
 """
 # Get the length of the text in bytes
-text_bytes
+text_bytes=len(text.encode('utf-8'))
 
 # Convert the length to KB
-text_kb
-rounded_num
+text_kb=text_bytes / 1024
+rounded_num=round(text_kb, 2)
 
-text_kb
+text_kb=f'{rounded_num} KB'
 print(type(text_kb))
 
 return text_kb
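The restored assignments make the arithmetic concrete: UTF-8 byte length, divided by 1024 and rounded to two decimals. A quick check against the docstring's own '26.5 KB' example:

from PgsFile import get_text_length_kb

s = "a" * 27136                # 27136 bytes in UTF-8
print(get_text_length_kb(s))   # '26.5 KB', since 27136 / 1024 = 26.5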
@@ -1179,8 +1175,8 @@ def generate_password(length: int) -> str:
 """
 import random
 # Define the set of characters to choose from
-character_set
-random_password
+character_set="1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+=-"
+random_password=''.join(random.choice(character_set) for _ in range(length))
 
 return random_password
 
@@ -1196,14 +1192,14 @@ def extract_numbers(string: str) -> list:
 """
 import re
 # Define a regular expression to match one or more digits
-digit_pattern
+digit_pattern=re.compile(r'(\d+)')
 
 # Split the input string using the regular expression
-fragments
+fragments=digit_pattern.split(string)
 
 # Convert every other fragment to an integer (the ones that match the digit pattern)
 for i in range(1, len(fragments), 2):
-fragments[i]
+fragments[i]=int(fragments[i])
 
 return fragments
 
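Because the digit pattern is a capturing group, re.split keeps each match at the odd indices, which is exactly what the range(1, len(fragments), 2) loop relies on:

import re

digit_pattern = re.compile(r'(\d+)')
fragments = digit_pattern.split("file12name7")
print(fragments)  # ['file', '12', 'name', '7', '']
# after the int() loop: ['file', 12, 'name', 7, '']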
@@ -1220,7 +1216,7 @@ def sort_strings_with_embedded_numbers(strings: list) -> list:
 list: A new list containing the sorted strings.
 """
 # Sort the strings using the extract_numbers() function as the key
-sorted_strings
+sorted_strings=sorted(strings, key=extract_numbers)
 
 return sorted_strings
 
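Using extract_numbers as the key yields a natural sort, so embedded numbers compare numerically instead of character by character:

from PgsFile import sort_strings_with_embedded_numbers

print(sort_strings_with_embedded_numbers(["ch10.txt", "ch2.txt", "ch1.txt"]))
# ['ch1.txt', 'ch2.txt', 'ch10.txt'] (plain sorted() would give ch1, ch10, ch2)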
@@ -1237,10 +1233,10 @@ def run_command(command: str) -> str:
 """
 import subprocess
 # Run the command and capture the output
-output
+output=subprocess.check_output(command, shell=True)
 
 # Decode the output from bytes to string
-output_str
+output_str=output.decode()
 
 return output_str
 
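The completed body shells out via subprocess.check_output(command, shell=True), so only trusted strings should be passed; the captured bytes are decoded to text before being returned:

from PgsFile import run_command

print(run_command("python --version"))  # e.g. 'Python 3.11.4', depending on the host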
@@ -1248,6 +1244,7 @@ def run_command(command: str) -> str:
 import random
 import requests
 from lxml import html, etree
+import pandas as pd
 my_headers={"User-Agent": random.choice(yhd)}
 class PGScraper(object):
 def __init__(self):
@@ -1260,31 +1257,27 @@ class PGScraper(object):
 valid_xpath=[]
 valid_span=[]
 # Example HTML content
-if headers is None:
-real_headers=my_headers
-else:
-real_headers=headers
 if timeout is None:
 real_timeout=24.0
 else:
 real_timeout=timeout
 
-r=requests.get(url,timeout=real_timeout,headers=
+r=requests.get(url,timeout=real_timeout,headers=headers, cookies=cookies, params=params, proxies=proxies)
 if r.status_code==200:
 r.encoding="utf-8"
 html_content=r.content
 # Parse HTML content
-tree
+tree=html.fromstring(html_content)
 relative_xpaths=[]
 for text in want_list:
 # Find elements containing the text
-elements
+elements=tree.xpath(f"//*[contains(text(), '{text}')]")
 if not elements:
 return None
 
 # Assume we want the first matching element
-element
-absolute_xpath
+element=elements[0]
+absolute_xpath=tree.getroottree().getpath(element)
 relative_xpaths.append(absolute_xpath)
 
 path1=relative_xpaths[0]
@@ -1312,7 +1305,7 @@ class PGScraper(object):
 all_want_list.append(clean_list(target_eles))
 valid_xpath.append(my_path)
 except:
-error_type, value, traceback
+error_type, value, traceback=sys.exc_info()
 error_info=f'{error_type}\n{value}\n{traceback}'
 print(error_info)
 
@@ -1345,7 +1338,7 @@ class PGScraper(object):
 all_want_list.append((clean_list(target_eles),clean_list(target_url_eles)))
 valid_xpath.append((my_path,my_path_url))
 except:
-error_type, value, traceback
+error_type, value, traceback=sys.exc_info()
 error_info=f'{error_type}\n{value}\n{traceback}'
 print(error_info)
 
@@ -1378,21 +1371,17 @@ class PGScraper(object):
 def get_similar_text(self, url, timeout=None, headers=None, cookies=None, params=None, proxies=None):
 all_want_list=[]
 # Example HTML content
-if headers is None:
-real_headers=my_headers
-else:
-real_headers=headers
 if timeout is None:
 real_timeout=24.0
 else:
 real_timeout=timeout
 
-r=requests.get(url,timeout=real_timeout,headers=
+r=requests.get(url, timeout=real_timeout, headers=headers, cookies=cookies, params=params, proxies=proxies)
 if r.status_code==200:
 r.encoding="utf-8"
 html_content=r.content
 # Parse HTML content
-tree
+tree=html.fromstring(html_content)
 if self.show_url==True:
 for pat,url in self.pattern:
 target_eles=tree.xpath(pat)
@@ -1406,3 +1395,337 @@ class PGScraper(object):
 else:
 print(r.status_code,"invalid url",url)
 return all_want_list
+
+
+
+
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Sep 17 16:11:45 2020
+Showing download progress and speed when audio-visual files like MP4, MP3, JPG etc are downloading!
+@author: Petercusin
+"""
+
+import time
+from contextlib import closing
+
+def audiovisual_downloader(url, path):
+with closing(requests.get(url, stream=True, headers=my_headers)) as r:
+chunk_size=1024*10
+content_size=int(r.headers['content-length'])
+print('Initiating download...')
+with open(path, "wb") as f:
+p=ProgressData(size=content_size, unit='Kb', block=chunk_size)
+for chunk in r.iter_content(chunk_size=chunk_size):
+f.write(chunk)
+p.output()
+
+class ProgressData(object):
+def __init__(self, block, size, unit, file_name='', ):
+self.file_name=file_name
+self.block=block/1000.0
+self.size=size/1000.0
+self.unit=unit
+self.count=0
+self.start=time.time()
+def output(self):
+self.end=time.time()
+self.count += 1
+speed=self.block/(self.end-self.start) if (self.end-self.start)>0 else 0
+self.start=time.time()
+loaded=self.count*self.block
+progress=round(loaded/self.size, 4)
+if loaded >= self.size:
+print(u'%sYour download has finished successfully.\r\n'%self.file_name)
+else:
+print(u'{0}Download Progress: {1:.2f}{2}/{3:.2f}{4} {5:.2%} Download Speed: {6:.2f}{7}/s'.\
+format(self.file_name, loaded, self.unit,\
+self.size, self.unit, progress, speed, self.unit))
+print('%50s'%('/'*int((1-progress)*50)))
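The downloader streams the response in 10 KB chunks and reports progress through ProgressData after every chunk. A usage sketch (the URL and output path are illustrative; the function depends on the module-level my_headers and on the server returning a content-length header):

from PgsFile import audiovisual_downloader

audiovisual_downloader(
    "https://example.com/sample.mp4",  # hypothetical media URL
    r"D:\downloads\sample.mp4",        # local target path
)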
+
+
+def levenshtein_distance(s, t):
+m, n=len(s), len(t)
+if m < n:
+s, t=t, s
+m, n=n, m
+d=[list(range(n + 1))] + [[i] + [0] * n for i in range(1, m + 1)]
+for j in range(1, n + 1):
+for i in range(1, m + 1):
+if s[i - 1]==t[j - 1]:
+d[i][j]=d[i - 1][j - 1]
+else:
+d[i][j]=min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1]) + 1
+return d[m][n]
+
+def compute_similarity(input_string, reference_string):
+distance=levenshtein_distance(input_string, reference_string)
+max_length=max(len(input_string), len(reference_string))
+similarity=1 - (distance / max_length)
+return similarity
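A worked check of the pair: 'kitten' to 'sitting' takes three single-character edits, and compute_similarity normalizes that distance by the longer string's length. compute_similarity is re-exported by the package; the distance helper stays on the submodule:

from PgsFile import compute_similarity
from PgsFile.PgsFile import levenshtein_distance

print(levenshtein_distance("kitten", "sitting"))  # 3
print(compute_similarity("kitten", "sitting"))    # 1 - 3/7, about 0.571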
+
+pgs_abbres_words=['A.B.','A.D.','A.G.','A.I.','A.M.','A.P.','A.V.','AFP.','Ala.','Alta.','Apr.','Ariz.','Ark.','Assn.','Aug.','Ave.','B.A.','B.C','B.C.','B.Ed.','B.I.G','B.R.','B.S.','Blvd.','Brig.','Brig.-Gen.','Bros.','C.D.','C.E.O','C.I.A.','C.M.','C.V.','Calif.','Capt.','Cf.','Ch.','Cie.','Cir.','Cllr.','Cmdr.','Co.','Co.Design','Col.','Colo.','Conn.','Corp.','Cos.','Coun.','Cpl.','Cres.','D.C.','D.D.S.','D.J.','D.K.','D.S.','Dec.','Del.','Dept.','Det.','Dr.','E.B.','E.C.','E.ON','E.U.','E.coli','E.g.','Ed.','Esq.','F.C.','Feb.','Fig.','Fla.','Fri.','G.K.','G.M.','G.Skill','Ga.','Gen.','Gov.','Govt.','H.E.','H.L.','H.S.','Hon.','Hwy.','I.T.','I.e.','Ill.','Inc.','Ind.','J.Crew','J.D.','J.G.','J.P','J.R.R.','Jan.','Jr.','Jul.','Jun.','K.C.','K.J.','K.M.','K.N.','K.P.','K.R.','Kan.','Ky.','L.A.','L.L.','L.S.','LLC.','La.','Lieut.','Lt.','Lt.-Cmdr.','Lt.-Col.','Lt.-Gen.','Ltd.','M.A.','M.B.','M.B.A.','M.D.','M.E.N','M.I.A.','M.J.','M.M.','M.P.','M.S.','Maj.','Maj.-Gen.','Man.','Mar.','Mass.','Md.','Messrs.','Mfg.','Mfrs.','Mich.','Minn.','Miss.','Mmes.','Mo.','Mon.','Mr.','Mrs.','Ms.','Msgr.','Mss.','N.A.','N.B.','N.C.','N.D.','N.H.','N.J.','N.L.','N.M.','N.S.','N.W.A.','N.W.T.','N.Y.','Neb.','Nev.','No.','Nos.','Nov.','O.C.','O.K.','O.S.','Oct.','Okla.','Ont.','Op.','Ore.','P.C.','P.E.','P.E.I.','P.K.','P.M.','P.O.','P.R.','P.S.','Pa.','Ph.D','Ph.D.','Plc.','Pres.','Prof.','Psy.D.','Pte.','Que.','R.E.M.','R.I.','R.I.P.','R.M','R.R.','Rd.','Rep.','Rev.','Rs.','Rt.','S.A.','S.C.','S.D.','S.F.','S.H.I.E.L.D.','S.K.','S.League','S.M.','S.P.','Sask.','Sat.','Sec.','Sen.','Sep.','Sgt.','Sr.','St.','Ste.','Sub-Lieut.','Sun.','Supt.','T.A.','T.R.','T.V.','TV.','Tenn.','Tex.','Thu.','Tue.','Twp.','U.A.E.','U.K.','U.N','U.P.','U.S','U.S.','U.S.A.','U.S.C.','UK.','US.','V.P.','Va.','Vol.','Vt.','W.H.O.','W.Va.','Wash.','Wed.','Wis.','Y.T.','a.m.','abr.','anon.','bk.','bks.','bull.','c.','ca.','cf.','ch.','def.','e.g.','ed.','eds.','et al.','etc.','fig.','ft.','fwd.','gal.','i.e.','ibid.','illus.','in.','jour.','lb.','mag.','mi.','ms.','mss.','no.','oz.','p.','p.m.','pg.','pgs.','pp.','pseud.','pt.','pts.','pub.','qt.','qtd.','ser.','supp.','trans.','viz.','vol.','vols.','vs.','yd.']
+
+def clean_text(text): #清洗除了句号以外的其他标点符号问题
+# 在标点符号右边邻接单词前添加空格
+import re
+text=replace_chinese_punctuation_with_english(text)
+text=re.sub(r'(?<=[\?\!\,\;\:\)\]\}])\s*(?=\w)', ' ', text)
+# 删除标点符号与左边单词之间的空格
+text=re.sub(r'\s*([\?\!\,\;\:\)\]\}\>])', r'\1', text)
+# 删除标点符号与右边单词之间的空格
+text=re.sub(r'\s*\(\s*', r' (', text)
+text=re.sub(r'\s*\[\s*', r' [', text)
+text=re.sub(r'\s*\{\s*', r' {', text)
+text=re.sub(r'\s*\<\s*', r' <', text)
+# 处理多余的空格
+text=re.sub(r'\s{2,}', ' ', text)
+text=re.sub(r'-{2,}', '-', text)
+return text
+
+def clean_text_with_abbreviations(text):
+import re
+text=clean_text(text)
+matches=[]
+for seg in text.split():
+if "." in seg:
+if seg.endswith(".") is False:
+matches.append(seg)
+elif seg.endswith("..") and "..." not in seg:
+text=text.replace("..", ".")
+
+for match in matches:
+if any(word in match for word in pgs_abbres_words):
+inter=match.split(".")
+new_match="".join([w+"." for w in inter[0:-1]])+" "+inter[-1]
+text=text.replace(match, new_match)
+else:
+text=text.replace(match, match.replace(".",". "))
+text=re.sub(r'\s+\.', '.', text)
+return text
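clean_text_with_abbreviations first normalizes punctuation spacing via clean_text, then re-spaces any token containing a period unless it matches the pgs_abbres_words whitelist, so abbreviations keep their dots while run-together sentences get separated. A sketch of the intended behaviour (exact spacing can vary with the regex details):

from PgsFile import clean_text_with_abbreviations

print(clean_text_with_abbreviations("Dr.Smith arrived.Then we left."))
# intended: 'Dr. Smith arrived. Then we left.'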
+
+import shutil
+def move_file(source_file, destination_folder, new_file_name=None):
+"""
+Move/cut a file to another folder.
+
+Parameters:
+source_file (str): The path to the source file.
+destination_folder (str): The path to the destination folder.
+new_file_name (str, optional): The new name for the file in the destination folder. Defaults to None.
+"""
+# Ensure the destination folder exists
+if not os.path.exists(destination_folder):
+os.makedirs(destination_folder)
+
+# Construct the destination file path
+if new_file_name:
+destination_file=os.path.join(destination_folder, new_file_name)
+else:
+destination_file=os.path.join(destination_folder, os.path.basename(source_file))
+
+# Move the file to the destination folder
+shutil.move(source_file, destination_file)
+
+print(f"File moved from {source_file} to {destination_file}")
+
+def check_empty_cells(file_path):
+"""
+Check for any empty cells in an Excel file and return their exact positions.
+
+Parameters:
+file_path (str): The path to the Excel file.
+
+Returns:
+list of tuples: A list of tuples where each tuple contains the column ID and row ID of an empty cell. If no empty cells are found, an empty list is returned.
+
+Example:
+empty_cells=check_empty_cells('your_file.xlsx')
+if empty_cells:
+print(f"Empty cells found at positions: {empty_cells}")
+else:
+print("No empty cells found.")
+"""
+# Read the Excel file
+df=pd.read_excel(file_path)
+
+# Initialize a list to store the positions of empty cells
+empty_cells=[]
+
+# Iterate over the DataFrame to find empty cells
+for row_id, row in df.iterrows():
+for col_id, value in row.items():
+if pd.isnull(value):
+empty_cells.append((col_id, row_id))
+
+return empty_cells
+
+def makefile(file_path):
+if os.path.exists(file_path):
+pass
+else:
+write_to_txt(file_path, "")
+
+
+def save_dict_to_excel(data, output_file, headers=None):
+"""
+Save Python dictionary data into an Excel .xlsx file with custom headers.
+
+Parameters:
+data (dict): The dictionary containing the data to be saved.
+output_file (str): The path to the output Excel file.
+headers (list of str, optional): A list of strings representing the headers for the Excel file. Defaults to ['Key', 'Value'] if not provided.
+
+Returns:
+None
+
+Example:
+data={'key1': 'value1', 'key2': 'value2'}
+output_file='output.xlsx'
+save_dict_to_excel(data, output_file) # Uses default headers
+save_dict_to_excel(data, output_file, headers=['Source Text', 'Target Text']) # Uses custom headers
+"""
+if headers is None:
+headers=['Key', 'Value']
+elif len(headers) != 2:
+raise ValueError("Headers list must contain exactly 2 elements.")
+
+# Convert the dictionary to a DataFrame
+df=pd.DataFrame(list(data.items()), columns=headers)
+
+# Save the DataFrame to an Excel file
+df.to_excel(output_file, index=False)
+
+def len_rows(file_path):
+"""
+Calculate the number of rows in an Excel file based on the largest row number of any possible columns.
+
+Parameters:
+file_path (str): The path to the Excel file.
+
+Returns:
+int: The number of rows in the Excel file.
+"""
+# Read the Excel file
+df=pd.read_excel(file_path)
+
+# Get the number of rows
+row_count=df.shape[0]
+
+return row_count
+
+def format_float(number, decimal_places=2):
+"""
+Format a float to a specified number of decimal places.
+
+Parameters:
+number (float): The float number to be formatted.
+decimal_places (int, optional): The number of decimal places to format the number to. Defaults to 2.
+
+Returns:
+str: The formatted number as a string with the specified number of decimal places.
+
+Example:
+formatted_number=format_float(3.1415926535)
+print(formatted_number) # Output: 3.14
+
+formatted_number=format_float(3.1415926535, 4)
+print(formatted_number) # Output: 3.1416
+"""
+formatted_number="{:.{precision}f}".format(number, precision=decimal_places)
+return formatted_number
+
+def get_data_html_offline(file_path):
+"""
+Reads a local HTML file and extracts specific elements.
+Parameters:
+file_path (str): The path to the local HTML file. my_html="Top 5 Web Scraping Methods_ Including Using LLMs - Comet.mhtml"
+
+Returns: html
+
+XPath common usages:
+rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
+rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
+rst1 = html.xpath('//div[@class="_16zCst"]/h1/text()')
+rst2 = html.xpath('//p[1]/text()') # Get the text content of the first p node
+rst3 = html.xpath('//p[position()<3]/text()') # Get the text content of the first two p nodes
+rst4 = html.xpath('//p[last()]/text()') # Get the text content of the last p node
+rst5 = html.xpath('//a[2]/@href') # Get the href attribute of the second a node
+
+"""
+if file_path.endswith(".mhtml"):
+import pimht
+mhtml = pimht.from_filename(file_path)
+longest_length = 0
+html_content = ""
+for mhtml_part in mhtml:
+if "text/html" in mhtml_part.content_type:
+possible_html=mhtml_part.text
+current_length = len(possible_html)
+if current_length > longest_length:
+longest_length = current_length
+html_content = possible_html
+# Parse the HTML content
+html = etree.HTML(html_content)
+else: #.html
+html=etree.parse(file_path,etree.HTMLParser())
+return html
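Both branches return an lxml element tree, so the XPath recipes from the docstring work on saved .mhtml captures (where the longest text/html part is taken as the page) as well as plain .html files. The file name below is the docstring's own example:

from PgsFile import get_data_html_offline

html = get_data_html_offline("Top 5 Web Scraping Methods_ Including Using LLMs - Comet.mhtml")
print(html.xpath('//p[1]/text()'))  # text of the first p node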
+
+def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=None, params=None, proxies=None):
+'''
+rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
+rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
+rst1 = html.xpath('//div[@class="_16zCst"]/h1/text()')
+rst2 = html.xpath('//p[1]/text()') # Get the text content of the first p node
+rst3 = html.xpath('//p[position()<3]/text()') # Get the text content of the first two p nodes
+rst4 = html.xpath('//p[last()]/text()') # Get the text content of the last p node
+rst5 = html.xpath('//a[2]/@href') # Get the href attribute of the second a node
+'''
+# Example HTML content
+if timeout is None:
+real_timeout=24.0
+else:
+real_timeout=timeout
+try:
+time.sleep(round(random.uniform(1.0, 3.9), 19))
+r=requests.get(url, timeout=real_timeout, headers=headers, cookies=cookies, params=params, proxies=proxies)
+print(r.status_code) # print the reponse status code
+if r.status_code==200:
+if html==False:
+return r
+else:
+r.encoding="utf-8"
+data=r.text
+html=etree.HTML(data)
+return html
+else:
+print(r.status_code, "Can not find the page!")
+return None
+except Exception as err:
+print(err)
+
+def find_table_with_most_rows(tables):
+max_rows=0
+max_table_index=-1
+for i, table in enumerate(tables):
+if isinstance(table, pd.DataFrame) and table.shape[0] > max_rows:
+max_rows=table.shape[0]
+max_table_index=i
+return max_table_index, max_rows if max_table_index!= -1 else None
+
+def get_data_table(url, output_file, most_rows=True):
+try:
+tables=pd.read_html(url)
+if most_rows==False:
+# 1. default: the first table
+df=tables[0]
+else:
+# 2. get the table with most rows
+target_table=find_table_with_most_rows(tables)[0] # (1, 32)
+df=tables[target_table]
+
+df.to_excel(output_file, index=False)
+print(f"Data has been saved to {output_file}")
+except Exception as err:
+print(f"Errors found! {err}")
+return None
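get_data_table chains the two helpers: pd.read_html collects every table on the page, find_table_with_most_rows picks the longest one (its return expression binds as a tuple, so index [0] is always the table index), and the chosen DataFrame is written straight to .xlsx. A sketch with an illustrative URL:

from PgsFile import get_data_table

get_data_table("https://example.com/stats.html", "stats.xlsx")  # saves the largest table on the page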
PgsFile/__init__.py
CHANGED
@@ -1,29 +1,45 @@
+# 1. Web scraping
 from .PgsFile import PGScraper
+from .PgsFile import audiovisual_downloader
 
+# 2. Package/library management
 from .PgsFile import install_package, uninstall_package
 from .PgsFile import run_script, run_command
 
+# 3. Text data retrieval
 from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
 from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
+from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
 
-
+# 4. Text data storage
+from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, save_dict_to_excel
 
-
+# 5. File/folder process
+from .PgsFile import FilePath, FileName, DirList
+from .PgsFile import get_subfolder_path, get_package_path
+from .PgsFile import makedirec, makefile
 from .PgsFile import source_path, next_folder_names, corpus_root, get_directory_tree_with_meta, find_txt_files_with_keyword
-from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line
+from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
 
-
+# 6. Data cleaning
+from .PgsFile import BigPunctuation, StopTags, Special, yhd
 from .PgsFile import ZhStopWords, EnPunctuation, extract_stopwords
-from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008
+from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
+from .PgsFile import check_contain_chinese, check_contain_number
+from .PgsFile import replace_chinese_punctuation_with_english
+from .PgsFile import replace_english_punctuation_with_chinese
+from .PgsFile import clean_list, clean_text_with_abbreviations
+from .PgsFile import extract_chinese_punctuation, generate_password, sort_strings_with_embedded_numbers
 
-
+# 7. NLP (natural language processing)
+from .PgsFile import strQ2B_raw, strQ2B_words
+from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
 from .PgsFile import word_list, batch_word_list
 from .PgsFile import cs, cs1, cs2
 
-
-from .PgsFile import
-from .PgsFile import
-from .PgsFile import
-from .PgsFile import clean_list, yhd, extract_chinese_punctuation, generate_password, sort_strings_with_embedded_numbers
+# 8. Maths
+from .PgsFile import len_rows, check_empty_cells
+from .PgsFile import format_float, decimal_to_percent, Percentage
+from .PgsFile import get_text_length_kb, extract_numbers
 
 name = "PgsFile"
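With the regrouped exports, names from several sections are importable straight off the package; a small smoke test (file names are illustrative):

from PgsFile import get_data_text, write_to_json, LangCodes, compute_similarity

text = get_data_text("raw_text.txt")               # 3. text data retrieval
write_to_json("data.json", {"chars": len(text)})   # 4. text data storage
print(LangCodes["EN"])                             # ['英语', 'English']
print(compute_similarity("color", "colour"))       # about 0.833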
{PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/METADATA
CHANGED

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: PgsFile
-Version: 0.1.
-Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
+Version: 0.1.6
+Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
 Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
 Author: Pan Guisheng
 Author-email: 895284504@qq.com
@@ -18,8 +18,9 @@ Requires-Dist: python-docx
 Requires-Dist: pip
 Requires-Dist: requests
 Requires-Dist: lxml
+Requires-Dist: pimht
 
-Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
+Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
 
 
 Function 1: Enables efficient data retrieval and storage in files with a single line of code.
@@ -34,7 +35,7 @@ Function 5: This library provides support for common text cleaning tasks, such a
 
 Function 6: It also manages Python package installations and uninstallations, and allows running scripts and commands in Python interactive command lines instead of Windows command prompt.
 
-Function 7:
+Function 7: Download audiovisual files like videos, images, and audio using audiovisual_downloader, which is extremely useful and efficient. Additionally, scrape newspaper data with PGScraper, a highly efficient tool for this purpose.
 
 Table 1: The directory and size of Pgs-Corpora
 ├── Idioms (1, 171.78 KB)
{PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
-PgsFile/PgsFile.py,sha256=
-PgsFile/__init__.py,sha256=
+PgsFile/PgsFile.py,sha256=jmSiczDE5cV47tHpCGDwLn19C90NGQtQ2vEn4ys4NUg,80514
+PgsFile/__init__.py,sha256=EKhIRd2tktjyrvBlBPgQsIJTqU7DdLIobNG8gEiZ--0,2163
 PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
 PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
 PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
 PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
+PgsFile-0.1.6.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
+PgsFile-0.1.6.dist-info/METADATA,sha256=T0mBPq7PnljEcGjLItIJ3RIcZk7veOuy0vVgLuo31lo,4902
+PgsFile-0.1.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+PgsFile-0.1.6.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
+PgsFile-0.1.6.dist-info/RECORD,,
{PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/LICENSE
File without changes

{PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/top_level.txt
File without changes