PgsFile 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PgsFile/PgsFile.py +380 -57
- PgsFile/__init__.py +27 -11
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/METADATA +5 -4
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/RECORD +7 -7
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/WHEEL +1 -1
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/LICENSE +0 -0
- {PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
@@ -68,7 +68,7 @@ def get_data_text(path):
 ----------
 path : TYPE string
 DESCRIPTION.
-Using path to get data from a single txt file. eg.
+Using path to get data from a single txt file. eg. raw_text.txt
 Theoretically, it supports all the text encoding formats, like utf-8, unicode, ansi, gbk etc.
 
 Returns
@@ -95,7 +95,7 @@ def get_data_lines(path):
 ----------
 path : TYPE string
 DESCRIPTION.
-Using path to get data from a single txt file. eg.
+Using path to get data from a single txt file. eg. raw_text.txt
 Theoretically, it supports all the text encoding formats, like utf-8, unicode, ansi, gbk etc.
 
 Returns
@@ -152,7 +152,7 @@ def get_data_excel(excel_path,column_id,sheet_name=None):
 Parameters
 ----------
 excel_path : TYPE
-DESCRIPTION.
+DESCRIPTION. data_python.xlsx
 
 column_id : TYPE Int 0,1,2,3
 DESCRIPTION. 0 means the first column, 1 means the second.
@@ -180,7 +180,7 @@ def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
 Parameters
 ----------
 excel_path : TYPE
-DESCRIPTION.
+DESCRIPTION. results.xlsx
 
 dic_of_list : TYPE
 DESCRIPTION. {"col":["a","b","c","d"],"freq":[1,2,3,4]}
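A minimal call matching the documented shapes, assuming the signature shown in the hunk header (the file name echoes the docstring's results.xlsx example):

from PgsFile import write_to_excel

data = {"col": ["a", "b", "c", "d"], "freq": [1, 2, 3, 4]}  # one column per dict key
write_to_excel("results.xlsx", data)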
@@ -233,7 +233,7 @@ def get_tsv_lines(csv_path, delimiter=None):
 '''
 Parameters
 ----------
-get_tsv_lines : TYPE
+get_tsv_lines : TYPE data.tsv
 DESCRIPTION.
 
 Returns
@@ -261,7 +261,7 @@ def get_data_json(json_path):
 '''
 Parameters
 ----------
-json_path : TYPE
+json_path : TYPE data.json
 DESCRIPTION.
 
 Returns
@@ -285,7 +285,7 @@ def get_json_lines(json_path):
 '''
 Parameters
 ----------
-json_path : TYPE
+json_path : TYPE data.json
 DESCRIPTION.
 
 Returns
@@ -308,7 +308,7 @@ def write_to_json(json_path,my_dic):
 Parameters
 ----------
 json_path : TYPE string
-DESCRIPTION.
+DESCRIPTION. data.json
 
 my_dic : TYPE dict or list
 DESCRIPTION.
@@ -332,7 +332,7 @@ def write_to_json_lines(json_path,my_json_data):
 Parameters
 ----------
 json_path : TYPE string
-DESCRIPTION.
+DESCRIPTION. data.json
 
 my_json_data : TYPE dict or list
 DESCRIPTION.
@@ -449,7 +449,7 @@ def get_subfolder_path(parent_folder, subfolder_name):
 BigPunctuation="""!"#$&\'()*+,-/:;<=>?@[\\]^_`{|}.%~"#$%&'?。()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。``''""" #除去英文标点.%
 StopTags="""◆: 、/ 。/ ---/ -/ --/ -- :/ ;/ ?/ ??/ ?┖ @/ [/ ]/ ^/ ‘/ ’/ "/ "/ 〈/ 〉/ 《/ 》/ 【/ 】/ >/ ∶/ ■/ ●/ ·/ …/ !/ #/ %,/ %/ \'/ (/ )/ */ +/ ,/ -/ // np v n w m a x t q j ni ns d i f u p g nz c r id s k h o e / #?/ --/""" #用来停用词性标注
 Special="""∶ ■ ● ① ② ③ × ℃ Ⅲ ④ ⑤ ◆ ⑥ ± ⑦ ⑧ → ⑨ ▲ ⑩ ─ ÷ μ γ β Ⅱ Ⅰ ‰ □ 〇 ○ Ⅴ Ⅳ ★ ﹐ ° ※ ︰ α ― ≠ █ о θ ω ⒈ ⒉ ⒊ н ≤ ì ǎ ≥ р т с к й а и Ⅵ é è ﹢ ﹝ ﹞ ā ⒋ ù π ◇ Ω Ф ы Я п К в у м ǒ ü á ǔ ⒌ ⒍ 䦆 Ⅹ Ⅶ ← """
-ZhStopWords="""——— 》), )÷(1- ”, )、 =( : → ℃ & * 一一 ~~~~ ’ . 『 .一 ./ -- 』 =″ 【 [*] }> [⑤]] [①D] c] ng昉 * // [ ] [②e] [②g] ={ } ,也 ‘ A [①⑥] [②B] [①a] [④a] [①③] [③h] ③] 1. -- [②b] ’‘ ××× [①⑧] 0:2 =[ [⑤b] [②c] [④b] [②③] [③a] [④c] [①⑤] [①⑦] [①g] ∈[ [①⑨] [①④] [①c] [②f] [②⑧] [②①] [①C] [③c] [③g] [②⑤] [②②] 一. [①h] .数 [] [①B] 数/ [①i] [③e] [①①] [④d] [④e] [③b] [⑤a] [①A] [②⑧] [②⑦] [①d] [②j] 〕〔 ][ :// ′∈ [②④ [⑤e] 12% b] ... ................... …………………………………………………③ ZXFITL [③F] 」 [①o] ]∧′=[ ∪φ∈ ′| {- ②c } [③①] R.L. [①E] Ψ -[*]- ↑ .日 [②d] [② [②⑦] [②②] [③e] [①i] [①B] [①h] [①d] [①g] [①②] [②a] f] [⑩] a] [①e] [②h] [②⑥] [③d] [②⑩] e] 〉 】 元/吨 [②⑩] 2.3% 5:0 [①] :: [②] [③] [④] [⑤] [⑥] [⑦] [⑧] [⑨] …… —— ? 、 。 “ ” 《 》 ! , : ; ? . , . ' ? · ——— ── ? — < > ( ) 〔 〕 [ ] ( ) - + ~ × / / ① ② ③ ④ ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ Ⅲ В " ; # @ γ μ φ φ. × Δ ■ ▲ sub exp sup sub Lex # % & ' + +ξ ++ - -β < <± <Δ <λ <φ
+ZhStopWords="""——— 》), )÷(1- ”, )、 =( : → ℃ & * 一一 ~~~~ ’ . 『 .一 ./ -- 』 =″ 【 [*] }> [⑤]] [①D] c] ng昉 * // [ ] [②e] [②g] ={ } ,也 ‘ A [①⑥] [②B] [①a] [④a] [①③] [③h] ③] 1. -- [②b] ’‘ ××× [①⑧] 0:2 =[ [⑤b] [②c] [④b] [②③] [③a] [④c] [①⑤] [①⑦] [①g] ∈[ [①⑨] [①④] [①c] [②f] [②⑧] [②①] [①C] [③c] [③g] [②⑤] [②②] 一. [①h] .数 [] [①B] 数/ [①i] [③e] [①①] [④d] [④e] [③b] [⑤a] [①A] [②⑧] [②⑦] [①d] [②j] 〕〔 ][ :// ′∈ [②④ [⑤e] 12% b] ... ................... …………………………………………………③ ZXFITL [③F] 」 [①o] ]∧′=[ ∪φ∈ ′| {- ②c } [③①] R.L. [①E] Ψ -[*]- ↑ .日 [②d] [② [②⑦] [②②] [③e] [①i] [①B] [①h] [①d] [①g] [①②] [②a] f] [⑩] a] [①e] [②h] [②⑥] [③d] [②⑩] e] 〉 】 元/吨 [②⑩] 2.3% 5:0 [①] :: [②] [③] [④] [⑤] [⑥] [⑦] [⑧] [⑨] …… —— ? 、 。 “ ” 《 》 ! , : ; ? . , . ' ? · ——— ── ? — < > ( ) 〔 〕 [ ] ( ) - + ~ × / / ① ② ③ ④ ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ Ⅲ В " ; # @ γ μ φ φ. × Δ ■ ▲ sub exp sup sub Lex # % & ' + +ξ ++ - -β < <± <Δ <λ <φ <<== =☆ =- > >λ _ ~± ~+ [⑤f] [⑤d] [②i] ≈ [②G] [①f] LI ㈧ [- ...... 〉 [③⑩] 第二 一番 一直 一个 一些 许多 种 有的是 也就是说 末##末 啊 阿 哎 哎呀 哎哟 唉 俺 俺们 按 按照 吧 吧哒 把 罢了 被 本 本着 比 比方 比如 鄙人 彼 彼此 边 别 别的 别说 并 并且 不比 不成 不单 不但 不独 不管 不光 不过 不仅 不拘 不论 不怕 不然 不如 不特 不惟 不问 不只 朝 朝着 趁 趁着 乘 冲 除 除此之外 除非 除了 此 此间 此外 从 从而 打 待 但 但是 当 当着 到 得 的 的话 等 等等 地 第 叮咚 对 对于 多 多少 而 而况 而且 而是 而外 而言 而已 尔后 反过来 反过来说 反之 非但 非徒 否则 嘎 嘎登 该 赶 个 各 各个 各位 各种 各自 给 根据 跟 故 故此 固然 关于 管 归 果然 果真 过 哈 哈哈 呵 和 何 何处 何况 何时 嘿 哼 哼唷 呼哧 乎 哗 还是 还有 换句话说 换言之 或 或是 或者 极了 及 及其 及至 即 即便 即或 即令 即若 即使 几 几时 己 既 既然 既是 继而 加之 假如 假若 假使 鉴于 将 较 较之 叫 接着 结果 借 紧接着 进而 尽 尽管 经 经过 就 就是 就是说 据 具体地说 具体说来 开始 开外 靠 咳 可 可见 可是 可以 况且 啦 来 来着 离 例如 哩 连 连同 两者 了 临 另 另外 另一方面 论 嘛 吗 慢说 漫说 冒 么 每 每当 们 莫若 某 某个 某些 拿 哪 哪边 哪儿 哪个 哪里 哪年 哪怕 哪天 哪些 哪样 那 那边 那儿 那个 那会儿 那里 那么 那么些 那么样 那时 那些 那样 乃 乃至 呢 能 你 你们 您 宁 宁可 宁肯 宁愿 哦 呕 啪达 旁人 呸 凭 凭借 其 其次 其二 其他 其它 其一 其余 其中 起 起见 起见 岂但 恰恰相反 前后 前者 且 然而 然后 然则 让 人家 任 任何 任凭 如 如此 如果 如何 如其 如若 如上所述 若 若非 若是 啥 上下 尚且 设若 设使 甚而 甚么 甚至 省得 时候 什么 什么样 使得 是 是的 首先 谁 谁知 顺 顺着 似的 虽 虽然 虽说 虽则 随 随着 所 所以 他 他们 他人 它 它们 她 她们 倘 倘或 倘然 倘若 倘使 腾 替 通过 同 同时 哇 万一 往 望 为 为何 为了 为什么 为着 喂 嗡嗡 我 我们 呜 呜呼 乌乎 无论 无宁 毋宁 嘻 吓 相对而言 像 向 向着 嘘 呀 焉 沿 沿着 要 要不 要不然 要不是 要么 要是 也 也罢 也好 一 一般 一旦 一方面 一来 一切 一样 一则 依 依照 矣 以 以便 以及 以免 以至 以至于 以致 抑或 因 因此 因而 因为 哟 用 由 由此可见 由于 有 有的 有关 有些 又 于 于是 于是乎 与 与此同时 与否 与其 越是 云云 哉 再说 再者 在 在下 咱 咱们 则 怎 怎么 怎么办 怎么样 怎样 咋 照 照着 者 这 这边 这儿 这个 这会儿 这就是说 这里 这么 这么点儿 这么些 这么样 这时 这些 这样 正如 吱 之 之类 之所以 之一 只是 只限 只要 只有 至 至于 诸位 着 着呢 自 自从 自个儿 自各儿 自己 自家 自身 综上所述 总的来看 总的来说 总的说来 总而言之 总之 纵 纵令 纵然 纵使 遵照 作为 兮 呃 呗 咚 咦 喏 啐 喔唷 嗬 嗯 嗳"""
 EnPunctuation="""!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~"""
 nltk_en_tags={'CC': '并列连词', 'CD': '基数词', 'DT': '限定符', 'EX': '存在词', 'FW': '外来词', 'IN': '介词或从属连词', 'JJ': '形容词', 'JJR': '比较级的形容词', 'JJS': '最高级的形容词', 'LS': '列表项标记', 'MD': '情态动词', 'NN': '名词单数', 'NNS': '名词复数', 'NNP': '专有名词', 'NNPS': '专有名词复数', 'PDT': '前置限定词', 'POS': '所有格结尾', 'PRP': '人称代词', 'PRP$': '所有格代词', 'RB': '副词', 'RBR': '副词比较级', 'RBS': '副词最高级', 'RP': '小品词', 'SYM': '符号', 'UH': '感叹词', 'VB': '动词原型', 'VBD': '动词过去式', 'VBG': '动名词或现在分词', 'VBN': '动词过去分词', 'VBP': '非第三人称单数的现在时', 'VBZ': '第三人称单数的现在时', 'WDT': '以wh开头的限定词', 'WP': '以wh开头的代词', 'WP$': '以wh开头的所有格代词', 'WRB': '以wh开头的副词', 'TO': 'to'}
 nltk_tag_mapping={'NN': 'Noun', 'NNS': 'Noun', 'NNP': 'Noun', 'NNPS': 'Noun', 'VB': 'Verb', 'VBD': 'Verb', 'VBG': 'Verb', 'VBN': 'Verb', 'VBP': 'Verb', 'VBZ': 'Verb', 'JJ': 'Adjective', 'JJR': 'Adjective', 'JJS': 'Adjective', 'RB': 'Adverb', 'RBR': 'Adverb', 'RBS': 'Adverb', 'IN': 'Preposition', 'PRP': 'Pronoun', 'PRP$': 'Pronoun', 'DT': 'Determiner', 'CC': 'Conjunction', 'CD': 'Numeral', 'UH': 'Interjection', 'FW': 'Foreign Word', 'TO': 'Particle', 'EX': 'Existential "there"', 'MD': 'Modal Auxiliary', 'WDT': 'Wh-determiner', 'WP': 'Wh-pronoun', 'WP$': 'Possessive wh-pronoun', 'WRB': 'Wh-adverb', 'SYM': 'Symbol', 'RP': 'Particle', 'POS': 'Possessive ending', 'PDT': 'Predeterminer', 'LS': 'List item marker', 'NIL': 'Missing tag'}
@@ -457,6 +457,8 @@ nltk_tag_mapping={'NN': 'Noun', 'NNS': 'Noun', 'NNP': 'Noun', 'NNPS': 'Noun', 'V
 ICTCLAS2008={'a': '形容词', 'ad': '副形词', 'ag': '形容词性语素', 'al': '形容词性惯用语', 'an': '名形词', 'b': '区别词', 'bl': '区别词性惯用语', 'c': '连词', 'cc': '并列连词', 'd': '副词', 'dg': '副词性语素', 'dl': '副词性惯用语', 'e': '叹词', 'ew': '句末标点', 'f': '方位词', 'h': '前缀', 'k': '后缀', 'm': '数词', 'mg': '数词性语素', 'mq': '数量词', 'n': '名词', 'ng': '名词性语素', 'nl': '名词性惯用语', 'nr': '汉语人名', 'nr1': '汉语姓氏', 'nr2': '汉语名字', 'nrf': '音译人名', 'nrj': '日语人名', 'ns': '地名', 'nsf': '音译地名', 'nt': '机构团体名', 'nz': '其他专名', 'o': '拟声词', 'p': '介词', 'pba': '介词“把”', 'pbei': '介词“被”', 'q': '量词', 'qt': '时量词', 'qv': '动量词', 'r': '代词', 'rg': '代词性语素', 'rr': '人称代词', 'ry': '疑问代词', 'rys': '处所疑问代词', 'ryt': '时间疑问代词', 'ryv': '谓词性疑问代词', 'rz': '指示代词', 'rzs': '处所指示代词', 'rzt': '时间指示代词', 'rzv': '谓词性指示代词', 's': '处所词', 't': '时间词', 'tg': '时间词性语素', 'u': '助词', 'udel': '的、底', 'ude2': '地', 'ude3': '得', 'udeng': '等、等等、云云', 'udh': '......的话', 'uguo': '过', 'ule': '了', 'ulian': '连', 'uls': '来讲、来说;而言、说来', 'usuo': '所', 'uyy': '一样、一般;似的、般', 'uzhe': '着', 'uzhi': '之', 'v': '动词', 'vd': '副动词', 'vf': '趋向动词', 'vg': '动词性语素', 'vi': '不及物动词', 'vl': '动词性惯用语', 'vn': '名动词', 'vshi': '动词“是”', 'vx': '形式动词', 'vyou': '动词“有”', 'w': '标点符号', 'wd': '逗号', 'wky': '右括号', 'wkz': '左括号', 'wm': '冒号', 'wn': '顿号', 'wp': '破折号', 'ws': '省略号', 'wy': '引号', 'x': '字符串', 'y': '语气词', 'z': '状态词'}
 thulac_tags={'n': '名词', 'np': '人名', 'ns': '地名', 'ni': '机构名', 'nz': '其它专名', 'm': '数词', 'q': '量词', 'mq': '数量词', 't': '时间词', 'f': '方位词', 's': '处所词', 'v': '动词', 'a': '形容词', 'd': '副词', 'h': '前接成分', 'k': '后接成分', 'i': '习语', 'j': '简称', 'r': '代词', 'c': '连词', 'p': '介词', 'u': '助词', 'y': '语气助词', 'e': '叹词', 'o': '拟声词', 'g': '语素', 'w': '标点', 'x': '其它'}
 
+LangCodes={'AA': ['阿法尔语', 'Afar'], 'AB': ['阿布哈兹语', 'Abkhaz'], 'AE': ['阿维斯陀语', 'Avestan'], 'AF': ['阿非利堪斯语', 'Afrikaans'], 'AK': ['阿坎语', 'Akan, Twi-Fante'], 'AM': ['阿姆哈拉语', 'Amharic'], 'AN': ['阿拉贡语', 'Aragonese'], 'AR': ['阿拉伯语', 'Arabic'], 'AS': ['阿萨姆语', 'Assamese'], 'AV': ['阿瓦尔语', 'Avaric'], 'AY': ['艾马拉语', 'Aymara'], 'AZ': ['阿塞拜疆语', 'Azerbaijani'], 'BA': ['巴什基尔语', 'Bashkir'], 'BE': ['白俄罗斯语', 'Belarusian'], 'BG': ['保加利亚语', 'Bulgarian'], 'BH': ['比哈尔语', 'Bihari'], 'BI': ['比斯拉玛语', 'Bislama'], 'BM': ['班巴拉语', 'Bambara'], 'BN': ['孟加拉语', 'Bengali'], 'BO': ['藏语', 'Tibetan Standard, Central Tibetan'], 'BR': ['布列塔尼语', 'Breton'], 'BS': ['波斯尼亚语', 'Bosnian'], 'CA': ['加泰隆语', 'Catalan;\xa0Valencian'], 'CE': ['车臣语', 'Chechen'], 'CH': ['查莫罗语', 'Chamorro'], 'CO': ['科西嘉语', 'Corsican'], 'CR': ['克里语', 'Cree'], 'CS': ['捷克语', 'Czech'], 'CU': ['教会斯拉夫语', 'Old Church Slavonic, Church Slavic, Church Slavonic, Old Bulgarian, Old Slavonic'], 'CV': ['楚瓦什语', 'Chuvash'], 'CY': ['威尔士语', 'Welsh'], 'DA': ['丹麦语', 'Danish'], 'DE': ['德语', 'German'], 'DV': ['迪维希语', 'Divehi; Dhivehi; Maldivian;'], 'DZ': ['不丹语', 'Dzongkha'], 'EE': ['埃维语', 'Ewe'], 'EL': ['现代希腊语', 'Greek, Modern'], 'EN': ['英语', 'English'], 'EO': ['世界语', 'Esperanto'], 'ES': ['西班牙语', 'Spanish; Castilian'], 'ET': ['爱沙尼亚语', 'Estonian'], 'EU': ['巴斯克语', 'Basque'], 'FA': ['波斯语', 'Persian'], 'FF': ['富拉语', 'Fula; Fulah; Pulaar; Pular'], 'FI': ['芬兰语', 'Finnish'], 'FJ': ['斐济语', 'Fijian'], 'FO': ['法罗斯语', 'Faroese'], 'FR': ['法语', 'French'], 'FY': ['弗里西亚语', 'Western Frisian'], 'GA': ['爱尔兰语', 'Irish'], 'GD': ['盖尔语(苏格兰语)', 'Scottish Gaelic; Gaelic'], 'GL': ['加利西亚语', 'Galician'], 'GN': ['瓜拉尼语', 'Guaraní'], 'GU': ['古吉拉特语', 'Gujarati'], 'GV': ['马恩岛语', 'Manx'], 'HA': ['豪萨语', 'Hausa'], 'HE': ['希伯来语', 'Hebrew\xa0(modern)'], 'HI': ['印地语', 'Hindi'], 'HO': ['希里莫图语', 'Hiri Motu'], 'HR': ['克罗地亚语', 'Croatian'], 'HT': ['海地克里奥尔语', 'Haitian; Haitian Creole'], 'HU': ['匈牙利语', 'Hungarian'], 'HY': ['亚美尼亚语', 'Armenian'], 'HZ': ['赫雷罗语', 'Herero'], 'I.E.': ['国际语E', 'Interlingue'], 'IA': ['国际语A', 'Interlingua'], 'ID': ['印尼语', 'Indonesian'], 'IG': ['伊博语', 'Igbo'], 'II': ['四川彝语(诺苏语)', 'Nuosu'], 'IK': ['依努庇克语', 'Inupiaq'], 'IO': ['伊多语', 'Ido'], 'IS': ['冰岛语', 'Icelandic'], 'IT': ['意大利语', 'Italian'], 'IU': ['伊努伊特语', 'Inuktitut'], 'JA': ['日语', 'Japanese'], 'JV': ['爪哇语', 'Javanese'], 'KA': ['格鲁吉亚语', 'Georgian'], 'KG': ['刚果语', 'Kongo'], 'KI': ['基库尤语', 'Kikuyu, Gikuyu'], 'KJ': ['夸尼亚玛语', 'Kwanyama, Kuanyama'], 'KK': ['哈萨克语', 'Kazakh'], 'KL': ['格陵兰语', 'Kalaallisut, Greenlandic'], 'KM': ['高棉语', 'Khmer, Cambodian'], 'KN': ['坎纳达语', 'Kannada'], 'KO': ['朝鲜语', 'Korean'], 'KR': ['卡努里语', 'Kanuri'], 'KS': ['克什米尔语', 'Kashmiri'], 'KU': ['库尔德语', 'Kurdish'], 'KV': ['科米语', 'Komi'], 'KW': ['康沃尔语', 'Cornish'], 'KY': ['吉尔吉斯语', 'Kirghiz, Kyrgyz'], 'LA': ['拉丁语', 'Latin'], 'LB': ['卢森堡语', 'Luxembourgish, Letzeburgesch'], 'LG': ['干达语', 'Luganda'], 'LI': ['林堡语', 'Limburgish, Limburgan, Limburger'], 'LN': ['林加拉语', 'Lingala'], 'LO': ['老挝语', 'Lao'], 'LT': ['立陶宛语', 'Lithuanian'], 'LU': ['卢巴—加丹加语', 'Luba-Katanga'], 'LV': ['拉脱维亚语', 'Latvian'], 'MG': ['马达加斯加语', 'Malagasy'], 'MH': ['马绍尔语', 'Marshallese'], 'MI': ['毛利语', 'Māori'], 'MK': ['马其顿语', 'Macedonian'], 'ML': ['马拉亚拉姆语', 'Malayalam'], 'MN': ['蒙古语', 'Mongolian'], 'MR': ['马拉提语', 'Marathi (Marāṭhī)'], 'MS': ['马来语', 'Malay'], 'MT': ['马耳他语', 'Maltese'], 'MY': ['缅甸语', 'Burmese'], 'NA': ['瑙鲁语', 'Nauru'], 'NB': ['挪威布克摩尔语', 'Norwegian Bokmål'], 'ND': ['北恩德贝勒语', 'North Ndebele'], 'NE': ['尼泊尔语', 'Nepali'], 'NG': ['恩敦加语', 'Ndonga'], 'NL': ['荷兰语', 'Dutch'], 'NN': ['尼诺斯克挪威语', 'Norwegian Nynorsk'], 'NO': ['挪威语', 'Norwegian'], 'NR': ['南恩德贝勒语', 'South Ndebele'], 'NV': ['纳瓦霍语', 'Navajo, Navaho'], 'NY': ['尼扬贾语', 'Chichewa; Chewa; Nyanja'], 'OC': ['普罗旺斯语', 'Occitan'], 'OJ': ['奥吉布瓦语', 'Ojibwe, Ojibwa'], 'OM': ['阿芳•奥洛莫语', 'Oromo'], 'OR': ['奥利亚语', 'Oriya'], 'OS': ['奥塞梯语', 'Ossetian, Ossetic'], 'PA': ['旁遮普语', 'Panjabi, Punjabi'], 'PI': ['巴利语', 'Pāli'], 'PL': ['波兰语', 'Polish'], 'PS': ['普什图语', 'Pashto, Pushto'], 'PT': ['葡萄牙语', 'Portuguese'], 'QU': ['凯楚亚语', 'Quechua'], 'RM': ['罗曼语', 'Romansh'], 'RN': ['基隆迪语', 'Kirundi'], 'RO': ['罗马尼亚语', 'Romanian,\xa0Moldavian, Moldovan'], 'RU': ['俄语', 'Russian'], 'RW': ['基尼阿万达语', 'Kinyarwanda'], 'SA': ['梵语', 'Sanskrit (Saṁskṛta)'], 'SC': ['撒丁语', 'Sardinian'], 'SD': ['信德语', 'Sindhi'], 'SE': ['北萨摩斯语', 'Northern Sami'], 'SG': ['桑戈语', 'Sango'], 'SI': ['僧加罗语', 'Sinhala, Sinhalese'], 'SK': ['斯洛伐克语', 'Slovak'], 'SL': ['斯洛文尼亚语', 'Slovene'], 'SM': ['萨摩亚语', 'Samoan'], 'SN': ['绍纳语', 'Shona'], 'SO': ['索马里语', 'Somali'], 'SQ': ['阿尔巴尼亚语', 'Albanian'], 'SR': ['塞尔维亚语', 'Serbian'], 'SS': ['塞斯瓦特语', 'Swati'], 'ST': ['南索托语', 'Southern Sotho'], 'SU': ['巽他语', 'Sundanese'], 'SV': ['瑞典语', 'Swedish'], 'SW': ['斯瓦希里语', 'Swahili'], 'TA': ['泰米尔语', 'Tamil'], 'TE': ['泰卢固语', 'Telugu'], 'TG': ['塔吉克语', 'Tajik'], 'TH': ['泰语', 'Thai'], 'TI': ['提格里尼亚语', 'Tigrinya'], 'TK': ['土库曼语', 'Turkmen'], 'TL': ['他加禄语', 'Tagalog'], 'TN': ['塞茨瓦纳语', 'Tswana'], 'TO': ['汤加语', 'Tongan'], 'TR': ['土耳其语', 'Turkish'], 'TS': ['宗加语', 'Tsonga'], 'TT': ['塔塔尔语', 'Tatar'], 'TW': ['特威语', 'Twi'], 'TY': ['塔希提语', 'Tahitian'], 'UG': ['维吾尔语', 'Uighur, Uyghur'], 'UK': ['乌克兰语', 'Ukrainian'], 'UR': ['乌尔都语', 'Urdu'], 'UZ': ['乌兹别克语', 'Uzbek'], 'VE': ['文达语', 'Venda'], 'VI': ['越南语', 'Vietnamese'], 'VO': ['沃拉普克语', 'Volapük'], 'WA': ['瓦隆语', 'Walloon'], 'WO': ['沃洛夫语', 'Wolof'], 'XH': ['科萨语', 'Xhosa'], 'YI': ['依地语', 'Yiddish'], 'YO': ['约鲁巴语', 'Yoruba'], 'ZA': ['壮语', 'Zhuang, Chuang'], 'ZH': ['汉语(中文)', 'Chinese'], 'ZU': ['祖鲁语', 'Zulu']}
+
 def word_list(split_words):
 """
 Parameters
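The new LangCodes constant maps upper-cased two-letter language codes to a [Chinese name, English name] pair, so a lookup is plain dict indexing:

from PgsFile import LangCodes

print(LangCodes["ZH"])     # ['汉语(中文)', 'Chinese']
print(LangCodes["DE"][1])  # 'German'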
@@ -482,13 +484,8 @@ def batch_word_list(input_root):
 ----------
 input_root : TYPE string
 DESCRIPTION.
-It's a folder path like
-
-For example, the text of D:\seg_only\1.txt should be like:
-PgsFile is Python library to facilitate Python beginners ,
-especially instructors and students of foreign languages and literature,
-for the convenience of easily operating txt ,
-xlsx and json files as well as making word list .
+It's a folder path like seg_only.
+Based on tokenized text.
 
 Returns
 -------
@@ -505,7 +502,7 @@ def batch_word_list(input_root):
 ('literature', [1, 1]),]
 '''
 from PgsFile import get_data_text as gt, FilePath as fp, BigPunctuation as bp
-# input_root=r"
+# input_root=r"047_Scraping\seg_only"
 file_names=fp(input_root)
 
 from collections import defaultdict
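A hedged usage sketch for batch_word_list (the folder path is illustrative; per the docstring the folder should hold tokenized txt files, and entries come back in the form ('literature', [1, 1])):

from PgsFile import batch_word_list

records = batch_word_list(r"D:\seg_only")
print(records[:3])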
@@ -749,11 +746,10 @@ def cs(para):
 # import zhon
 # rst=re.findall(zhon.hanzi.sentence, para)
 # return rst #['我买了一辆车。', '妈妈做的菜,很好吃!']
-
-para=re.sub('(
-para=re.sub('(
-para=re.sub('(
-para=re.sub('([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
+para=re.sub(r'([。!?\?])([^”’])', r"\1\n\2", para) # 单字符断句符
+para=re.sub(r'(\.{6})([^”’])', r"\1\n\2", para) # 英文省略号
+para=re.sub(r'(\…{2})([^”’])', r"\1\n\2", para) # 中文省略号
+para=re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
 # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号
 para=para.rstrip() # 段尾如果有多余的\n就去掉它
 # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
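The rewritten rules switch to raw strings, which silences Python's invalid-escape-sequence warnings without changing the patterns, and they split after sentence-final punctuation, six-dot ellipses, and Chinese ellipses while keeping closing quotes attached to their sentence. Assuming cs, like the commented-out zhon variant, returns the sentence list:

from PgsFile import cs

print(cs("我买了一辆车。妈妈做的菜,很好吃!"))
# expected, per the comment kept in the source: ['我买了一辆车。', '妈妈做的菜,很好吃!']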
@@ -784,7 +780,7 @@ def cs2(text):
 alphabets="([A-Za-z])"
 prefixes="(Mr|St|Mrs|Ms|Dr)[.]"
 suffixes="(Inc|Ltd|Jr|Sr|Co)"
-starters="(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+starters=r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
 acronyms="([A-Z][.][A-Z][.](?:[A-Z][.])?)"
 websites="[.](com|net|org|io|gov)"
 digits="([0-9])"
@@ -796,7 +792,7 @@ def cs2(text):
 text=re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
 if "..." in text: text=text.replace("...","<prd><prd><prd>")
 if "Ph.D" in text: text=text.replace("Ph.D.","Ph<prd>D<prd>")
-text=re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
+text=re.sub(r"\s" + alphabets + "[.] "," \\1<prd> ",text)
 text=re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
 text=re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
 text=re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
@@ -1050,7 +1046,7 @@ def replace_english_punctuation_with_chinese(text):
 # 定义英文标点和对应的中文标点的映射关系
 punctuation_mapping={
 ',': ',',
-'.': '。',
+# '.': '。', # 去掉!
 '?': '?',
 '!': '!',
 ';': ';',
@@ -1073,8 +1069,8 @@ def extract_misspelled_words_from_docx(file_path, mode=None):
 Parameters
 ----------
 file_path : TYPE string
-DESCRIPTION. r"
-mode : TYPE, optional
+DESCRIPTION. r"DocsMetrics for Translation Quality Assessment_A Case for Standardising Error Typologies.docx"
+mode : TYPE, optional string
 DESCRIPTION.
 1. The default is None, which means extracting all words with double underlines and wavy lines.
 2. The "spell" mode means extracting all words with wavy red lines.
@@ -1146,13 +1142,13 @@ def get_text_length_kb(text: str) -> str:
 Get the length of a text string in KB (kilobytes, eg.26.5 KB).
 """
 # Get the length of the text in bytes
-text_bytes
+text_bytes=len(text.encode('utf-8'))
 
 # Convert the length to KB
-text_kb
-rounded_num
+text_kb=text_bytes / 1024
+rounded_num=round(text_kb, 2)
 
-text_kb
+text_kb=f'{rounded_num} KB'
 print(type(text_kb))
 
 return text_kb
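The restored assignments make the arithmetic concrete: UTF-8 byte length, divided by 1024 and rounded to two decimals. A quick check against the docstring's own '26.5 KB' example:

from PgsFile import get_text_length_kb

s = "a" * 27136                # 27136 bytes in UTF-8
print(get_text_length_kb(s))   # '26.5 KB', since 27136 / 1024 = 26.5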
@@ -1179,8 +1175,8 @@ def generate_password(length: int) -> str:
 """
 import random
 # Define the set of characters to choose from
-character_set
-random_password
+character_set="1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+=-"
+random_password=''.join(random.choice(character_set) for _ in range(length))
 
 return random_password
 
@@ -1196,14 +1192,14 @@ def extract_numbers(string: str) -> list:
 """
 import re
 # Define a regular expression to match one or more digits
-digit_pattern
+digit_pattern=re.compile(r'(\d+)')
 
 # Split the input string using the regular expression
-fragments
+fragments=digit_pattern.split(string)
 
 # Convert every other fragment to an integer (the ones that match the digit pattern)
 for i in range(1, len(fragments), 2):
-fragments[i]
+fragments[i]=int(fragments[i])
 
 return fragments
 
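Because the digit pattern is a capturing group, re.split keeps each match at the odd indices, which is exactly what the range(1, len(fragments), 2) loop relies on:

import re

digit_pattern = re.compile(r'(\d+)')
fragments = digit_pattern.split("file12name7")
print(fragments)  # ['file', '12', 'name', '7', '']
# after the int() loop: ['file', 12, 'name', 7, '']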
@@ -1220,7 +1216,7 @@ def sort_strings_with_embedded_numbers(strings: list) -> list:
 list: A new list containing the sorted strings.
 """
 # Sort the strings using the extract_numbers() function as the key
-sorted_strings
+sorted_strings=sorted(strings, key=extract_numbers)
 
 return sorted_strings
 
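Using extract_numbers as the key yields a natural sort, so embedded numbers compare numerically instead of character by character:

from PgsFile import sort_strings_with_embedded_numbers

print(sort_strings_with_embedded_numbers(["ch10.txt", "ch2.txt", "ch1.txt"]))
# ['ch1.txt', 'ch2.txt', 'ch10.txt'] (plain sorted() would give ch1, ch10, ch2)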
@@ -1237,10 +1233,10 @@ def run_command(command: str) -> str:
 """
 import subprocess
 # Run the command and capture the output
-output
+output=subprocess.check_output(command, shell=True)
 
 # Decode the output from bytes to string
-output_str
+output_str=output.decode()
 
 return output_str
 
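The completed body shells out via subprocess.check_output(command, shell=True), so only trusted strings should be passed; the captured bytes are decoded to text before being returned:

from PgsFile import run_command

print(run_command("python --version"))  # e.g. 'Python 3.11.4', depending on the host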
@@ -1248,6 +1244,7 @@ def run_command(command: str) -> str:
 import random
 import requests
 from lxml import html, etree
+import pandas as pd
 my_headers={"User-Agent": random.choice(yhd)}
 class PGScraper(object):
 def __init__(self):
@@ -1260,31 +1257,27 @@ class PGScraper(object):
 valid_xpath=[]
 valid_span=[]
 # Example HTML content
-if headers is None:
-real_headers=my_headers
-else:
-real_headers=headers
 if timeout is None:
 real_timeout=24.0
 else:
 real_timeout=timeout
 
-r=requests.get(url,timeout=real_timeout,headers=
+r=requests.get(url,timeout=real_timeout,headers=headers, cookies=cookies, params=params, proxies=proxies)
 if r.status_code==200:
 r.encoding="utf-8"
 html_content=r.content
 # Parse HTML content
-tree
+tree=html.fromstring(html_content)
 relative_xpaths=[]
 for text in want_list:
 # Find elements containing the text
-elements
+elements=tree.xpath(f"//*[contains(text(), '{text}')]")
 if not elements:
 return None
 
 # Assume we want the first matching element
-element
-absolute_xpath
+element=elements[0]
+absolute_xpath=tree.getroottree().getpath(element)
 relative_xpaths.append(absolute_xpath)
 
 path1=relative_xpaths[0]
@@ -1312,7 +1305,7 @@ class PGScraper(object):
 all_want_list.append(clean_list(target_eles))
 valid_xpath.append(my_path)
 except:
-error_type, value, traceback
+error_type, value, traceback=sys.exc_info()
 error_info=f'{error_type}\n{value}\n{traceback}'
 print(error_info)
 
@@ -1345,7 +1338,7 @@ class PGScraper(object):
 all_want_list.append((clean_list(target_eles),clean_list(target_url_eles)))
 valid_xpath.append((my_path,my_path_url))
 except:
-error_type, value, traceback
+error_type, value, traceback=sys.exc_info()
 error_info=f'{error_type}\n{value}\n{traceback}'
 print(error_info)
 
@@ -1378,21 +1371,17 @@ class PGScraper(object):
 def get_similar_text(self, url, timeout=None, headers=None, cookies=None, params=None, proxies=None):
 all_want_list=[]
 # Example HTML content
-if headers is None:
-real_headers=my_headers
-else:
-real_headers=headers
 if timeout is None:
 real_timeout=24.0
 else:
 real_timeout=timeout
 
-r=requests.get(url,timeout=real_timeout,headers=
+r=requests.get(url, timeout=real_timeout, headers=headers, cookies=cookies, params=params, proxies=proxies)
 if r.status_code==200:
 r.encoding="utf-8"
 html_content=r.content
 # Parse HTML content
-tree
+tree=html.fromstring(html_content)
 if self.show_url==True:
 for pat,url in self.pattern:
 target_eles=tree.xpath(pat)
@@ -1406,3 +1395,337 @@ class PGScraper(object):
 else:
 print(r.status_code,"invalid url",url)
 return all_want_list
+
+
+
+
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Sep 17 16:11:45 2020
+Showing download progress and speed when audio-visual files like MP4, MP3, JPG etc are downloading!
+@author: Petercusin
+"""
+
+import time
+from contextlib import closing
+
+def audiovisual_downloader(url, path):
+with closing(requests.get(url, stream=True, headers=my_headers)) as r:
+chunk_size=1024*10
+content_size=int(r.headers['content-length'])
+print('Initiating download...')
+with open(path, "wb") as f:
+p=ProgressData(size=content_size, unit='Kb', block=chunk_size)
+for chunk in r.iter_content(chunk_size=chunk_size):
+f.write(chunk)
+p.output()
+
+class ProgressData(object):
+def __init__(self, block, size, unit, file_name='', ):
+self.file_name=file_name
+self.block=block/1000.0
+self.size=size/1000.0
+self.unit=unit
+self.count=0
+self.start=time.time()
+def output(self):
+self.end=time.time()
+self.count += 1
+speed=self.block/(self.end-self.start) if (self.end-self.start)>0 else 0
+self.start=time.time()
+loaded=self.count*self.block
+progress=round(loaded/self.size, 4)
+if loaded >= self.size:
+print(u'%sYour download has finished successfully.\r\n'%self.file_name)
+else:
+print(u'{0}Download Progress: {1:.2f}{2}/{3:.2f}{4} {5:.2%} Download Speed: {6:.2f}{7}/s'.\
+format(self.file_name, loaded, self.unit,\
+self.size, self.unit, progress, speed, self.unit))
+print('%50s'%('/'*int((1-progress)*50)))
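The downloader streams the response in 10 KB chunks and reports progress through ProgressData after every chunk. A usage sketch (the URL and output path are illustrative; the function depends on the module-level my_headers and on the server returning a content-length header):

from PgsFile import audiovisual_downloader

audiovisual_downloader(
    "https://example.com/sample.mp4",  # hypothetical media URL
    r"D:\downloads\sample.mp4",        # local target path
)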
+
+
+def levenshtein_distance(s, t):
+m, n=len(s), len(t)
+if m < n:
+s, t=t, s
+m, n=n, m
+d=[list(range(n + 1))] + [[i] + [0] * n for i in range(1, m + 1)]
+for j in range(1, n + 1):
+for i in range(1, m + 1):
+if s[i - 1]==t[j - 1]:
+d[i][j]=d[i - 1][j - 1]
+else:
+d[i][j]=min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1]) + 1
+return d[m][n]
+
+def compute_similarity(input_string, reference_string):
+distance=levenshtein_distance(input_string, reference_string)
+max_length=max(len(input_string), len(reference_string))
+similarity=1 - (distance / max_length)
+return similarity
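A worked check of the pair: 'kitten' to 'sitting' takes three single-character edits, and compute_similarity normalizes that distance by the longer string's length. compute_similarity is re-exported by the package; the distance helper stays on the submodule:

from PgsFile import compute_similarity
from PgsFile.PgsFile import levenshtein_distance

print(levenshtein_distance("kitten", "sitting"))  # 3
print(compute_similarity("kitten", "sitting"))    # 1 - 3/7, about 0.571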
+
+pgs_abbres_words=['A.B.','A.D.','A.G.','A.I.','A.M.','A.P.','A.V.','AFP.','Ala.','Alta.','Apr.','Ariz.','Ark.','Assn.','Aug.','Ave.','B.A.','B.C','B.C.','B.Ed.','B.I.G','B.R.','B.S.','Blvd.','Brig.','Brig.-Gen.','Bros.','C.D.','C.E.O','C.I.A.','C.M.','C.V.','Calif.','Capt.','Cf.','Ch.','Cie.','Cir.','Cllr.','Cmdr.','Co.','Co.Design','Col.','Colo.','Conn.','Corp.','Cos.','Coun.','Cpl.','Cres.','D.C.','D.D.S.','D.J.','D.K.','D.S.','Dec.','Del.','Dept.','Det.','Dr.','E.B.','E.C.','E.ON','E.U.','E.coli','E.g.','Ed.','Esq.','F.C.','Feb.','Fig.','Fla.','Fri.','G.K.','G.M.','G.Skill','Ga.','Gen.','Gov.','Govt.','H.E.','H.L.','H.S.','Hon.','Hwy.','I.T.','I.e.','Ill.','Inc.','Ind.','J.Crew','J.D.','J.G.','J.P','J.R.R.','Jan.','Jr.','Jul.','Jun.','K.C.','K.J.','K.M.','K.N.','K.P.','K.R.','Kan.','Ky.','L.A.','L.L.','L.S.','LLC.','La.','Lieut.','Lt.','Lt.-Cmdr.','Lt.-Col.','Lt.-Gen.','Ltd.','M.A.','M.B.','M.B.A.','M.D.','M.E.N','M.I.A.','M.J.','M.M.','M.P.','M.S.','Maj.','Maj.-Gen.','Man.','Mar.','Mass.','Md.','Messrs.','Mfg.','Mfrs.','Mich.','Minn.','Miss.','Mmes.','Mo.','Mon.','Mr.','Mrs.','Ms.','Msgr.','Mss.','N.A.','N.B.','N.C.','N.D.','N.H.','N.J.','N.L.','N.M.','N.S.','N.W.A.','N.W.T.','N.Y.','Neb.','Nev.','No.','Nos.','Nov.','O.C.','O.K.','O.S.','Oct.','Okla.','Ont.','Op.','Ore.','P.C.','P.E.','P.E.I.','P.K.','P.M.','P.O.','P.R.','P.S.','Pa.','Ph.D','Ph.D.','Plc.','Pres.','Prof.','Psy.D.','Pte.','Que.','R.E.M.','R.I.','R.I.P.','R.M','R.R.','Rd.','Rep.','Rev.','Rs.','Rt.','S.A.','S.C.','S.D.','S.F.','S.H.I.E.L.D.','S.K.','S.League','S.M.','S.P.','Sask.','Sat.','Sec.','Sen.','Sep.','Sgt.','Sr.','St.','Ste.','Sub-Lieut.','Sun.','Supt.','T.A.','T.R.','T.V.','TV.','Tenn.','Tex.','Thu.','Tue.','Twp.','U.A.E.','U.K.','U.N','U.P.','U.S','U.S.','U.S.A.','U.S.C.','UK.','US.','V.P.','Va.','Vol.','Vt.','W.H.O.','W.Va.','Wash.','Wed.','Wis.','Y.T.','a.m.','abr.','anon.','bk.','bks.','bull.','c.','ca.','cf.','ch.','def.','e.g.','ed.','eds.','et al.','etc.','fig.','ft.','fwd.','gal.','i.e.','ibid.','illus.','in.','jour.','lb.','mag.','mi.','ms.','mss.','no.','oz.','p.','p.m.','pg.','pgs.','pp.','pseud.','pt.','pts.','pub.','qt.','qtd.','ser.','supp.','trans.','viz.','vol.','vols.','vs.','yd.']
+
+def clean_text(text): #清洗除了句号以外的其他标点符号问题
+# 在标点符号右边邻接单词前添加空格
+import re
+text=replace_chinese_punctuation_with_english(text)
+text=re.sub(r'(?<=[\?\!\,\;\:\)\]\}])\s*(?=\w)', ' ', text)
+# 删除标点符号与左边单词之间的空格
+text=re.sub(r'\s*([\?\!\,\;\:\)\]\}\>])', r'\1', text)
+# 删除标点符号与右边单词之间的空格
+text=re.sub(r'\s*\(\s*', r' (', text)
+text=re.sub(r'\s*\[\s*', r' [', text)
+text=re.sub(r'\s*\{\s*', r' {', text)
+text=re.sub(r'\s*\<\s*', r' <', text)
+# 处理多余的空格
+text=re.sub(r'\s{2,}', ' ', text)
+text=re.sub(r'-{2,}', '-', text)
+return text
+
+def clean_text_with_abbreviations(text):
+import re
+text=clean_text(text)
+matches=[]
+for seg in text.split():
+if "." in seg:
+if seg.endswith(".") is False:
+matches.append(seg)
+elif seg.endswith("..") and "..." not in seg:
+text=text.replace("..", ".")
+
+for match in matches:
+if any(word in match for word in pgs_abbres_words):
+inter=match.split(".")
+new_match="".join([w+"." for w in inter[0:-1]])+" "+inter[-1]
+text=text.replace(match, new_match)
+else:
+text=text.replace(match, match.replace(".",". "))
+text=re.sub(r'\s+\.', '.', text)
+return text
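clean_text_with_abbreviations first normalizes punctuation spacing via clean_text, then re-spaces any token containing a period unless it matches the pgs_abbres_words whitelist, so abbreviations keep their dots while run-together sentences get separated. A sketch of the intended behaviour (exact spacing can vary with the regex details):

from PgsFile import clean_text_with_abbreviations

print(clean_text_with_abbreviations("Dr.Smith arrived.Then we left."))
# intended: 'Dr. Smith arrived. Then we left.'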
+
+import shutil
+def move_file(source_file, destination_folder, new_file_name=None):
+"""
+Move/cut a file to another folder.
+
+Parameters:
+source_file (str): The path to the source file.
+destination_folder (str): The path to the destination folder.
+new_file_name (str, optional): The new name for the file in the destination folder. Defaults to None.
+"""
+# Ensure the destination folder exists
+if not os.path.exists(destination_folder):
+os.makedirs(destination_folder)
+
+# Construct the destination file path
+if new_file_name:
+destination_file=os.path.join(destination_folder, new_file_name)
+else:
+destination_file=os.path.join(destination_folder, os.path.basename(source_file))
+
+# Move the file to the destination folder
+shutil.move(source_file, destination_file)
+
+print(f"File moved from {source_file} to {destination_file}")
+
+def check_empty_cells(file_path):
+"""
+Check for any empty cells in an Excel file and return their exact positions.
+
+Parameters:
+file_path (str): The path to the Excel file.
+
+Returns:
+list of tuples: A list of tuples where each tuple contains the column ID and row ID of an empty cell. If no empty cells are found, an empty list is returned.
+
+Example:
+empty_cells=check_empty_cells('your_file.xlsx')
+if empty_cells:
+print(f"Empty cells found at positions: {empty_cells}")
+else:
+print("No empty cells found.")
+"""
+# Read the Excel file
+df=pd.read_excel(file_path)
+
+# Initialize a list to store the positions of empty cells
+empty_cells=[]
+
+# Iterate over the DataFrame to find empty cells
+for row_id, row in df.iterrows():
+for col_id, value in row.items():
+if pd.isnull(value):
+empty_cells.append((col_id, row_id))
+
+return empty_cells
+
+def makefile(file_path):
+if os.path.exists(file_path):
+pass
+else:
+write_to_txt(file_path, "")
+
+
+def save_dict_to_excel(data, output_file, headers=None):
+"""
+Save Python dictionary data into an Excel .xlsx file with custom headers.
+
+Parameters:
+data (dict): The dictionary containing the data to be saved.
+output_file (str): The path to the output Excel file.
+headers (list of str, optional): A list of strings representing the headers for the Excel file. Defaults to ['Key', 'Value'] if not provided.
+
+Returns:
+None
+
+Example:
+data={'key1': 'value1', 'key2': 'value2'}
+output_file='output.xlsx'
+save_dict_to_excel(data, output_file) # Uses default headers
+save_dict_to_excel(data, output_file, headers=['Source Text', 'Target Text']) # Uses custom headers
+"""
+if headers is None:
+headers=['Key', 'Value']
+elif len(headers) != 2:
+raise ValueError("Headers list must contain exactly 2 elements.")
+
+# Convert the dictionary to a DataFrame
+df=pd.DataFrame(list(data.items()), columns=headers)
+
+# Save the DataFrame to an Excel file
+df.to_excel(output_file, index=False)
+
+def len_rows(file_path):
+"""
+Calculate the number of rows in an Excel file based on the largest row number of any possible columns.
+
+Parameters:
+file_path (str): The path to the Excel file.
+
+Returns:
+int: The number of rows in the Excel file.
+"""
+# Read the Excel file
+df=pd.read_excel(file_path)
+
+# Get the number of rows
+row_count=df.shape[0]
+
+return row_count
+
+def format_float(number, decimal_places=2):
+"""
+Format a float to a specified number of decimal places.
+
+Parameters:
+number (float): The float number to be formatted.
+decimal_places (int, optional): The number of decimal places to format the number to. Defaults to 2.
+
+Returns:
+str: The formatted number as a string with the specified number of decimal places.
+
+Example:
+formatted_number=format_float(3.1415926535)
+print(formatted_number) # Output: 3.14
+
+formatted_number=format_float(3.1415926535, 4)
+print(formatted_number) # Output: 3.1416
+"""
+formatted_number="{:.{precision}f}".format(number, precision=decimal_places)
+return formatted_number
+
+def get_data_html_offline(file_path):
+"""
+Reads a local HTML file and extracts specific elements.
+Parameters:
+file_path (str): The path to the local HTML file. my_html="Top 5 Web Scraping Methods_ Including Using LLMs - Comet.mhtml"
+
+Returns: html
+
+XPath common usages:
+rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
+rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
+rst1 = html.xpath('//div[@class="_16zCst"]/h1/text()')
+rst2 = html.xpath('//p[1]/text()') # Get the text content of the first p node
+rst3 = html.xpath('//p[position()<3]/text()') # Get the text content of the first two p nodes
+rst4 = html.xpath('//p[last()]/text()') # Get the text content of the last p node
+rst5 = html.xpath('//a[2]/@href') # Get the href attribute of the second a node
+
+"""
+if file_path.endswith(".mhtml"):
+import pimht
+mhtml = pimht.from_filename(file_path)
+longest_length = 0
+html_content = ""
+for mhtml_part in mhtml:
+if "text/html" in mhtml_part.content_type:
+possible_html=mhtml_part.text
+current_length = len(possible_html)
+if current_length > longest_length:
+longest_length = current_length
+html_content = possible_html
+# Parse the HTML content
+html = etree.HTML(html_content)
+else: #.html
+html=etree.parse(file_path,etree.HTMLParser())
+return html
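Both branches return an lxml element tree, so the XPath recipes from the docstring work on saved .mhtml captures (where the longest text/html part is taken as the page) as well as plain .html files. The file name below is the docstring's own example:

from PgsFile import get_data_html_offline

html = get_data_html_offline("Top 5 Web Scraping Methods_ Including Using LLMs - Comet.mhtml")
print(html.xpath('//p[1]/text()'))  # text of the first p node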
+
+def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=None, params=None, proxies=None):
+'''
+rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
+rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
+rst1 = html.xpath('//div[@class="_16zCst"]/h1/text()')
+rst2 = html.xpath('//p[1]/text()') # Get the text content of the first p node
+rst3 = html.xpath('//p[position()<3]/text()') # Get the text content of the first two p nodes
+rst4 = html.xpath('//p[last()]/text()') # Get the text content of the last p node
+rst5 = html.xpath('//a[2]/@href') # Get the href attribute of the second a node
+'''
+# Example HTML content
+if timeout is None:
+real_timeout=24.0
+else:
+real_timeout=timeout
+try:
+time.sleep(round(random.uniform(1.0, 3.9), 19))
+r=requests.get(url, timeout=real_timeout, headers=headers, cookies=cookies, params=params, proxies=proxies)
+print(r.status_code) # print the reponse status code
+if r.status_code==200:
+if html==False:
+return r
+else:
+r.encoding="utf-8"
+data=r.text
+html=etree.HTML(data)
+return html
+else:
+print(r.status_code, "Can not find the page!")
+return None
+except Exception as err:
+print(err)
+
+def find_table_with_most_rows(tables):
+max_rows=0
+max_table_index=-1
+for i, table in enumerate(tables):
+if isinstance(table, pd.DataFrame) and table.shape[0] > max_rows:
+max_rows=table.shape[0]
+max_table_index=i
+return max_table_index, max_rows if max_table_index!= -1 else None
+
+def get_data_table(url, output_file, most_rows=True):
+try:
+tables=pd.read_html(url)
+if most_rows==False:
+# 1. default: the first table
+df=tables[0]
+else:
+# 2. get the table with most rows
+target_table=find_table_with_most_rows(tables)[0] # (1, 32)
+df=tables[target_table]
+
+df.to_excel(output_file, index=False)
+print(f"Data has been saved to {output_file}")
+except Exception as err:
+print(f"Errors found! {err}")
+return None
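get_data_table chains the two helpers: pd.read_html collects every table on the page, find_table_with_most_rows picks the longest one (its return expression binds as a tuple, so index [0] is always the table index), and the chosen DataFrame is written straight to .xlsx. A sketch with an illustrative URL:

from PgsFile import get_data_table

get_data_table("https://example.com/stats.html", "stats.xlsx")  # saves the largest table on the page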
PgsFile/__init__.py
CHANGED
@@ -1,29 +1,45 @@
+# 1. Web scraping
 from .PgsFile import PGScraper
+from .PgsFile import audiovisual_downloader
 
+# 2. Package/library management
 from .PgsFile import install_package, uninstall_package
 from .PgsFile import run_script, run_command
 
+# 3. Text data retrieval
 from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
 from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
+from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
 
-
+# 4. Text data storage
+from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, save_dict_to_excel
 
-
+# 5. File/folder process
+from .PgsFile import FilePath, FileName, DirList
+from .PgsFile import get_subfolder_path, get_package_path
+from .PgsFile import makedirec, makefile
 from .PgsFile import source_path, next_folder_names, corpus_root, get_directory_tree_with_meta, find_txt_files_with_keyword
-from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line
+from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
 
-
+# 6. Data cleaning
+from .PgsFile import BigPunctuation, StopTags, Special, yhd
 from .PgsFile import ZhStopWords, EnPunctuation, extract_stopwords
-from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008
+from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
+from .PgsFile import check_contain_chinese, check_contain_number
+from .PgsFile import replace_chinese_punctuation_with_english
+from .PgsFile import replace_english_punctuation_with_chinese
+from .PgsFile import clean_list, clean_text_with_abbreviations
+from .PgsFile import extract_chinese_punctuation, generate_password, sort_strings_with_embedded_numbers
 
-
+# 7. NLP (natural language processing)
+from .PgsFile import strQ2B_raw, strQ2B_words
+from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
 from .PgsFile import word_list, batch_word_list
 from .PgsFile import cs, cs1, cs2
 
-
-from .PgsFile import
-from .PgsFile import
-from .PgsFile import
-from .PgsFile import clean_list, yhd, extract_chinese_punctuation, generate_password, sort_strings_with_embedded_numbers
+# 8. Maths
+from .PgsFile import len_rows, check_empty_cells
+from .PgsFile import format_float, decimal_to_percent, Percentage
+from .PgsFile import get_text_length_kb, extract_numbers
 
 name = "PgsFile"
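With the regrouped exports, names from several sections are importable straight off the package; a small smoke test (file names are illustrative):

from PgsFile import get_data_text, write_to_json, LangCodes, compute_similarity

text = get_data_text("raw_text.txt")               # 3. text data retrieval
write_to_json("data.json", {"chars": len(text)})   # 4. text data storage
print(LangCodes["EN"])                             # ['英语', 'English']
print(compute_similarity("color", "colour"))       # about 0.833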
{PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/METADATA
CHANGED

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: PgsFile
-Version: 0.1.
-Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
+Version: 0.1.6
+Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
 Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
 Author: Pan Guisheng
 Author-email: 895284504@qq.com
@@ -18,8 +18,9 @@ Requires-Dist: python-docx
 Requires-Dist: pip
 Requires-Dist: requests
 Requires-Dist: lxml
+Requires-Dist: pimht
 
-Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
+Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
 
 
 Function 1: Enables efficient data retrieval and storage in files with a single line of code.
@@ -34,7 +35,7 @@ Function 5: This library provides support for common text cleaning tasks, such a
 
 Function 6: It also manages Python package installations and uninstallations, and allows running scripts and commands in Python interactive command lines instead of Windows command prompt.
 
-Function 7:
+Function 7: Download audiovisual files like videos, images, and audio using audiovisual_downloader, which is extremely useful and efficient. Additionally, scrape newspaper data with PGScraper, a highly efficient tool for this purpose.
 
 Table 1: The directory and size of Pgs-Corpora
 ├── Idioms (1, 171.78 KB)
{PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
-PgsFile/PgsFile.py,sha256=
-PgsFile/__init__.py,sha256=
+PgsFile/PgsFile.py,sha256=jmSiczDE5cV47tHpCGDwLn19C90NGQtQ2vEn4ys4NUg,80514
+PgsFile/__init__.py,sha256=EKhIRd2tktjyrvBlBPgQsIJTqU7DdLIobNG8gEiZ--0,2163
 PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
 PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
 PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
 PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
+PgsFile-0.1.6.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
+PgsFile-0.1.6.dist-info/METADATA,sha256=T0mBPq7PnljEcGjLItIJ3RIcZk7veOuy0vVgLuo31lo,4902
+PgsFile-0.1.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+PgsFile-0.1.6.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
+PgsFile-0.1.6.dist-info/RECORD,,
{PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/LICENSE
File without changes

{PgsFile-0.1.4.dist-info → PgsFile-0.1.6.dist-info}/top_level.txt
File without changes