PgsFile 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic.
- PgsFile/PgsFile.py +359 -157
- PgsFile/__init__.py +32 -17
- {PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/METADATA +4 -2
- {PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/RECORD +7 -7
- {PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/LICENSE +0 -0
- {PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/WHEEL +0 -0
- {PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py CHANGED
@@ -68,7 +68,7 @@ def get_data_text(path):
     ----------
     path : TYPE string
         DESCRIPTION.
-        Using path to get data from a single txt file. eg.
+        Using path to get data from a single txt file. eg. raw_text.txt
     Theoretically, it supports all the text encoding formats, like utf-8, unicode, ansi, gbk etc.
 
     Returns
@@ -95,7 +95,7 @@ def get_data_lines(path):
     ----------
     path : TYPE string
         DESCRIPTION.
-        Using path to get data from a single txt file. eg.
+        Using path to get data from a single txt file. eg. raw_text.txt
     Theoretically, it supports all the text encoding formats, like utf-8, unicode, ansi, gbk etc.
 
     Returns
@@ -152,7 +152,7 @@ def get_data_excel(excel_path,column_id,sheet_name=None):
     Parameters
     ----------
     excel_path : TYPE
-        DESCRIPTION.
+        DESCRIPTION. data_python.xlsx
 
     column_id : TYPE Int 0,1,2,3
         DESCRIPTION. 0 means the first column, 1 means the second.
@@ -180,7 +180,7 @@ def write_to_excel(excel_path,dic_of_list,sheet_name=None,index=None):
     Parameters
     ----------
     excel_path : TYPE
-        DESCRIPTION.
+        DESCRIPTION. results.xlsx
 
     dic_of_list : TYPE
         DESCRIPTION. {"col":["a","b","c","d"],"freq":[1,2,3,4]}
@@ -233,7 +233,7 @@ def get_tsv_lines(csv_path, delimiter=None):
     '''
     Parameters
     ----------
-    get_tsv_lines : TYPE
+    get_tsv_lines : TYPE data.tsv
        DESCRIPTION.
 
     Returns
@@ -261,7 +261,7 @@ def get_data_json(json_path):
     '''
     Parameters
     ----------
-    json_path : TYPE
+    json_path : TYPE data.json
        DESCRIPTION.
 
     Returns
@@ -285,7 +285,7 @@ def get_json_lines(json_path):
     '''
     Parameters
     ----------
-    json_path : TYPE
+    json_path : TYPE data.json
        DESCRIPTION.
 
     Returns
@@ -308,7 +308,7 @@ def write_to_json(json_path,my_dic):
     Parameters
     ----------
     json_path : TYPE string
-        DESCRIPTION.
+        DESCRIPTION. data.json
 
     my_dic : TYPE dict or list
         DESCRIPTION.
@@ -332,7 +332,7 @@ def write_to_json_lines(json_path,my_json_data):
     Parameters
     ----------
     json_path : TYPE string
-        DESCRIPTION.
+        DESCRIPTION. data.json
 
     my_json_data : TYPE dict or list
         DESCRIPTION.
@@ -358,6 +358,20 @@ def write_to_json_lines(json_path,my_json_data):
         file.write(json_str + '\n')
     file.close()
 
+
+# Function to append a dictionary to a JSON file
+def append_dict_to_json(file_path, data_dict):
+    try:
+        import json
+        with open(file_path, 'a', encoding="utf-8") as file:
+            json_string = json.dumps(data_dict, ensure_ascii=False)
+            file.write(json_string + '\n')
+            # print(f"Dictionary appended to {file_path}")
+    except IOError as e:
+        print(f"An I/O error occurred: {e}")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
 def FilePath(root):
     '''读取所有文件,列出每个文件的路径'''
     import os
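The new append_dict_to_json opens the file in append mode and writes one JSON object per call, i.e. a JSON Lines file. A minimal usage sketch; the output path log.jsonl and the records are assumptions, not part of the package:

from PgsFile import append_dict_to_json

# Each call appends one UTF-8 JSON object as a single line (JSON Lines style).
append_dict_to_json("log.jsonl", {"word": "corpus", "freq": 3})
append_dict_to_json("log.jsonl", {"word": "lexis", "freq": 7})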
@@ -449,7 +463,7 @@ def get_subfolder_path(parent_folder, subfolder_name):
|
|
|
449
463
|
BigPunctuation="""!"#$&\'()*+,-/:;<=>?@[\\]^_`{|}.%~"#$%&'?。()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。``''""" #除去英文标点.%
|
|
450
464
|
StopTags="""◆: 、/ 。/ ---/ -/ --/ -- :/ ;/ ?/ ??/ ?┖ @/ [/ ]/ ^/ ‘/ ’/ "/ "/ 〈/ 〉/ 《/ 》/ 【/ 】/ >/ ∶/ ■/ ●/ ·/ …/ !/ #/ %,/ %/ \'/ (/ )/ */ +/ ,/ -/ // np v n w m a x t q j ni ns d i f u p g nz c r id s k h o e / #?/ --/""" #用来停用词性标注
|
|
451
465
|
Special="""∶ ■ ● ① ② ③ × ℃ Ⅲ ④ ⑤ ◆ ⑥ ± ⑦ ⑧ → ⑨ ▲ ⑩ ─ ÷ μ γ β Ⅱ Ⅰ ‰ □ 〇 ○ Ⅴ Ⅳ ★ ﹐ ° ※ ︰ α ― ≠ █ о θ ω ⒈ ⒉ ⒊ н ≤ ì ǎ ≥ р т с к й а и Ⅵ é è ﹢ ﹝ ﹞ ā ⒋ ù π ◇ Ω Ф ы Я п К в у м ǒ ü á ǔ ⒌ ⒍ 䦆 Ⅹ Ⅶ ← """
|
|
452
|
-
ZhStopWords="""——— 》), )÷(1- ”, )、 =( : → ℃ & * 一一 ~~~~ ’ . 『 .一 ./ -- 』 =″ 【 [*] }> [⑤]] [①D] c] ng昉 * // [ ] [②e] [②g] ={ } ,也 ‘ A [①⑥] [②B] [①a] [④a] [①③] [③h] ③] 1. -- [②b] ’‘ ××× [①⑧] 0:2 =[ [⑤b] [②c] [④b] [②③] [③a] [④c] [①⑤] [①⑦] [①g] ∈[ [①⑨] [①④] [①c] [②f] [②⑧] [②①] [①C] [③c] [③g] [②⑤] [②②] 一. [①h] .数 [] [①B] 数/ [①i] [③e] [①①] [④d] [④e] [③b] [⑤a] [①A] [②⑧] [②⑦] [①d] [②j] 〕〔 ][ :// ′∈ [②④ [⑤e] 12% b] ... ................... …………………………………………………③ ZXFITL [③F] 」 [①o] ]∧′=[ ∪φ∈ ′| {- ②c } [③①] R.L. [①E] Ψ -[*]- ↑ .日 [②d] [② [②⑦] [②②] [③e] [①i] [①B] [①h] [①d] [①g] [①②] [②a] f] [⑩] a] [①e] [②h] [②⑥] [③d] [②⑩] e] 〉 】 元/吨 [②⑩] 2.3% 5:0 [①] :: [②] [③] [④] [⑤] [⑥] [⑦] [⑧] [⑨] …… —— ? 、 。 “ ” 《 》 ! , : ; ? . , . ' ? · ——— ── ? — < > ( ) 〔 〕 [ ] ( ) - + ~ × / / ① ② ③ ④ ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ Ⅲ В " ; # @ γ μ φ φ. × Δ ■ ▲ sub exp sup sub Lex # % & ' + +ξ ++ - -β < <± <Δ <λ <φ
|
|
466
|
+
ZhStopWords="""——— 》), )÷(1- ”, )、 =( : → ℃ & * 一一 ~~~~ ’ . 『 .一 ./ -- 』 =″ 【 [*] }> [⑤]] [①D] c] ng昉 * // [ ] [②e] [②g] ={ } ,也 ‘ A [①⑥] [②B] [①a] [④a] [①③] [③h] ③] 1. -- [②b] ’‘ ××× [①⑧] 0:2 =[ [⑤b] [②c] [④b] [②③] [③a] [④c] [①⑤] [①⑦] [①g] ∈[ [①⑨] [①④] [①c] [②f] [②⑧] [②①] [①C] [③c] [③g] [②⑤] [②②] 一. [①h] .数 [] [①B] 数/ [①i] [③e] [①①] [④d] [④e] [③b] [⑤a] [①A] [②⑧] [②⑦] [①d] [②j] 〕〔 ][ :// ′∈ [②④ [⑤e] 12% b] ... ................... …………………………………………………③ ZXFITL [③F] 」 [①o] ]∧′=[ ∪φ∈ ′| {- ②c } [③①] R.L. [①E] Ψ -[*]- ↑ .日 [②d] [② [②⑦] [②②] [③e] [①i] [①B] [①h] [①d] [①g] [①②] [②a] f] [⑩] a] [①e] [②h] [②⑥] [③d] [②⑩] e] 〉 】 元/吨 [②⑩] 2.3% 5:0 [①] :: [②] [③] [④] [⑤] [⑥] [⑦] [⑧] [⑨] …… —— ? 、 。 “ ” 《 》 ! , : ; ? . , . ' ? · ——— ── ? — < > ( ) 〔 〕 [ ] ( ) - + ~ × / / ① ② ③ ④ ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ Ⅲ В " ; # @ γ μ φ φ. × Δ ■ ▲ sub exp sup sub Lex # % & ' + +ξ ++ - -β < <± <Δ <λ <φ <<== =☆ =- > >λ _ ~± ~+ [⑤f] [⑤d] [②i] ≈ [②G] [①f] LI ㈧ [- ...... 〉 [③⑩] 第二 一番 一直 一个 一些 许多 种 有的是 也就是说 末##末 啊 阿 哎 哎呀 哎哟 唉 俺 俺们 按 按照 吧 吧哒 把 罢了 被 本 本着 比 比方 比如 鄙人 彼 彼此 边 别 别的 别说 并 并且 不比 不成 不单 不但 不独 不管 不光 不过 不仅 不拘 不论 不怕 不然 不如 不特 不惟 不问 不只 朝 朝着 趁 趁着 乘 冲 除 除此之外 除非 除了 此 此间 此外 从 从而 打 待 但 但是 当 当着 到 得 的 的话 等 等等 地 第 叮咚 对 对于 多 多少 而 而况 而且 而是 而外 而言 而已 尔后 反过来 反过来说 反之 非但 非徒 否则 嘎 嘎登 该 赶 个 各 各个 各位 各种 各自 给 根据 跟 故 故此 固然 关于 管 归 果然 果真 过 哈 哈哈 呵 和 何 何处 何况 何时 嘿 哼 哼唷 呼哧 乎 哗 还是 还有 换句话说 换言之 或 或是 或者 极了 及 及其 及至 即 即便 即或 即令 即若 即使 几 几时 己 既 既然 既是 继而 加之 假如 假若 假使 鉴于 将 较 较之 叫 接着 结果 借 紧接着 进而 尽 尽管 经 经过 就 就是 就是说 据 具体地说 具体说来 开始 开外 靠 咳 可 可见 可是 可以 况且 啦 来 来着 离 例如 哩 连 连同 两者 了 临 另 另外 另一方面 论 嘛 吗 慢说 漫说 冒 么 每 每当 们 莫若 某 某个 某些 拿 哪 哪边 哪儿 哪个 哪里 哪年 哪怕 哪天 哪些 哪样 那 那边 那儿 那个 那会儿 那里 那么 那么些 那么样 那时 那些 那样 乃 乃至 呢 能 你 你们 您 宁 宁可 宁肯 宁愿 哦 呕 啪达 旁人 呸 凭 凭借 其 其次 其二 其他 其它 其一 其余 其中 起 起见 起见 岂但 恰恰相反 前后 前者 且 然而 然后 然则 让 人家 任 任何 任凭 如 如此 如果 如何 如其 如若 如上所述 若 若非 若是 啥 上下 尚且 设若 设使 甚而 甚么 甚至 省得 时候 什么 什么样 使得 是 是的 首先 谁 谁知 顺 顺着 似的 虽 虽然 虽说 虽则 随 随着 所 所以 他 他们 他人 它 它们 她 她们 倘 倘或 倘然 倘若 倘使 腾 替 通过 同 同时 哇 万一 往 望 为 为何 为了 为什么 为着 喂 嗡嗡 我 我们 呜 呜呼 乌乎 无论 无宁 毋宁 嘻 吓 相对而言 像 向 向着 嘘 呀 焉 沿 沿着 要 要不 要不然 要不是 要么 要是 也 也罢 也好 一 一般 一旦 一方面 一来 一切 一样 一则 依 依照 矣 以 以便 以及 以免 以至 以至于 以致 抑或 因 因此 因而 因为 哟 用 由 由此可见 由于 有 有的 有关 有些 又 于 于是 于是乎 与 与此同时 与否 与其 越是 云云 哉 再说 再者 在 在下 咱 咱们 则 怎 怎么 怎么办 怎么样 怎样 咋 照 照着 者 这 这边 这儿 这个 这会儿 这就是说 这里 这么 这么点儿 这么些 这么样 这时 这些 这样 正如 吱 之 之类 之所以 之一 只是 只限 只要 只有 至 至于 诸位 着 着呢 自 自从 自个儿 自各儿 自己 自家 自身 综上所述 总的来看 总的来说 总的说来 总而言之 总之 纵 纵令 纵然 纵使 遵照 作为 兮 呃 呗 咚 咦 喏 啐 喔唷 嗬 嗯 嗳"""
|
|
453
467
|
EnPunctuation="""!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~"""
|
|
454
468
|
nltk_en_tags={'CC': '并列连词', 'CD': '基数词', 'DT': '限定符', 'EX': '存在词', 'FW': '外来词', 'IN': '介词或从属连词', 'JJ': '形容词', 'JJR': '比较级的形容词', 'JJS': '最高级的形容词', 'LS': '列表项标记', 'MD': '情态动词', 'NN': '名词单数', 'NNS': '名词复数', 'NNP': '专有名词', 'NNPS': '专有名词复数', 'PDT': '前置限定词', 'POS': '所有格结尾', 'PRP': '人称代词', 'PRP$': '所有格代词', 'RB': '副词', 'RBR': '副词比较级', 'RBS': '副词最高级', 'RP': '小品词', 'SYM': '符号', 'UH': '感叹词', 'VB': '动词原型', 'VBD': '动词过去式', 'VBG': '动名词或现在分词', 'VBN': '动词过去分词', 'VBP': '非第三人称单数的现在时', 'VBZ': '第三人称单数的现在时', 'WDT': '以wh开头的限定词', 'WP': '以wh开头的代词', 'WP$': '以wh开头的所有格代词', 'WRB': '以wh开头的副词', 'TO': 'to'}
|
|
455
469
|
nltk_tag_mapping={'NN': 'Noun', 'NNS': 'Noun', 'NNP': 'Noun', 'NNPS': 'Noun', 'VB': 'Verb', 'VBD': 'Verb', 'VBG': 'Verb', 'VBN': 'Verb', 'VBP': 'Verb', 'VBZ': 'Verb', 'JJ': 'Adjective', 'JJR': 'Adjective', 'JJS': 'Adjective', 'RB': 'Adverb', 'RBR': 'Adverb', 'RBS': 'Adverb', 'IN': 'Preposition', 'PRP': 'Pronoun', 'PRP$': 'Pronoun', 'DT': 'Determiner', 'CC': 'Conjunction', 'CD': 'Numeral', 'UH': 'Interjection', 'FW': 'Foreign Word', 'TO': 'Particle', 'EX': 'Existential "there"', 'MD': 'Modal Auxiliary', 'WDT': 'Wh-determiner', 'WP': 'Wh-pronoun', 'WP$': 'Possessive wh-pronoun', 'WRB': 'Wh-adverb', 'SYM': 'Symbol', 'RP': 'Particle', 'POS': 'Possessive ending', 'PDT': 'Predeterminer', 'LS': 'List item marker', 'NIL': 'Missing tag'}
|
|
@@ -457,6 +471,8 @@ nltk_tag_mapping={'NN': 'Noun', 'NNS': 'Noun', 'NNP': 'Noun', 'NNPS': 'Noun', 'V
|
|
|
457
471
|
ICTCLAS2008={'a': '形容词', 'ad': '副形词', 'ag': '形容词性语素', 'al': '形容词性惯用语', 'an': '名形词', 'b': '区别词', 'bl': '区别词性惯用语', 'c': '连词', 'cc': '并列连词', 'd': '副词', 'dg': '副词性语素', 'dl': '副词性惯用语', 'e': '叹词', 'ew': '句末标点', 'f': '方位词', 'h': '前缀', 'k': '后缀', 'm': '数词', 'mg': '数词性语素', 'mq': '数量词', 'n': '名词', 'ng': '名词性语素', 'nl': '名词性惯用语', 'nr': '汉语人名', 'nr1': '汉语姓氏', 'nr2': '汉语名字', 'nrf': '音译人名', 'nrj': '日语人名', 'ns': '地名', 'nsf': '音译地名', 'nt': '机构团体名', 'nz': '其他专名', 'o': '拟声词', 'p': '介词', 'pba': '介词“把”', 'pbei': '介词“被”', 'q': '量词', 'qt': '时量词', 'qv': '动量词', 'r': '代词', 'rg': '代词性语素', 'rr': '人称代词', 'ry': '疑问代词', 'rys': '处所疑问代词', 'ryt': '时间疑问代词', 'ryv': '谓词性疑问代词', 'rz': '指示代词', 'rzs': '处所指示代词', 'rzt': '时间指示代词', 'rzv': '谓词性指示代词', 's': '处所词', 't': '时间词', 'tg': '时间词性语素', 'u': '助词', 'udel': '的、底', 'ude2': '地', 'ude3': '得', 'udeng': '等、等等、云云', 'udh': '......的话', 'uguo': '过', 'ule': '了', 'ulian': '连', 'uls': '来讲、来说;而言、说来', 'usuo': '所', 'uyy': '一样、一般;似的、般', 'uzhe': '着', 'uzhi': '之', 'v': '动词', 'vd': '副动词', 'vf': '趋向动词', 'vg': '动词性语素', 'vi': '不及物动词', 'vl': '动词性惯用语', 'vn': '名动词', 'vshi': '动词“是”', 'vx': '形式动词', 'vyou': '动词“有”', 'w': '标点符号', 'wd': '逗号', 'wky': '右括号', 'wkz': '左括号', 'wm': '冒号', 'wn': '顿号', 'wp': '破折号', 'ws': '省略号', 'wy': '引号', 'x': '字符串', 'y': '语气词', 'z': '状态词'}
|
|
458
472
|
thulac_tags={'n': '名词', 'np': '人名', 'ns': '地名', 'ni': '机构名', 'nz': '其它专名', 'm': '数词', 'q': '量词', 'mq': '数量词', 't': '时间词', 'f': '方位词', 's': '处所词', 'v': '动词', 'a': '形容词', 'd': '副词', 'h': '前接成分', 'k': '后接成分', 'i': '习语', 'j': '简称', 'r': '代词', 'c': '连词', 'p': '介词', 'u': '助词', 'y': '语气助词', 'e': '叹词', 'o': '拟声词', 'g': '语素', 'w': '标点', 'x': '其它'}
|
|
459
473
|
|
|
474
|
+
LangCodes={'AA': ['阿法尔语', 'Afar'], 'AB': ['阿布哈兹语', 'Abkhaz'], 'AE': ['阿维斯陀语', 'Avestan'], 'AF': ['阿非利堪斯语', 'Afrikaans'], 'AK': ['阿坎语', 'Akan, Twi-Fante'], 'AM': ['阿姆哈拉语', 'Amharic'], 'AN': ['阿拉贡语', 'Aragonese'], 'AR': ['阿拉伯语', 'Arabic'], 'AS': ['阿萨姆语', 'Assamese'], 'AV': ['阿瓦尔语', 'Avaric'], 'AY': ['艾马拉语', 'Aymara'], 'AZ': ['阿塞拜疆语', 'Azerbaijani'], 'BA': ['巴什基尔语', 'Bashkir'], 'BE': ['白俄罗斯语', 'Belarusian'], 'BG': ['保加利亚语', 'Bulgarian'], 'BH': ['比哈尔语', 'Bihari'], 'BI': ['比斯拉玛语', 'Bislama'], 'BM': ['班巴拉语', 'Bambara'], 'BN': ['孟加拉语', 'Bengali'], 'BO': ['藏语', 'Tibetan Standard, Central Tibetan'], 'BR': ['布列塔尼语', 'Breton'], 'BS': ['波斯尼亚语', 'Bosnian'], 'CA': ['加泰隆语', 'Catalan;\xa0Valencian'], 'CE': ['车臣语', 'Chechen'], 'CH': ['查莫罗语', 'Chamorro'], 'CO': ['科西嘉语', 'Corsican'], 'CR': ['克里语', 'Cree'], 'CS': ['捷克语', 'Czech'], 'CU': ['教会斯拉夫语', 'Old Church Slavonic, Church Slavic, Church Slavonic, Old Bulgarian, Old Slavonic'], 'CV': ['楚瓦什语', 'Chuvash'], 'CY': ['威尔士语', 'Welsh'], 'DA': ['丹麦语', 'Danish'], 'DE': ['德语', 'German'], 'DV': ['迪维希语', 'Divehi; Dhivehi; Maldivian;'], 'DZ': ['不丹语', 'Dzongkha'], 'EE': ['埃维语', 'Ewe'], 'EL': ['现代希腊语', 'Greek, Modern'], 'EN': ['英语', 'English'], 'EO': ['世界语', 'Esperanto'], 'ES': ['西班牙语', 'Spanish; Castilian'], 'ET': ['爱沙尼亚语', 'Estonian'], 'EU': ['巴斯克语', 'Basque'], 'FA': ['波斯语', 'Persian'], 'FF': ['富拉语', 'Fula; Fulah; Pulaar; Pular'], 'FI': ['芬兰语', 'Finnish'], 'FJ': ['斐济语', 'Fijian'], 'FO': ['法罗斯语', 'Faroese'], 'FR': ['法语', 'French'], 'FY': ['弗里西亚语', 'Western Frisian'], 'GA': ['爱尔兰语', 'Irish'], 'GD': ['盖尔语(苏格兰语)', 'Scottish Gaelic; Gaelic'], 'GL': ['加利西亚语', 'Galician'], 'GN': ['瓜拉尼语', 'Guaraní'], 'GU': ['古吉拉特语', 'Gujarati'], 'GV': ['马恩岛语', 'Manx'], 'HA': ['豪萨语', 'Hausa'], 'HE': ['希伯来语', 'Hebrew\xa0(modern)'], 'HI': ['印地语', 'Hindi'], 'HO': ['希里莫图语', 'Hiri Motu'], 'HR': ['克罗地亚语', 'Croatian'], 'HT': ['海地克里奥尔语', 'Haitian; Haitian Creole'], 'HU': ['匈牙利语', 'Hungarian'], 'HY': ['亚美尼亚语', 'Armenian'], 'HZ': ['赫雷罗语', 'Herero'], 'I.E.': ['国际语E', 'Interlingue'], 'IA': ['国际语A', 'Interlingua'], 'ID': ['印尼语', 'Indonesian'], 'IG': ['伊博语', 'Igbo'], 'II': ['四川彝语(诺苏语)', 'Nuosu'], 'IK': ['依努庇克语', 'Inupiaq'], 'IO': ['伊多语', 'Ido'], 'IS': ['冰岛语', 'Icelandic'], 'IT': ['意大利语', 'Italian'], 'IU': ['伊努伊特语', 'Inuktitut'], 'JA': ['日语', 'Japanese'], 'JV': ['爪哇语', 'Javanese'], 'KA': ['格鲁吉亚语', 'Georgian'], 'KG': ['刚果语', 'Kongo'], 'KI': ['基库尤语', 'Kikuyu, Gikuyu'], 'KJ': ['夸尼亚玛语', 'Kwanyama, Kuanyama'], 'KK': ['哈萨克语', 'Kazakh'], 'KL': ['格陵兰语', 'Kalaallisut, Greenlandic'], 'KM': ['高棉语', 'Khmer, Cambodian'], 'KN': ['坎纳达语', 'Kannada'], 'KO': ['朝鲜语', 'Korean'], 'KR': ['卡努里语', 'Kanuri'], 'KS': ['克什米尔语', 'Kashmiri'], 'KU': ['库尔德语', 'Kurdish'], 'KV': ['科米语', 'Komi'], 'KW': ['康沃尔语', 'Cornish'], 'KY': ['吉尔吉斯语', 'Kirghiz, Kyrgyz'], 'LA': ['拉丁语', 'Latin'], 'LB': ['卢森堡语', 'Luxembourgish, Letzeburgesch'], 'LG': ['干达语', 'Luganda'], 'LI': ['林堡语', 'Limburgish, Limburgan, Limburger'], 'LN': ['林加拉语', 'Lingala'], 'LO': ['老挝语', 'Lao'], 'LT': ['立陶宛语', 'Lithuanian'], 'LU': ['卢巴—加丹加语', 'Luba-Katanga'], 'LV': ['拉脱维亚语', 'Latvian'], 'MG': ['马达加斯加语', 'Malagasy'], 'MH': ['马绍尔语', 'Marshallese'], 'MI': ['毛利语', 'Māori'], 'MK': ['马其顿语', 'Macedonian'], 'ML': ['马拉亚拉姆语', 'Malayalam'], 'MN': ['蒙古语', 'Mongolian'], 'MR': ['马拉提语', 'Marathi (Marāṭhī)'], 'MS': ['马来语', 'Malay'], 'MT': ['马耳他语', 'Maltese'], 'MY': ['缅甸语', 'Burmese'], 'NA': ['瑙鲁语', 'Nauru'], 'NB': ['挪威布克摩尔语', 'Norwegian Bokmål'], 'ND': ['北恩德贝勒语', 'North Ndebele'], 'NE': ['尼泊尔语', 'Nepali'], 'NG': ['恩敦加语', 'Ndonga'], 'NL': ['荷兰语', 'Dutch'], 'NN': ['尼诺斯克挪威语', 'Norwegian Nynorsk'], 'NO': 
['挪威语', 'Norwegian'], 'NR': ['南恩德贝勒语', 'South Ndebele'], 'NV': ['纳瓦霍语', 'Navajo, Navaho'], 'NY': ['尼扬贾语', 'Chichewa; Chewa; Nyanja'], 'OC': ['普罗旺斯语', 'Occitan'], 'OJ': ['奥吉布瓦语', 'Ojibwe, Ojibwa'], 'OM': ['阿芳•奥洛莫语', 'Oromo'], 'OR': ['奥利亚语', 'Oriya'], 'OS': ['奥塞梯语', 'Ossetian, Ossetic'], 'PA': ['旁遮普语', 'Panjabi, Punjabi'], 'PI': ['巴利语', 'Pāli'], 'PL': ['波兰语', 'Polish'], 'PS': ['普什图语', 'Pashto, Pushto'], 'PT': ['葡萄牙语', 'Portuguese'], 'QU': ['凯楚亚语', 'Quechua'], 'RM': ['罗曼语', 'Romansh'], 'RN': ['基隆迪语', 'Kirundi'], 'RO': ['罗马尼亚语', 'Romanian,\xa0Moldavian, Moldovan'], 'RU': ['俄语', 'Russian'], 'RW': ['基尼阿万达语', 'Kinyarwanda'], 'SA': ['梵语', 'Sanskrit (Saṁskṛta)'], 'SC': ['撒丁语', 'Sardinian'], 'SD': ['信德语', 'Sindhi'], 'SE': ['北萨摩斯语', 'Northern Sami'], 'SG': ['桑戈语', 'Sango'], 'SI': ['僧加罗语', 'Sinhala, Sinhalese'], 'SK': ['斯洛伐克语', 'Slovak'], 'SL': ['斯洛文尼亚语', 'Slovene'], 'SM': ['萨摩亚语', 'Samoan'], 'SN': ['绍纳语', 'Shona'], 'SO': ['索马里语', 'Somali'], 'SQ': ['阿尔巴尼亚语', 'Albanian'], 'SR': ['塞尔维亚语', 'Serbian'], 'SS': ['塞斯瓦特语', 'Swati'], 'ST': ['南索托语', 'Southern Sotho'], 'SU': ['巽他语', 'Sundanese'], 'SV': ['瑞典语', 'Swedish'], 'SW': ['斯瓦希里语', 'Swahili'], 'TA': ['泰米尔语', 'Tamil'], 'TE': ['泰卢固语', 'Telugu'], 'TG': ['塔吉克语', 'Tajik'], 'TH': ['泰语', 'Thai'], 'TI': ['提格里尼亚语', 'Tigrinya'], 'TK': ['土库曼语', 'Turkmen'], 'TL': ['他加禄语', 'Tagalog'], 'TN': ['塞茨瓦纳语', 'Tswana'], 'TO': ['汤加语', 'Tongan'], 'TR': ['土耳其语', 'Turkish'], 'TS': ['宗加语', 'Tsonga'], 'TT': ['塔塔尔语', 'Tatar'], 'TW': ['特威语', 'Twi'], 'TY': ['塔希提语', 'Tahitian'], 'UG': ['维吾尔语', 'Uighur, Uyghur'], 'UK': ['乌克兰语', 'Ukrainian'], 'UR': ['乌尔都语', 'Urdu'], 'UZ': ['乌兹别克语', 'Uzbek'], 'VE': ['文达语', 'Venda'], 'VI': ['越南语', 'Vietnamese'], 'VO': ['沃拉普克语', 'Volapük'], 'WA': ['瓦隆语', 'Walloon'], 'WO': ['沃洛夫语', 'Wolof'], 'XH': ['科萨语', 'Xhosa'], 'YI': ['依地语', 'Yiddish'], 'YO': ['约鲁巴语', 'Yoruba'], 'ZA': ['壮语', 'Zhuang, Chuang'], 'ZH': ['汉语(中文)', 'Chinese'], 'ZU': ['祖鲁语', 'Zulu']}
|
|
475
|
+
|
|
460
476
|
def word_list(split_words):
|
|
461
477
|
"""
|
|
462
478
|
Parameters
|
|
@@ -482,13 +498,8 @@ def batch_word_list(input_root):
     ----------
     input_root : TYPE string
         DESCRIPTION.
-        It's a folder path like
-
-        For example, the text of D:\seg_only\1.txt should be like:
-        PgsFile is Python library to facilitate Python beginners ,
-        especially instructors and students of foreign languages and literature,
-        for the convenience of easily operating txt ,
-        xlsx and json files as well as making word list .
+        It's a folder path like seg_only.
+        Based on tokenized text.
 
     Returns
     -------
@@ -505,7 +516,7 @@ def batch_word_list(input_root):
     ('literature', [1, 1]),]
     '''
     from PgsFile import get_data_text as gt, FilePath as fp, BigPunctuation as bp
-    # input_root=r"
+    # input_root=r"047_Scraping\seg_only"
     file_names=fp(input_root)
 
     from collections import defaultdict
@@ -556,19 +567,6 @@ def next_folder_names(folder):
     folder_namelist=next(os.walk(folder))[1]
     return folder_namelist
 
-def get_package_path(package_name):
-    import site
-    import os
-    package_paths=site.getsitepackages()
-    package_path=None
-    for path in package_paths:
-        if os.path.exists(os.path.join(path, package_name)):
-            package_path=os.path.join(path, package_name)
-            break
-
-    if package_path is None:
-        raise ModuleNotFoundError(f"Package '{package_name}' not found.")
-    return package_path
 
 def remove_empty_txts(folder_path):
     import os
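The removed get_package_path resolved the installed package directory via site.getsitepackages(); 0.1.7 drops it together with the corpus and pickle loaders that depended on it (see the next hunk). A rough standard-library replacement, shown only as a sketch and not part of PgsFile:

import importlib.util
import os

def locate_package(package_name):
    # Resolve the on-disk folder of an installed package from its import spec.
    spec = importlib.util.find_spec(package_name)
    if spec is None or spec.origin is None:
        raise ModuleNotFoundError(f"Package '{package_name}' not found.")
    return os.path.dirname(spec.origin)

corpus_root = os.path.join(locate_package("PgsFile"), "Corpora")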
@@ -634,77 +632,6 @@ def remove_empty_last_line(folder_path):
|
|
|
634
632
|
f2.write(lines[i])
|
|
635
633
|
f2.close()
|
|
636
634
|
print(end_empty_files,str(len(end_empty_files))+" files found with last line empty!")
|
|
637
|
-
|
|
638
|
-
corpus_root=get_package_path('PgsFile')+"/Corpora"
|
|
639
|
-
def extract_stopwords(lang=None):
|
|
640
|
-
'''
|
|
641
|
-
Parameters
|
|
642
|
-
----------
|
|
643
|
-
lang : TYPE, optional string
|
|
644
|
-
DESCRIPTION. The default is None.
|
|
645
|
-
lang="english"; lang="chinese" etc.
|
|
646
|
-
|
|
647
|
-
Returns
|
|
648
|
-
-------
|
|
649
|
-
contents : TYPE list
|
|
650
|
-
DESCRIPTION. ["'ll", "'tis", "'twas", "'ve", '10', '39', 'a', "a's", 'able', 'ableabout', 'about', 'above', 'abroad', 'abst', 'accordance', 'according']
|
|
651
|
-
|
|
652
|
-
'''
|
|
653
|
-
import os
|
|
654
|
-
# Check if the folder exists
|
|
655
|
-
if not os.path.isdir(corpus_root):
|
|
656
|
-
print(f"Error: The folder '{corpus_root}' does not exist.")
|
|
657
|
-
return None
|
|
658
|
-
|
|
659
|
-
if lang is None:
|
|
660
|
-
language="english"
|
|
661
|
-
else:
|
|
662
|
-
language=lang
|
|
663
|
-
file_name=language+".txt"
|
|
664
|
-
|
|
665
|
-
# Traverse the folder recursively
|
|
666
|
-
for root, dirs, files in os.walk(corpus_root):
|
|
667
|
-
# Check if the text file exists in the current folder
|
|
668
|
-
if file_name in files:
|
|
669
|
-
# Construct the full path to the text file
|
|
670
|
-
file_path=os.path.join(root, file_name)
|
|
671
|
-
# Read the contents of the text file
|
|
672
|
-
contents=[line.strip() for line in get_data_lines(file_path)]
|
|
673
|
-
return contents
|
|
674
|
-
|
|
675
|
-
# If the text file doesn't exist in any folder, print an error message
|
|
676
|
-
print(f"Error: The file '{file_name}' does not exist in the folder '{corpus_root}' or its sub-folders.")
|
|
677
|
-
return None
|
|
678
|
-
|
|
679
|
-
pickle_root=get_package_path('PgsFile')+"/models"
|
|
680
|
-
def load_pickle_data(lang=None):
|
|
681
|
-
'''
|
|
682
|
-
Parameters
|
|
683
|
-
----------
|
|
684
|
-
lang : TYPE, optional
|
|
685
|
-
DESCRIPTION. The default is None.
|
|
686
|
-
lang="english"; lang="chinese" etc.
|
|
687
|
-
Returns
|
|
688
|
-
-------
|
|
689
|
-
data : TYPE
|
|
690
|
-
DESCRIPTION.
|
|
691
|
-
|
|
692
|
-
'''
|
|
693
|
-
import pickle
|
|
694
|
-
files=FilePath(pickle_root)
|
|
695
|
-
if lang is None:
|
|
696
|
-
language="english"
|
|
697
|
-
else:
|
|
698
|
-
language=lang
|
|
699
|
-
file_path=""
|
|
700
|
-
for file in files:
|
|
701
|
-
if language in FileName(file):
|
|
702
|
-
file_path=file
|
|
703
|
-
with open(file_path, 'rb') as handle:
|
|
704
|
-
data=pickle.load(handle)
|
|
705
|
-
return data
|
|
706
|
-
|
|
707
|
-
|
|
708
635
|
|
|
709
636
|
def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
|
|
710
637
|
"""
|
|
@@ -734,8 +661,14 @@ def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
 
 # Standard sentence tokenizer.
 def sent_tokenize(text, lang=None):
-
-
+    import pysbd
+    if lang is None:
+        lang="en"
+    else:
+        lang=lang
+    seg = pysbd.Segmenter(language=lang, clean=False)
+    sent_list = seg.segment(text)
+    return sent_list
 
 def cs(para):
     """
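sent_tokenize now delegates to pysbd (hence the new Requires-Dist: pysbd in METADATA) instead of carrying its own body. A small sketch of the equivalent call; the sample sentence is an assumption:

import pysbd

seg = pysbd.Segmenter(language="en", clean=False)
print(seg.segment("Dr. Smith arrived. He was late."))
# two segments; the abbreviation "Dr." is not treated as a sentence break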
@@ -749,11 +682,10 @@ def cs(para):
     # import zhon
     # rst=re.findall(zhon.hanzi.sentence, para)
     # return rst #['我买了一辆车。', '妈妈做的菜,很好吃!']
-
-    para=re.sub('(
-    para=re.sub('(
-    para=re.sub('(
-    para=re.sub('([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
+    para=re.sub(r'([。!?\?])([^”’])', r"\1\n\2", para) # 单字符断句符
+    para=re.sub(r'(\.{6})([^”’])', r"\1\n\2", para) # 英文省略号
+    para=re.sub(r'(\…{2})([^”’])', r"\1\n\2", para) # 中文省略号
+    para=re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
     # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号
     para=para.rstrip() # 段尾如果有多余的\n就去掉它
     # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
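The rewritten cs() uses raw-string patterns so the backslash escapes reach the regex engine unchanged, and it only breaks after a terminator that is not followed by a closing quote. A self-contained sketch of the same splitting idea; the sample sentence is taken from the comment above:

import re

def split_zh(para):
    # Break after 。!? unless a closing quote follows, as in the updated cs().
    para = re.sub(r'([。!?\?])([^”’])', r"\1\n\2", para)
    para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
    return para.rstrip().split("\n")

print(split_zh("我买了一辆车。妈妈做的菜,很好吃!"))
# ['我买了一辆车。', '妈妈做的菜,很好吃!']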
@@ -761,18 +693,7 @@
     return paras
 
 
-def cs1(
-    """
-    #英文分句
-    using nltk model
-    ---------
-    Returns
-    list
-    """
-    return sent_tokenize(para)
-
-
-def cs2(text):
+def cs1(text):
     """
     #英文分句
     using regular expression
@@ -784,7 +705,7 @@ def cs2(text):
     alphabets="([A-Za-z])"
     prefixes="(Mr|St|Mrs|Ms|Dr)[.]"
     suffixes="(Inc|Ltd|Jr|Sr|Co)"
-    starters="(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+    starters=r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
     acronyms="([A-Z][.][A-Z][.](?:[A-Z][.])?)"
     websites="[.](com|net|org|io|gov)"
     digits="([0-9])"
@@ -796,7 +717,7 @@ def cs2(text):
     text=re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
     if "..." in text: text=text.replace("...","<prd><prd><prd>")
     if "Ph.D" in text: text=text.replace("Ph.D.","Ph<prd>D<prd>")
-    text=re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
+    text=re.sub(r"\s" + alphabets + "[.] "," \\1<prd> ",text)
     text=re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
     text=re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
     text=re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
@@ -815,7 +736,7 @@ def cs2(text):
     sentences=sentences[:-1]
     sentences=[s.strip() for s in sentences]
     if len(sentences)==0:
-        sentences=
+        sentences=sent_tokenize(text)
     else:
         sentences=sentences
     return sentences
@@ -1050,7 +971,7 @@ def replace_english_punctuation_with_chinese(text):
     # 定义英文标点和对应的中文标点的映射关系
     punctuation_mapping={
         ',': ',',
-        '.': '。',
+        # '.': '。', # 去掉!
         '?': '?',
         '!': '!',
         ';': ';',
@@ -1073,8 +994,8 @@ def extract_misspelled_words_from_docx(file_path, mode=None):
     Parameters
     ----------
     file_path : TYPE string
-        DESCRIPTION. r"
-    mode : TYPE, optional
+        DESCRIPTION. r"DocsMetrics for Translation Quality Assessment_A Case for Standardising Error Typologies.docx"
+    mode : TYPE, optional string
         DESCRIPTION.
         1. The default is None, which means extracting all words with double underlines and wavy lines.
         2. The "spell" mode means extracting all words with wavy red lines.
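The docstring now names a concrete .docx sample and clarifies that mode is a string. A usage sketch; the file name here is a placeholder:

from PgsFile import extract_misspelled_words_from_docx

# mode="spell" keeps only words marked with wavy (spelling) underlines.
words = extract_misspelled_words_from_docx("essay.docx", mode="spell")
print(words)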
@@ -1146,13 +1067,13 @@ def get_text_length_kb(text: str) -> str:
     Get the length of a text string in KB (kilobytes, eg.26.5 KB).
     """
     # Get the length of the text in bytes
-    text_bytes
+    text_bytes=len(text.encode('utf-8'))
 
     # Convert the length to KB
-    text_kb
-    rounded_num
+    text_kb=text_bytes / 1024
+    rounded_num=round(text_kb, 2)
 
-    text_kb
+    text_kb=f'{rounded_num} KB'
     print(type(text_kb))
 
     return text_kb
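The previously truncated assignments are now complete: UTF-8 byte length, divided by 1024 and rounded to two decimals. The same arithmetic stand-alone, with a made-up input:

text = "PgsFile" * 2000                        # 14,000 ASCII chars = 14,000 UTF-8 bytes
text_kb = round(len(text.encode("utf-8")) / 1024, 2)
print(f"{text_kb} KB")                         # 13.67 KB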
@@ -1179,8 +1100,8 @@ def generate_password(length: int) -> str:
     """
     import random
     # Define the set of characters to choose from
-    character_set
-    random_password
+    character_set="1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+=-"
+    random_password=''.join(random.choice(character_set) for _ in range(length))
 
     return random_password
 
@@ -1196,14 +1117,14 @@ def extract_numbers(string: str) -> list:
     """
     import re
     # Define a regular expression to match one or more digits
-    digit_pattern
+    digit_pattern=re.compile(r'(\d+)')
 
     # Split the input string using the regular expression
-    fragments
+    fragments=digit_pattern.split(string)
 
     # Convert every other fragment to an integer (the ones that match the digit pattern)
     for i in range(1, len(fragments), 2):
-        fragments[i]
+        fragments[i]=int(fragments[i])
 
     return fragments
 
@@ -1220,7 +1141,7 @@ def sort_strings_with_embedded_numbers(strings: list) -> list:
         list: A new list containing the sorted strings.
     """
     # Sort the strings using the extract_numbers() function as the key
-    sorted_strings
+    sorted_strings=sorted(strings, key=extract_numbers)
 
     return sorted_strings
 
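Together, extract_numbers and sort_strings_with_embedded_numbers give a natural sort: digit runs are compared as integers rather than character by character. A stand-alone sketch of the same idea:

import re

def extract_numbers(string):
    # Split on digit runs and convert them to int so comparison is numeric.
    fragments = re.compile(r'(\d+)').split(string)
    for i in range(1, len(fragments), 2):
        fragments[i] = int(fragments[i])
    return fragments

files = ["file10.txt", "file2.txt", "file1.txt"]
print(sorted(files, key=extract_numbers))
# ['file1.txt', 'file2.txt', 'file10.txt']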
@@ -1237,10 +1158,10 @@ def run_command(command: str) -> str:
     """
     import subprocess
     # Run the command and capture the output
-    output
+    output=subprocess.check_output(command, shell=True)
 
     # Decode the output from bytes to string
-    output_str
+    output_str=output.decode()
 
     return output_str
 
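run_command passes the string to the shell (shell=True) and returns the decoded stdout. A usage sketch; the command itself is just an example:

from PgsFile import run_command

print(run_command("python --version"))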
@@ -1248,6 +1169,7 @@ def run_command(command: str) -> str:
 import random
 import requests
 from lxml import html, etree
+import pandas as pd
 my_headers={"User-Agent": random.choice(yhd)}
 class PGScraper(object):
     def __init__(self):
@@ -1260,31 +1182,27 @@ class PGScraper(object):
|
|
|
1260
1182
|
valid_xpath=[]
|
|
1261
1183
|
valid_span=[]
|
|
1262
1184
|
# Example HTML content
|
|
1263
|
-
if headers is None:
|
|
1264
|
-
real_headers=my_headers
|
|
1265
|
-
else:
|
|
1266
|
-
real_headers=headers
|
|
1267
1185
|
if timeout is None:
|
|
1268
1186
|
real_timeout=24.0
|
|
1269
1187
|
else:
|
|
1270
1188
|
real_timeout=timeout
|
|
1271
1189
|
|
|
1272
|
-
r=requests.get(url,timeout=real_timeout,headers=
|
|
1190
|
+
r=requests.get(url,timeout=real_timeout,headers=headers, cookies=cookies, params=params, proxies=proxies)
|
|
1273
1191
|
if r.status_code==200:
|
|
1274
1192
|
r.encoding="utf-8"
|
|
1275
1193
|
html_content=r.content
|
|
1276
1194
|
# Parse HTML content
|
|
1277
|
-
tree
|
|
1195
|
+
tree=html.fromstring(html_content)
|
|
1278
1196
|
relative_xpaths=[]
|
|
1279
1197
|
for text in want_list:
|
|
1280
1198
|
# Find elements containing the text
|
|
1281
|
-
elements
|
|
1199
|
+
elements=tree.xpath(f"//*[contains(text(), '{text}')]")
|
|
1282
1200
|
if not elements:
|
|
1283
1201
|
return None
|
|
1284
1202
|
|
|
1285
1203
|
# Assume we want the first matching element
|
|
1286
|
-
element
|
|
1287
|
-
absolute_xpath
|
|
1204
|
+
element=elements[0]
|
|
1205
|
+
absolute_xpath=tree.getroottree().getpath(element)
|
|
1288
1206
|
relative_xpaths.append(absolute_xpath)
|
|
1289
1207
|
|
|
1290
1208
|
path1=relative_xpaths[0]
|
|
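The restored assignments show the technique PGScraper relies on here: find an element whose text contains a sample string, then ask lxml for its absolute XPath and generalise from that path. The core of the idea in isolation, with the HTML snippet being a made-up example:

from lxml import html

tree = html.fromstring("<html><body><div><p>Price: 42</p></div></body></html>")
elements = tree.xpath("//*[contains(text(), 'Price')]")
print(tree.getroottree().getpath(elements[0]))   # /html/body/div/p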
@@ -1312,7 +1230,7 @@ class PGScraper(object):
                all_want_list.append(clean_list(target_eles))
                valid_xpath.append(my_path)
            except:
-               error_type, value, traceback
+               error_type, value, traceback=sys.exc_info()
                error_info=f'{error_type}\n{value}\n{traceback}'
                print(error_info)
 
@@ -1345,7 +1263,7 @@ class PGScraper(object):
                all_want_list.append((clean_list(target_eles),clean_list(target_url_eles)))
                valid_xpath.append((my_path,my_path_url))
            except:
-               error_type, value, traceback
+               error_type, value, traceback=sys.exc_info()
                error_info=f'{error_type}\n{value}\n{traceback}'
                print(error_info)
@@ -1378,21 +1296,17 @@ class PGScraper(object):
|
|
|
1378
1296
|
def get_similar_text(self, url, timeout=None, headers=None, cookies=None, params=None, proxies=None):
|
|
1379
1297
|
all_want_list=[]
|
|
1380
1298
|
# Example HTML content
|
|
1381
|
-
if headers is None:
|
|
1382
|
-
real_headers=my_headers
|
|
1383
|
-
else:
|
|
1384
|
-
real_headers=headers
|
|
1385
1299
|
if timeout is None:
|
|
1386
1300
|
real_timeout=24.0
|
|
1387
1301
|
else:
|
|
1388
1302
|
real_timeout=timeout
|
|
1389
1303
|
|
|
1390
|
-
r=requests.get(url,timeout=real_timeout,headers=
|
|
1304
|
+
r=requests.get(url, timeout=real_timeout, headers=headers, cookies=cookies, params=params, proxies=proxies)
|
|
1391
1305
|
if r.status_code==200:
|
|
1392
1306
|
r.encoding="utf-8"
|
|
1393
1307
|
html_content=r.content
|
|
1394
1308
|
# Parse HTML content
|
|
1395
|
-
tree
|
|
1309
|
+
tree=html.fromstring(html_content)
|
|
1396
1310
|
if self.show_url==True:
|
|
1397
1311
|
for pat,url in self.pattern:
|
|
1398
1312
|
target_eles=tree.xpath(pat)
|
|
@@ -1408,6 +1322,8 @@ class PGScraper(object):
         return all_want_list
 
 
+
+
 # -*- coding: utf-8 -*-
 """
 Created on Thu Sep 17 16:11:45 2020
@@ -1430,7 +1346,7 @@ def audiovisual_downloader(url, path):
     p.output()
 
 class ProgressData(object):
-    def __init__(self, block,size, unit, file_name='', ):
+    def __init__(self, block, size, unit, file_name='', ):
        self.file_name=file_name
        self.block=block/1000.0
        self.size=size/1000.0
@@ -1452,3 +1368,289 @@ class ProgressData(object):
|
|
|
1452
1368
|
self.size, self.unit, progress, speed, self.unit))
|
|
1453
1369
|
print('%50s'%('/'*int((1-progress)*50)))
|
|
1454
1370
|
|
|
1371
|
+
|
|
1372
|
+
def levenshtein_distance(s, t):
|
|
1373
|
+
m, n=len(s), len(t)
|
|
1374
|
+
if m < n:
|
|
1375
|
+
s, t=t, s
|
|
1376
|
+
m, n=n, m
|
|
1377
|
+
d=[list(range(n + 1))] + [[i] + [0] * n for i in range(1, m + 1)]
|
|
1378
|
+
for j in range(1, n + 1):
|
|
1379
|
+
for i in range(1, m + 1):
|
|
1380
|
+
if s[i - 1]==t[j - 1]:
|
|
1381
|
+
d[i][j]=d[i - 1][j - 1]
|
|
1382
|
+
else:
|
|
1383
|
+
d[i][j]=min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1]) + 1
|
|
1384
|
+
return d[m][n]
|
|
1385
|
+
|
|
1386
|
+
def compute_similarity(input_string, reference_string):
|
|
1387
|
+
distance=levenshtein_distance(input_string, reference_string)
|
|
1388
|
+
max_length=max(len(input_string), len(reference_string))
|
|
1389
|
+
similarity=1 - (distance / max_length)
|
|
1390
|
+
return similarity
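compute_similarity turns the edit distance into a 0-to-1 score: 1 - distance / max(len). For example, the classic kitten/sitting pair needs 3 edits and the longer string has 7 characters, so the score is 1 - 3/7 ≈ 0.571:

from PgsFile import compute_similarity

print(round(compute_similarity("kitten", "sitting"), 3))   # 0.571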
|
|
1391
|
+
|
|
1392
|
+
pgs_abbres_words=['A.B.','A.D.','A.G.','A.I.','A.M.','A.P.','A.V.','AFP.','Ala.','Alta.','Apr.','Ariz.','Ark.','Assn.','Aug.','Ave.','B.A.','B.C','B.C.','B.Ed.','B.I.G','B.R.','B.S.','Blvd.','Brig.','Brig.-Gen.','Bros.','C.D.','C.E.O','C.I.A.','C.M.','C.V.','Calif.','Capt.','Cf.','Ch.','Cie.','Cir.','Cllr.','Cmdr.','Co.','Co.Design','Col.','Colo.','Conn.','Corp.','Cos.','Coun.','Cpl.','Cres.','D.C.','D.D.S.','D.J.','D.K.','D.S.','Dec.','Del.','Dept.','Det.','Dr.','E.B.','E.C.','E.ON','E.U.','E.coli','E.g.','Ed.','Esq.','F.C.','Feb.','Fig.','Fla.','Fri.','G.K.','G.M.','G.Skill','Ga.','Gen.','Gov.','Govt.','H.E.','H.L.','H.S.','Hon.','Hwy.','I.T.','I.e.','Ill.','Inc.','Ind.','J.Crew','J.D.','J.G.','J.P','J.R.R.','Jan.','Jr.','Jul.','Jun.','K.C.','K.J.','K.M.','K.N.','K.P.','K.R.','Kan.','Ky.','L.A.','L.L.','L.S.','LLC.','La.','Lieut.','Lt.','Lt.-Cmdr.','Lt.-Col.','Lt.-Gen.','Ltd.','M.A.','M.B.','M.B.A.','M.D.','M.E.N','M.I.A.','M.J.','M.M.','M.P.','M.S.','Maj.','Maj.-Gen.','Man.','Mar.','Mass.','Md.','Messrs.','Mfg.','Mfrs.','Mich.','Minn.','Miss.','Mmes.','Mo.','Mon.','Mr.','Mrs.','Ms.','Msgr.','Mss.','N.A.','N.B.','N.C.','N.D.','N.H.','N.J.','N.L.','N.M.','N.S.','N.W.A.','N.W.T.','N.Y.','Neb.','Nev.','No.','Nos.','Nov.','O.C.','O.K.','O.S.','Oct.','Okla.','Ont.','Op.','Ore.','P.C.','P.E.','P.E.I.','P.K.','P.M.','P.O.','P.R.','P.S.','Pa.','Ph.D','Ph.D.','Plc.','Pres.','Prof.','Psy.D.','Pte.','Que.','R.E.M.','R.I.','R.I.P.','R.M','R.R.','Rd.','Rep.','Rev.','Rs.','Rt.','S.A.','S.C.','S.D.','S.F.','S.H.I.E.L.D.','S.K.','S.League','S.M.','S.P.','Sask.','Sat.','Sec.','Sen.','Sep.','Sgt.','Sr.','St.','Ste.','Sub-Lieut.','Sun.','Supt.','T.A.','T.R.','T.V.','TV.','Tenn.','Tex.','Thu.','Tue.','Twp.','U.A.E.','U.K.','U.N','U.P.','U.S','U.S.','U.S.A.','U.S.C.','UK.','US.','V.P.','Va.','Vol.','Vt.','W.H.O.','W.Va.','Wash.','Wed.','Wis.','Y.T.','a.m.','abr.','anon.','bk.','bks.','bull.','c.','ca.','cf.','ch.','def.','e.g.','ed.','eds.','et al.','etc.','fig.','ft.','fwd.','gal.','i.e.','ibid.','illus.','in.','jour.','lb.','mag.','mi.','ms.','mss.','no.','oz.','p.','p.m.','pg.','pgs.','pp.','pseud.','pt.','pts.','pub.','qt.','qtd.','ser.','supp.','trans.','viz.','vol.','vols.','vs.','yd.']
|
|
1393
|
+
|
|
1394
|
+
def clean_text(text): #清洗除了句号以外的其他标点符号问题
|
|
1395
|
+
# 在标点符号右边邻接单词前添加空格
|
|
1396
|
+
import re
|
|
1397
|
+
text=replace_chinese_punctuation_with_english(text)
|
|
1398
|
+
text=re.sub(r'(?<=[\?\!\,\;\:\)\]\}])\s*(?=\w)', ' ', text)
|
|
1399
|
+
# 删除标点符号与左边单词之间的空格
|
|
1400
|
+
text=re.sub(r'\s*([\?\!\,\;\:\)\]\}\>])', r'\1', text)
|
|
1401
|
+
# 删除标点符号与右边单词之间的空格
|
|
1402
|
+
text=re.sub(r'\s*\(\s*', r' (', text)
|
|
1403
|
+
text=re.sub(r'\s*\[\s*', r' [', text)
|
|
1404
|
+
text=re.sub(r'\s*\{\s*', r' {', text)
|
|
1405
|
+
text=re.sub(r'\s*\<\s*', r' <', text)
|
|
1406
|
+
# 处理多余的空格
|
|
1407
|
+
text=re.sub(r'\s{2,}', ' ', text)
|
|
1408
|
+
text=re.sub(r'-{2,}', '-', text)
|
|
1409
|
+
return text
|
|
1410
|
+
|
|
1411
|
+
def clean_text_with_abbreviations(text):
|
|
1412
|
+
import re
|
|
1413
|
+
text=clean_text(text)
|
|
1414
|
+
matches=[]
|
|
1415
|
+
for seg in text.split():
|
|
1416
|
+
if "." in seg:
|
|
1417
|
+
if seg.endswith(".") is False:
|
|
1418
|
+
matches.append(seg)
|
|
1419
|
+
elif seg.endswith("..") and "..." not in seg:
|
|
1420
|
+
text=text.replace("..", ".")
|
|
1421
|
+
|
|
1422
|
+
for match in matches:
|
|
1423
|
+
if any(word in match for word in pgs_abbres_words):
|
|
1424
|
+
inter=match.split(".")
|
|
1425
|
+
new_match="".join([w+"." for w in inter[0:-1]])+" "+inter[-1]
|
|
1426
|
+
text=text.replace(match, new_match)
|
|
1427
|
+
else:
|
|
1428
|
+
text=text.replace(match, match.replace(".",". "))
|
|
1429
|
+
text=re.sub(r'\s+\.', '.', text)
|
|
1430
|
+
return text
|
|
1431
|
+
|
|
1432
|
+
import shutil
|
|
1433
|
+
def move_file(source_file, destination_folder, new_file_name=None):
|
|
1434
|
+
"""
|
|
1435
|
+
Move/cut a file to another folder.
|
|
1436
|
+
|
|
1437
|
+
Parameters:
|
|
1438
|
+
source_file (str): The path to the source file.
|
|
1439
|
+
destination_folder (str): The path to the destination folder.
|
|
1440
|
+
new_file_name (str, optional): The new name for the file in the destination folder. Defaults to None.
|
|
1441
|
+
"""
|
|
1442
|
+
# Ensure the destination folder exists
|
|
1443
|
+
if not os.path.exists(destination_folder):
|
|
1444
|
+
os.makedirs(destination_folder)
|
|
1445
|
+
|
|
1446
|
+
# Construct the destination file path
|
|
1447
|
+
if new_file_name:
|
|
1448
|
+
destination_file=os.path.join(destination_folder, new_file_name)
|
|
1449
|
+
else:
|
|
1450
|
+
destination_file=os.path.join(destination_folder, os.path.basename(source_file))
|
|
1451
|
+
|
|
1452
|
+
# Move the file to the destination folder
|
|
1453
|
+
shutil.move(source_file, destination_file)
|
|
1454
|
+
|
|
1455
|
+
print(f"File moved from {source_file} to {destination_file}")
|
|
1456
|
+
|
|
1457
|
+
def check_empty_cells(file_path):
|
|
1458
|
+
"""
|
|
1459
|
+
Check for any empty cells in an Excel file and return their exact positions.
|
|
1460
|
+
|
|
1461
|
+
Parameters:
|
|
1462
|
+
file_path (str): The path to the Excel file.
|
|
1463
|
+
|
|
1464
|
+
Returns:
|
|
1465
|
+
list of tuples: A list of tuples where each tuple contains the column ID and row ID of an empty cell. If no empty cells are found, an empty list is returned.
|
|
1466
|
+
|
|
1467
|
+
Example:
|
|
1468
|
+
empty_cells=check_empty_cells('your_file.xlsx')
|
|
1469
|
+
if empty_cells:
|
|
1470
|
+
print(f"Empty cells found at positions: {empty_cells}")
|
|
1471
|
+
else:
|
|
1472
|
+
print("No empty cells found.")
|
|
1473
|
+
"""
|
|
1474
|
+
# Read the Excel file
|
|
1475
|
+
df=pd.read_excel(file_path)
|
|
1476
|
+
|
|
1477
|
+
# Initialize a list to store the positions of empty cells
|
|
1478
|
+
empty_cells=[]
|
|
1479
|
+
|
|
1480
|
+
# Iterate over the DataFrame to find empty cells
|
|
1481
|
+
for row_id, row in df.iterrows():
|
|
1482
|
+
for col_id, value in row.items():
|
|
1483
|
+
if pd.isnull(value):
|
|
1484
|
+
empty_cells.append((col_id, row_id))
|
|
1485
|
+
|
|
1486
|
+
return empty_cells
|
|
1487
|
+
|
|
1488
|
+
def makefile(file_path):
|
|
1489
|
+
if os.path.exists(file_path):
|
|
1490
|
+
pass
|
|
1491
|
+
else:
|
|
1492
|
+
write_to_txt(file_path, "")
|
|
1493
|
+
|
|
1494
|
+
|
|
1495
|
+
def save_dict_to_excel(data, output_file, headers=None):
|
|
1496
|
+
"""
|
|
1497
|
+
Save Python dictionary data into an Excel .xlsx file with custom headers.
|
|
1498
|
+
|
|
1499
|
+
Parameters:
|
|
1500
|
+
data (dict): The dictionary containing the data to be saved.
|
|
1501
|
+
output_file (str): The path to the output Excel file.
|
|
1502
|
+
headers (list of str, optional): A list of strings representing the headers for the Excel file. Defaults to ['Key', 'Value'] if not provided.
|
|
1503
|
+
|
|
1504
|
+
Returns:
|
|
1505
|
+
None
|
|
1506
|
+
|
|
1507
|
+
Example:
|
|
1508
|
+
data={'key1': 'value1', 'key2': 'value2'}
|
|
1509
|
+
output_file='output.xlsx'
|
|
1510
|
+
save_dict_to_excel(data, output_file) # Uses default headers
|
|
1511
|
+
save_dict_to_excel(data, output_file, headers=['Source Text', 'Target Text']) # Uses custom headers
|
|
1512
|
+
"""
|
|
1513
|
+
if headers is None:
|
|
1514
|
+
headers=['Key', 'Value']
|
|
1515
|
+
elif len(headers) != 2:
|
|
1516
|
+
raise ValueError("Headers list must contain exactly 2 elements.")
|
|
1517
|
+
|
|
1518
|
+
# Convert the dictionary to a DataFrame
|
|
1519
|
+
df=pd.DataFrame(list(data.items()), columns=headers)
|
|
1520
|
+
|
|
1521
|
+
# Save the DataFrame to an Excel file
|
|
1522
|
+
df.to_excel(output_file, index=False)
|
|
1523
|
+
|
|
1524
|
+
def len_rows(file_path):
|
|
1525
|
+
"""
|
|
1526
|
+
Calculate the number of rows in an Excel file based on the largest row number of any possible columns.
|
|
1527
|
+
|
|
1528
|
+
Parameters:
|
|
1529
|
+
file_path (str): The path to the Excel file.
|
|
1530
|
+
|
|
1531
|
+
Returns:
|
|
1532
|
+
int: The number of rows in the Excel file.
|
|
1533
|
+
"""
|
|
1534
|
+
# Read the Excel file
|
|
1535
|
+
df=pd.read_excel(file_path)
|
|
1536
|
+
|
|
1537
|
+
# Get the number of rows
|
|
1538
|
+
row_count=df.shape[0]
|
|
1539
|
+
|
|
1540
|
+
return row_count
|
|
1541
|
+
|
|
1542
|
+
def format_float(number, decimal_places=2):
|
|
1543
|
+
"""
|
|
1544
|
+
Format a float to a specified number of decimal places.
|
|
1545
|
+
|
|
1546
|
+
Parameters:
|
|
1547
|
+
number (float): The float number to be formatted.
|
|
1548
|
+
decimal_places (int, optional): The number of decimal places to format the number to. Defaults to 2.
|
|
1549
|
+
|
|
1550
|
+
Returns:
|
|
1551
|
+
str: The formatted number as a string with the specified number of decimal places.
|
|
1552
|
+
|
|
1553
|
+
Example:
|
|
1554
|
+
formatted_number=format_float(3.1415926535)
|
|
1555
|
+
print(formatted_number) # Output: 3.14
|
|
1556
|
+
|
|
1557
|
+
formatted_number=format_float(3.1415926535, 4)
|
|
1558
|
+
print(formatted_number) # Output: 3.1416
|
|
1559
|
+
"""
|
|
1560
|
+
formatted_number="{:.{precision}f}".format(number, precision=decimal_places)
|
|
1561
|
+
return formatted_number
|
|
1562
|
+
|
|
1563
|
+
def get_data_html_offline(file_path):
|
|
1564
|
+
"""
|
|
1565
|
+
Reads a local HTML file and extracts specific elements.
|
|
1566
|
+
Parameters:
|
|
1567
|
+
file_path (str): The path to the local HTML file. my_html="Top 5 Web Scraping Methods_ Including Using LLMs - Comet.mhtml"
|
|
1568
|
+
|
|
1569
|
+
Returns: html
|
|
1570
|
+
|
|
1571
|
+
XPath common usages:
|
|
1572
|
+
rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
|
|
1573
|
+
rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
|
|
1574
|
+
rst1 = html.xpath('//div[@class="_16zCst"]/h1/text()')
|
|
1575
|
+
rst2 = html.xpath('//p[1]/text()') # Get the text content of the first p node
|
|
1576
|
+
rst3 = html.xpath('//p[position()<3]/text()') # Get the text content of the first two p nodes
|
|
1577
|
+
rst4 = html.xpath('//p[last()]/text()') # Get the text content of the last p node
|
|
1578
|
+
rst5 = html.xpath('//a[2]/@href') # Get the href attribute of the second a node
|
|
1579
|
+
|
|
1580
|
+
"""
|
|
1581
|
+
if file_path.endswith(".mhtml"):
|
|
1582
|
+
import pimht
|
|
1583
|
+
mhtml = pimht.from_filename(file_path)
|
|
1584
|
+
longest_length = 0
|
|
1585
|
+
html_content = ""
|
|
1586
|
+
for mhtml_part in mhtml:
|
|
1587
|
+
if "text/html" in mhtml_part.content_type:
|
|
1588
|
+
possible_html=mhtml_part.text
|
|
1589
|
+
current_length = len(possible_html)
|
|
1590
|
+
if current_length > longest_length:
|
|
1591
|
+
longest_length = current_length
|
|
1592
|
+
html_content = possible_html
|
|
1593
|
+
# Parse the HTML content
|
|
1594
|
+
html = etree.HTML(html_content)
|
|
1595
|
+
else: #.html
|
|
1596
|
+
html=etree.parse(file_path,etree.HTMLParser())
|
|
1597
|
+
return html
|
|
1598
|
+
|
|
1599
|
+
def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=None, params=None, proxies=None):
|
|
1600
|
+
'''
|
|
1601
|
+
rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
|
|
1602
|
+
rst = html.xpath('//div[@class="image-caption"]/text()') # Get the text content of the specified tag
|
|
1603
|
+
rst1 = html.xpath('//div[@class="_16zCst"]/h1/text()')
|
|
1604
|
+
rst2 = html.xpath('//p[1]/text()') # Get the text content of the first p node
|
|
1605
|
+
rst3 = html.xpath('//p[position()<3]/text()') # Get the text content of the first two p nodes
|
|
1606
|
+
rst4 = html.xpath('//p[last()]/text()') # Get the text content of the last p node
|
|
1607
|
+
rst5 = html.xpath('//a[2]/@href') # Get the href attribute of the second a node
|
|
1608
|
+
'''
|
|
1609
|
+
# Example HTML content
|
|
1610
|
+
if timeout is None:
|
|
1611
|
+
real_timeout=24.0
|
|
1612
|
+
else:
|
|
1613
|
+
real_timeout=timeout
|
|
1614
|
+
try:
|
|
1615
|
+
time.sleep(round(random.uniform(1.0, 3.9), 19))
|
|
1616
|
+
r=requests.get(url, timeout=real_timeout, headers=headers, cookies=cookies, params=params, proxies=proxies)
|
|
1617
|
+
print(r.status_code) # print the reponse status code
|
|
1618
|
+
if r.status_code==200:
|
|
1619
|
+
if html==False:
|
|
1620
|
+
return r
|
|
1621
|
+
else:
|
|
1622
|
+
r.encoding="utf-8"
|
|
1623
|
+
data=r.text
|
|
1624
|
+
html=etree.HTML(data)
|
|
1625
|
+
return html
|
|
1626
|
+
else:
|
|
1627
|
+
print(r.status_code, "Can not find the page!")
|
|
1628
|
+
return None
|
|
1629
|
+
except Exception as err:
|
|
1630
|
+
print(err)
|
|
1631
|
+
|
|
1632
|
+
def find_table_with_most_rows(tables):
|
|
1633
|
+
max_rows=0
|
|
1634
|
+
max_table_index=-1
|
|
1635
|
+
for i, table in enumerate(tables):
|
|
1636
|
+
if isinstance(table, pd.DataFrame) and table.shape[0] > max_rows:
|
|
1637
|
+
max_rows=table.shape[0]
|
|
1638
|
+
max_table_index=i
|
|
1639
|
+
return max_table_index, max_rows if max_table_index!= -1 else None
|
|
1640
|
+
|
|
1641
|
+
def get_data_table(url, output_file, most_rows=True):
|
|
1642
|
+
try:
|
|
1643
|
+
tables=pd.read_html(url)
|
|
1644
|
+
if most_rows==False:
|
|
1645
|
+
# 1. default: the first table
|
|
1646
|
+
df=tables[0]
|
|
1647
|
+
else:
|
|
1648
|
+
# 2. get the table with most rows
|
|
1649
|
+
target_table=find_table_with_most_rows(tables)[0] # (1, 32)
|
|
1650
|
+
df=tables[target_table]
|
|
1651
|
+
|
|
1652
|
+
df.to_excel(output_file, index=False)
|
|
1653
|
+
print(f"Data has been saved to {output_file}")
|
|
1654
|
+
except Exception as err:
|
|
1655
|
+
print(f"Errors found! {err}")
|
|
1656
|
+
return None
|
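The new get_data_table wraps pandas.read_html: it collects every table on the page and, by default, keeps the one with the most rows before writing it to Excel. A usage sketch with a placeholder URL and output file:

from PgsFile import get_data_table

get_data_table("https://example.com/stats.html", "tables.xlsx", most_rows=True)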
PgsFile/__init__.py CHANGED
@@ -1,30 +1,45 @@
+# 1. Web scraping
 from .PgsFile import PGScraper
 from .PgsFile import audiovisual_downloader
 
+# 2. Package/library management
 from .PgsFile import install_package, uninstall_package
 from .PgsFile import run_script, run_command
 
+# 3. Text data retrieval
 from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
 from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
-
-
-
-from .PgsFile import
-
-
-
-from .PgsFile import
-from .PgsFile import
-from .PgsFile import
-
-
-
-from .PgsFile import
-
-from .PgsFile import
+from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
+
+# 4. Text data storage
+from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
+
+# 5. File/folder process
+from .PgsFile import FilePath, FileName, DirList
+from .PgsFile import get_subfolder_path
+from .PgsFile import makedirec, makefile
+from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
+from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
+
+# 6. Data cleaning
+from .PgsFile import BigPunctuation, StopTags, Special, yhd
+from .PgsFile import ZhStopWords, EnPunctuation
+from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
 from .PgsFile import check_contain_chinese, check_contain_number
 from .PgsFile import replace_chinese_punctuation_with_english
 from .PgsFile import replace_english_punctuation_with_chinese
-from .PgsFile import clean_list,
+from .PgsFile import clean_list, clean_text_with_abbreviations
+from .PgsFile import extract_chinese_punctuation, generate_password, sort_strings_with_embedded_numbers
+
+# 7. NLP (natural language processing)
+from .PgsFile import strQ2B_raw, strQ2B_words
+from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
+from .PgsFile import word_list, batch_word_list
+from .PgsFile import cs, cs1, sent_tokenize
+
+# 8. Maths
+from .PgsFile import len_rows, check_empty_cells
+from .PgsFile import format_float, decimal_to_percent, Percentage
+from .PgsFile import get_text_length_kb, extract_numbers
 
 name = "PgsFile"
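The regrouped __init__ mainly changes how the public names are organised, not how they are imported. A few representative imports against 0.1.7:

from PgsFile import get_data_text, write_to_excel            # text data retrieval / storage
from PgsFile import sent_tokenize, word_list                 # NLP helpers
from PgsFile import get_data_html_online, get_data_table     # new web-scraping helpers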
{PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: PgsFile
-Version: 0.1.
+Version: 0.1.7
 Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
 Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
 Author: Pan Guisheng
@@ -18,8 +18,10 @@ Requires-Dist: python-docx
 Requires-Dist: pip
 Requires-Dist: requests
 Requires-Dist: lxml
+Requires-Dist: pimht
+Requires-Dist: pysbd
 
-Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
+Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
 
 
 Function 1: Enables efficient data retrieval and storage in files with a single line of code.
{PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/RECORD CHANGED
@@ -1,5 +1,5 @@
-PgsFile/PgsFile.py,sha256=
-PgsFile/__init__.py,sha256=
+PgsFile/PgsFile.py,sha256=6CXBDn3VC4gUkigNVCkM9eVPOe4Xyww32tG0ZDeYNfI,78446
+PgsFile/__init__.py,sha256=TaKrLI0pGAFm_2Bzjf_cGnog_URzaAgHRW5myzY0Lz8,2144
 PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
 PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
 PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
 PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
-PgsFile-0.1.
+PgsFile-0.1.7.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
+PgsFile-0.1.7.dist-info/METADATA,sha256=0HAA5A68yHiB-LVlNuF-pkKo_lawzwTU-Thf-i2FiUY,4924
+PgsFile-0.1.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+PgsFile-0.1.7.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
+PgsFile-0.1.7.dist-info/RECORD,,
{PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/LICENSE: File without changes
{PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/WHEEL: File without changes
{PgsFile-0.1.5.dist-info → PgsFile-0.1.7.dist-info}/top_level.txt: File without changes