PgsFile 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic. Click here for more details.
- PgsFile/PgsFile.py +127 -1
- PgsFile/__init__.py +2 -1
- PgsFile/models/NLPIR.user +0 -0
- PgsFile/models/prompts/3. ICTCLAS Prompt.txt +8 -0
- PgsFile/models/prompts/4. OCR prompt.txt +1 -0
- {PgsFile-0.4.0.dist-info → PgsFile-0.4.2.dist-info}/METADATA +4 -4
- {PgsFile-0.4.0.dist-info → PgsFile-0.4.2.dist-info}/RECORD +10 -8
- {PgsFile-0.4.0.dist-info → PgsFile-0.4.2.dist-info}/LICENSE +0 -0
- {PgsFile-0.4.0.dist-info → PgsFile-0.4.2.dist-info}/WHEEL +0 -0
- {PgsFile-0.4.0.dist-info → PgsFile-0.4.2.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
|
@@ -545,6 +545,106 @@ nltk_en_tags={'CC': '并列连词', 'CD': '基数词', 'DT': '限定符', 'EX':
|
|
|
545
545
|
# Maps Penn Treebank POS tags (as emitted by NLTK) to coarse, human-readable
# word-class names. Several fine-grained tags collapse onto the same class.
nltk_tag_mapping = {
    # Nouns
    "NN": "Noun",
    "NNS": "Noun",
    "NNP": "Noun",
    "NNPS": "Noun",
    # Verbs
    "VB": "Verb",
    "VBD": "Verb",
    "VBG": "Verb",
    "VBN": "Verb",
    "VBP": "Verb",
    "VBZ": "Verb",
    # Adjectives
    "JJ": "Adjective",
    "JJR": "Adjective",
    "JJS": "Adjective",
    # Adverbs
    "RB": "Adverb",
    "RBR": "Adverb",
    "RBS": "Adverb",
    # Function words and everything else
    "IN": "Preposition",
    "PRP": "Pronoun",
    "PRP$": "Pronoun",
    "DT": "Determiner",
    "CC": "Conjunction",
    "CD": "Numeral",
    "UH": "Interjection",
    "FW": "Foreign Word",
    "TO": "Particle",
    "EX": 'Existential "there"',
    "MD": "Modal Auxiliary",
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive wh-pronoun",
    "WRB": "Wh-adverb",
    "SYM": "Symbol",
    "RP": "Particle",
    "POS": "Possessive ending",
    "PDT": "Predeterminer",
    "LS": "List item marker",
    "NIL": "Missing tag",
}
|
|
546
546
|
|
|
547
547
|
# ICTCLAS 2008 Chinese POS tag set: tag -> Chinese description of the tag.
# NOTE: the structural-particle tag for "的/底" is "ude1" (digit one), in line
# with its siblings "ude2"/"ude3"; it was previously misspelled "udel"
# (letter l), so lookups with the real tag failed.
ICTCLAS2008 = {
    'a': '形容词', 'ad': '副形词', 'ag': '形容词性语素', 'al': '形容词性惯用语', 'an': '名形词',
    'b': '区别词', 'bl': '区别词性惯用语',
    'c': '连词', 'cc': '并列连词',
    'd': '副词', 'dg': '副词性语素', 'dl': '副词性惯用语',
    'e': '叹词', 'ew': '句末标点',
    'f': '方位词',
    'h': '前缀',
    'k': '后缀',
    'm': '数词', 'mg': '数词性语素', 'mq': '数量词',
    'n': '名词', 'ng': '名词性语素', 'nl': '名词性惯用语', 'nr': '汉语人名',
    'nr1': '汉语姓氏', 'nr2': '汉语名字', 'nrf': '音译人名', 'nrj': '日语人名',
    'ns': '地名', 'nsf': '音译地名', 'nt': '机构团体名', 'nz': '其他专名',
    'o': '拟声词',
    'p': '介词', 'pba': '介词“把”', 'pbei': '介词“被”',
    'q': '量词', 'qt': '时量词', 'qv': '动量词',
    'r': '代词', 'rg': '代词性语素', 'rr': '人称代词',
    'ry': '疑问代词', 'rys': '处所疑问代词', 'ryt': '时间疑问代词', 'ryv': '谓词性疑问代词',
    'rz': '指示代词', 'rzs': '处所指示代词', 'rzt': '时间指示代词', 'rzv': '谓词性指示代词',
    's': '处所词',
    't': '时间词', 'tg': '时间词性语素',
    'u': '助词', 'ude1': '的、底', 'ude2': '地', 'ude3': '得',
    'udeng': '等、等等、云云', 'udh': '......的话', 'uguo': '过', 'ule': '了',
    'ulian': '连', 'uls': '来讲、来说;而言、说来', 'usuo': '所',
    'uyy': '一样、一般;似的、般', 'uzhe': '着', 'uzhi': '之',
    'v': '动词', 'vd': '副动词', 'vf': '趋向动词', 'vg': '动词性语素', 'vi': '不及物动词',
    'vl': '动词性惯用语', 'vn': '名动词', 'vshi': '动词“是”', 'vx': '形式动词', 'vyou': '动词“有”',
    'w': '标点符号', 'wd': '逗号', 'wky': '右括号', 'wkz': '左括号', 'wm': '冒号',
    'wn': '顿号', 'wp': '破折号', 'ws': '省略号', 'wy': '引号',
    'x': '字符串',
    'y': '语气词',
    'z': '状态词',
}
|
|
548
|
+
|
|
549
|
+
# ICTCLAS 3.0 (NLPIR) Chinese POS tag set: tag -> Chinese description.
# Punctuation entries also list the full-width ("全角") and half-width ("半角")
# characters covered by the tag.
# Fixes relative to the previous revision:
#   * "xm" read "表情符合" (a typo); it now reads "表情符号" (emoticon).
#   * the half-width list of "wky" (right brackets) contained the opening
#     brace "{" instead of the closing brace "}".
ICTCLAS3 = {
    "n": "名词",
    "nr": "人名",
    "nr1": "汉语姓氏",
    "nr2": "汉语名字",
    "nrj": "日语人名",
    "nrf": "音译人名",
    "ns": "地名",
    "nsf": "音译地名",
    "nt": "机构团体名",
    "nz": "其它专名",
    "nl": "名词性惯用语",
    "ng": "名词性语素",
    "t": "时间词",
    "tg": "时间词性语素",
    "s": "处所词",
    "f": "方位词",
    "v": "动词",
    "vd": "副动词",
    "vn": "名动词",
    "vshi": "动词“是”",
    "vyou": "动词“有”",
    "vf": "趋向动词",
    "vx": "形式动词",
    "vi": "不及物动词(内动词)",
    "vl": "动词性惯用语",
    "vg": "动词性语素",
    "a": "形容词",
    "ad": "副形词",
    "an": "名形词",
    "ag": "形容词性语素",
    "al": "形容词性惯用语",
    "b": "区别词",
    "bl": "区别词性惯用语",
    "z": "状态词",
    "r": "代词",
    "rr": "人称代词",
    "rz": "指示代词",
    "rzt": "时间指示代词",
    "rzs": "处所指示代词",
    "rzv": "谓词性指示代词",
    "ry": "疑问代词",
    "ryt": "时间疑问代词",
    "rys": "处所疑问代词",
    "ryv": "谓词性疑问代词",
    "rg": "代词性语素",
    "m": "数词",
    "mq": "数量词",
    "q": "量词",
    "qv": "动量词",
    "qt": "时量词",
    "d": "副词",
    "p": "介词",
    "pba": "介词“把”",
    "pbei": "介词“被”",
    "c": "连词",
    "cc": "并列连词",
    "u": "助词",
    "uzhe": "着",
    "ule": "了 喽",
    "uguo": "过",
    "ude1": "的 底",
    "ude2": "地",
    "ude3": "得",
    "usuo": "所",
    "udeng": "等 等等 云云",
    "uyy": "一样 一般 似的 般",
    "udh": "的话",
    "uls": "来讲 来说 而言 说来",
    "uzhi": "之",
    "ulian": "连 (“连小学生都会”)",
    "e": "叹词",
    "y": "语气词",
    "o": "拟声词",
    "h": "前缀",
    "k": "后缀",
    "x": "字符串",
    "xe": "Email字符串",
    "xs": "微博会话分隔符",
    "xm": "表情符号",
    "xu": "网址URL",
    "w": "标点符号",
    "wkz": "左括号,全角:( 〔 [ { 《 【 〖 〈 半角:( [ { <",
    "wky": "右括号,全角:) 〕 ] } 》 】 〗 〉 半角: ) ] } >",
    "wyz": "左引号,全角:“ ‘ 『",
    "wyy": "右引号,全角:” ’ 』",
    "wj": "句号,全角:。",
    "ww": "问号,全角:? 半角:?",
    "wt": "叹号,全角:! 半角:!",
    "wd": "逗号,全角:, 半角:,",
    "wf": "分号,全角:; 半角: ;",
    "wn": "顿号,全角:、",
    "wm": "冒号,全角:: 半角: :",
    "ws": "省略号,全角:…… …",
    "wp": "破折号,全角:―― -- ――- 半角:--- ----",
    "wb": "百分号千分号,全角:% ‰ 半角:%",
    "wh": "单位符号,全角:¥ $ £ ° ℃ 半角:$",
}
|
|
647
|
+
|
|
548
648
|
# THULAC Chinese POS tag set: tag -> Chinese description of the tag.
thulac_tags = {
    # Nominals
    "n": "名词",
    "np": "人名",
    "ns": "地名",
    "ni": "机构名",
    "nz": "其它专名",
    # Numerals and measure words
    "m": "数词",
    "q": "量词",
    "mq": "数量词",
    # Time, direction, place
    "t": "时间词",
    "f": "方位词",
    "s": "处所词",
    # Predicates and modifiers
    "v": "动词",
    "a": "形容词",
    "d": "副词",
    # Affixes, idioms, abbreviations
    "h": "前接成分",
    "k": "后接成分",
    "i": "习语",
    "j": "简称",
    # Function words
    "r": "代词",
    "c": "连词",
    "p": "介词",
    "u": "助词",
    "y": "语气助词",
    "e": "叹词",
    "o": "拟声词",
    # Remaining categories
    "g": "语素",
    "w": "标点",
    "x": "其它",
}
|
|
549
649
|
|
|
550
650
|
# Two-letter language codes (upper-cased) -> [Chinese name, English name].
# NOTE: the Interlingue entry is keyed "IE"; it was previously keyed "I.E.",
# the only key that was not a plain two-letter code, so a lookup by the real
# code raised KeyError.
LangCodes = {
    'AA': ['阿法尔语', 'Afar'],
    'AB': ['阿布哈兹语', 'Abkhaz'],
    'AE': ['阿维斯陀语', 'Avestan'],
    'AF': ['阿非利堪斯语', 'Afrikaans'],
    'AK': ['阿坎语', 'Akan, Twi-Fante'],
    'AM': ['阿姆哈拉语', 'Amharic'],
    'AN': ['阿拉贡语', 'Aragonese'],
    'AR': ['阿拉伯语', 'Arabic'],
    'AS': ['阿萨姆语', 'Assamese'],
    'AV': ['阿瓦尔语', 'Avaric'],
    'AY': ['艾马拉语', 'Aymara'],
    'AZ': ['阿塞拜疆语', 'Azerbaijani'],
    'BA': ['巴什基尔语', 'Bashkir'],
    'BE': ['白俄罗斯语', 'Belarusian'],
    'BG': ['保加利亚语', 'Bulgarian'],
    'BH': ['比哈尔语', 'Bihari'],
    'BI': ['比斯拉玛语', 'Bislama'],
    'BM': ['班巴拉语', 'Bambara'],
    'BN': ['孟加拉语', 'Bengali'],
    'BO': ['藏语', 'Tibetan Standard, Central Tibetan'],
    'BR': ['布列塔尼语', 'Breton'],
    'BS': ['波斯尼亚语', 'Bosnian'],
    'CA': ['加泰隆语', 'Catalan;\xa0Valencian'],
    'CE': ['车臣语', 'Chechen'],
    'CH': ['查莫罗语', 'Chamorro'],
    'CO': ['科西嘉语', 'Corsican'],
    'CR': ['克里语', 'Cree'],
    'CS': ['捷克语', 'Czech'],
    'CU': ['教会斯拉夫语', 'Old Church Slavonic, Church Slavic, Church Slavonic, Old Bulgarian, Old Slavonic'],
    'CV': ['楚瓦什语', 'Chuvash'],
    'CY': ['威尔士语', 'Welsh'],
    'DA': ['丹麦语', 'Danish'],
    'DE': ['德语', 'German'],
    'DV': ['迪维希语', 'Divehi; Dhivehi; Maldivian;'],
    'DZ': ['不丹语', 'Dzongkha'],
    'EE': ['埃维语', 'Ewe'],
    'EL': ['现代希腊语', 'Greek, Modern'],
    'EN': ['英语', 'English'],
    'EO': ['世界语', 'Esperanto'],
    'ES': ['西班牙语', 'Spanish; Castilian'],
    'ET': ['爱沙尼亚语', 'Estonian'],
    'EU': ['巴斯克语', 'Basque'],
    'FA': ['波斯语', 'Persian'],
    'FF': ['富拉语', 'Fula; Fulah; Pulaar; Pular'],
    'FI': ['芬兰语', 'Finnish'],
    'FJ': ['斐济语', 'Fijian'],
    'FO': ['法罗斯语', 'Faroese'],
    'FR': ['法语', 'French'],
    'FY': ['弗里西亚语', 'Western Frisian'],
    'GA': ['爱尔兰语', 'Irish'],
    'GD': ['盖尔语(苏格兰语)', 'Scottish Gaelic; Gaelic'],
    'GL': ['加利西亚语', 'Galician'],
    'GN': ['瓜拉尼语', 'Guaraní'],
    'GU': ['古吉拉特语', 'Gujarati'],
    'GV': ['马恩岛语', 'Manx'],
    'HA': ['豪萨语', 'Hausa'],
    'HE': ['希伯来语', 'Hebrew\xa0(modern)'],
    'HI': ['印地语', 'Hindi'],
    'HO': ['希里莫图语', 'Hiri Motu'],
    'HR': ['克罗地亚语', 'Croatian'],
    'HT': ['海地克里奥尔语', 'Haitian; Haitian Creole'],
    'HU': ['匈牙利语', 'Hungarian'],
    'HY': ['亚美尼亚语', 'Armenian'],
    'HZ': ['赫雷罗语', 'Herero'],
    'IE': ['国际语E', 'Interlingue'],
    'IA': ['国际语A', 'Interlingua'],
    'ID': ['印尼语', 'Indonesian'],
    'IG': ['伊博语', 'Igbo'],
    'II': ['四川彝语(诺苏语)', 'Nuosu'],
    'IK': ['依努庇克语', 'Inupiaq'],
    'IO': ['伊多语', 'Ido'],
    'IS': ['冰岛语', 'Icelandic'],
    'IT': ['意大利语', 'Italian'],
    'IU': ['伊努伊特语', 'Inuktitut'],
    'JA': ['日语', 'Japanese'],
    'JV': ['爪哇语', 'Javanese'],
    'KA': ['格鲁吉亚语', 'Georgian'],
    'KG': ['刚果语', 'Kongo'],
    'KI': ['基库尤语', 'Kikuyu, Gikuyu'],
    'KJ': ['夸尼亚玛语', 'Kwanyama, Kuanyama'],
    'KK': ['哈萨克语', 'Kazakh'],
    'KL': ['格陵兰语', 'Kalaallisut, Greenlandic'],
    'KM': ['高棉语', 'Khmer, Cambodian'],
    'KN': ['坎纳达语', 'Kannada'],
    'KO': ['朝鲜语', 'Korean'],
    'KR': ['卡努里语', 'Kanuri'],
    'KS': ['克什米尔语', 'Kashmiri'],
    'KU': ['库尔德语', 'Kurdish'],
    'KV': ['科米语', 'Komi'],
    'KW': ['康沃尔语', 'Cornish'],
    'KY': ['吉尔吉斯语', 'Kirghiz, Kyrgyz'],
    'LA': ['拉丁语', 'Latin'],
    'LB': ['卢森堡语', 'Luxembourgish, Letzeburgesch'],
    'LG': ['干达语', 'Luganda'],
    'LI': ['林堡语', 'Limburgish, Limburgan, Limburger'],
    'LN': ['林加拉语', 'Lingala'],
    'LO': ['老挝语', 'Lao'],
    'LT': ['立陶宛语', 'Lithuanian'],
    'LU': ['卢巴—加丹加语', 'Luba-Katanga'],
    'LV': ['拉脱维亚语', 'Latvian'],
    'MG': ['马达加斯加语', 'Malagasy'],
    'MH': ['马绍尔语', 'Marshallese'],
    'MI': ['毛利语', 'Māori'],
    'MK': ['马其顿语', 'Macedonian'],
    'ML': ['马拉亚拉姆语', 'Malayalam'],
    'MN': ['蒙古语', 'Mongolian'],
    'MR': ['马拉提语', 'Marathi (Marāṭhī)'],
    'MS': ['马来语', 'Malay'],
    'MT': ['马耳他语', 'Maltese'],
    'MY': ['缅甸语', 'Burmese'],
    'NA': ['瑙鲁语', 'Nauru'],
    'NB': ['挪威布克摩尔语', 'Norwegian Bokmål'],
    'ND': ['北恩德贝勒语', 'North Ndebele'],
    'NE': ['尼泊尔语', 'Nepali'],
    'NG': ['恩敦加语', 'Ndonga'],
    'NL': ['荷兰语', 'Dutch'],
    'NN': ['尼诺斯克挪威语', 'Norwegian Nynorsk'],
    'NO': ['挪威语', 'Norwegian'],
    'NR': ['南恩德贝勒语', 'South Ndebele'],
    'NV': ['纳瓦霍语', 'Navajo, Navaho'],
    'NY': ['尼扬贾语', 'Chichewa; Chewa; Nyanja'],
    'OC': ['普罗旺斯语', 'Occitan'],
    'OJ': ['奥吉布瓦语', 'Ojibwe, Ojibwa'],
    'OM': ['阿芳•奥洛莫语', 'Oromo'],
    'OR': ['奥利亚语', 'Oriya'],
    'OS': ['奥塞梯语', 'Ossetian, Ossetic'],
    'PA': ['旁遮普语', 'Panjabi, Punjabi'],
    'PI': ['巴利语', 'Pāli'],
    'PL': ['波兰语', 'Polish'],
    'PS': ['普什图语', 'Pashto, Pushto'],
    'PT': ['葡萄牙语', 'Portuguese'],
    'QU': ['凯楚亚语', 'Quechua'],
    'RM': ['罗曼语', 'Romansh'],
    'RN': ['基隆迪语', 'Kirundi'],
    'RO': ['罗马尼亚语', 'Romanian,\xa0Moldavian, Moldovan'],
    'RU': ['俄语', 'Russian'],
    'RW': ['基尼阿万达语', 'Kinyarwanda'],
    'SA': ['梵语', 'Sanskrit (Saṁskṛta)'],
    'SC': ['撒丁语', 'Sardinian'],
    'SD': ['信德语', 'Sindhi'],
    'SE': ['北萨摩斯语', 'Northern Sami'],
    'SG': ['桑戈语', 'Sango'],
    'SI': ['僧加罗语', 'Sinhala, Sinhalese'],
    'SK': ['斯洛伐克语', 'Slovak'],
    'SL': ['斯洛文尼亚语', 'Slovene'],
    'SM': ['萨摩亚语', 'Samoan'],
    'SN': ['绍纳语', 'Shona'],
    'SO': ['索马里语', 'Somali'],
    'SQ': ['阿尔巴尼亚语', 'Albanian'],
    'SR': ['塞尔维亚语', 'Serbian'],
    'SS': ['塞斯瓦特语', 'Swati'],
    'ST': ['南索托语', 'Southern Sotho'],
    'SU': ['巽他语', 'Sundanese'],
    'SV': ['瑞典语', 'Swedish'],
    'SW': ['斯瓦希里语', 'Swahili'],
    'TA': ['泰米尔语', 'Tamil'],
    'TE': ['泰卢固语', 'Telugu'],
    'TG': ['塔吉克语', 'Tajik'],
    'TH': ['泰语', 'Thai'],
    'TI': ['提格里尼亚语', 'Tigrinya'],
    'TK': ['土库曼语', 'Turkmen'],
    'TL': ['他加禄语', 'Tagalog'],
    'TN': ['塞茨瓦纳语', 'Tswana'],
    'TO': ['汤加语', 'Tongan'],
    'TR': ['土耳其语', 'Turkish'],
    'TS': ['宗加语', 'Tsonga'],
    'TT': ['塔塔尔语', 'Tatar'],
    'TW': ['特威语', 'Twi'],
    'TY': ['塔希提语', 'Tahitian'],
    'UG': ['维吾尔语', 'Uighur, Uyghur'],
    'UK': ['乌克兰语', 'Ukrainian'],
    'UR': ['乌尔都语', 'Urdu'],
    'UZ': ['乌兹别克语', 'Uzbek'],
    'VE': ['文达语', 'Venda'],
    'VI': ['越南语', 'Vietnamese'],
    'VO': ['沃拉普克语', 'Volapük'],
    'WA': ['瓦隆语', 'Walloon'],
    'WO': ['沃洛夫语', 'Wolof'],
    'XH': ['科萨语', 'Xhosa'],
    'YI': ['依地语', 'Yiddish'],
    'YO': ['约鲁巴语', 'Yoruba'],
    'ZA': ['壮语', 'Zhuang, Chuang'],
    'ZH': ['汉语(中文)', 'Chinese'],
    'ZU': ['祖鲁语', 'Zulu'],
}
|
|
@@ -3900,4 +4000,30 @@ def resize_image(input_image_path, output_image_path, max_size_kb):
|
|
|
3900
4000
|
if size <= max_size_kb:
|
|
3901
4001
|
print(f"Image resized successfully to {size} KB.")
|
|
3902
4002
|
else:
|
|
3903
|
-
print("Could not reduce the image size below 2MB.")
|
|
4003
|
+
print("Could not reduce the image size below 2MB.")
|
|
4004
|
+
|
|
4005
|
+
import base64
|
|
4006
|
+
def convert_image_to_url(image_path: str) -> str:
    """
    Convert an image file to a base64-encoded ``data:`` URL.

    The media subtype is derived from the file extension. The extension is
    lower-cased and common aliases are mapped to their registered subtype
    (``jpg``/``jpe`` -> ``jpeg``, ``tif`` -> ``tiff``, ``svg`` -> ``svg+xml``),
    so ``photo.JPG`` yields ``data:image/jpeg;base64,...`` rather than the
    unregistered ``image/JPG``. Any other extension is used as the subtype
    unchanged (apart from lower-casing).

    :param image_path: Path to the image file.
    :return: A string of the form ``data:image/<subtype>;base64,<payload>``.
    :raises FileNotFoundError: If ``image_path`` is not an existing regular file.
    """
    # Imported locally so the function does not depend on where in the module
    # the corresponding top-level imports happen to be placed.
    import base64
    import os

    # Check if the file exists
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"The file {image_path} does not exist.")

    # Open and read the image file in binary mode
    with open(image_path, "rb") as f:
        image_data = f.read()

    # File extensions that differ from the registered image/* subtype name.
    subtype_aliases = {
        "jpg": "jpeg",
        "jpe": "jpeg",
        "tif": "tiff",
        "svg": "svg+xml",
    }

    # Extract the extension (without the dot) and normalise it to a subtype
    file_extension = os.path.splitext(image_path)[1][1:].lower()
    subtype = subtype_aliases.get(file_extension, file_extension)

    base64_image_data = base64.b64encode(image_data).decode('utf-8')

    # Create the image URL
    return f"data:image/{subtype};base64,{base64_image_data}"
|
|
4029
|
+
|
PgsFile/__init__.py
CHANGED
|
@@ -36,7 +36,7 @@ from .PgsFile import get_system_info
|
|
|
36
36
|
# 6. Data cleaning
|
|
37
37
|
from .PgsFile import BigPunctuation, StopTags, Special, yhd
|
|
38
38
|
from .PgsFile import ZhStopWords, EnPunctuation, get_stopwords, get_CET_dics, get_BNC_dic
|
|
39
|
-
from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words, usua_tag_set, claws_c7_tags, spacy_pos_tags
|
|
39
|
+
from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, ICTCLAS3, LangCodes, pgs_abbres_words, usua_tag_set, claws_c7_tags, spacy_pos_tags
|
|
40
40
|
from .PgsFile import check_contain_chinese, check_contain_number
|
|
41
41
|
from .PgsFile import replace_chinese_punctuation_with_english
|
|
42
42
|
from .PgsFile import replace_english_punctuation_with_chinese
|
|
@@ -64,5 +64,6 @@ from .PgsFile import timeit
|
|
|
64
64
|
from .PgsFile import replace_white_with_transparency
|
|
65
65
|
from .PgsFile import simhei_default_font_path_MacOS_Windows
|
|
66
66
|
from .PgsFile import get_font_path, resize_image
|
|
67
|
+
from .PgsFile import convert_image_to_url
|
|
67
68
|
|
|
68
69
|
name = "PgsFile"
|
PgsFile/models/NLPIR.user
CHANGED
|
Binary file
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
Your task is to process the given Chinese sentence or phrase through the following steps:
|
|
2
|
+
1. Tokenize the Chinese sentence or phrase into individual Chinese words.
|
|
3
|
+
2. Learn the Part-of-Speech (POS) tags and their meanings from the provided tagset: {tagset}.
|
|
4
|
+
3. Assign each word a POS tag from the provided tagset. If unsure, use "UNK" for unknown words.
|
|
5
|
+
|
|
6
|
+
Return the results in the specified format: each word joined to its POS tag by a single slash, like this: {example}.
|
|
7
|
+
Here is the sentence or phrase to process: {sent}.
|
|
8
|
+
Do not include any explanations or additional text in the output.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Perform OCR on the image and return all the extracted text, including both running paragraphs and tables, without any additional explanation. Ensure that tables are kept in their raw format as accurately as possible, without any content or arrangement changes.
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PgsFile
|
|
3
|
-
Version: 0.4.
|
|
4
|
-
Summary: This module simplifies Python package management, script execution, file handling, web scraping, and multimedia downloads. The module supports (LLM-based) NLP tasks such as tokenization, lemmatization, POS tagging, NER, keywords extraction, dependency parsing, MDD, WSD, LIWC, and MIP analysis. It also generates word lists, and plots data, aiding literary students. Ideal for scraping data, cleaning text, and analyzing language, it offers user-friendly tools to streamline workflows.
|
|
5
|
-
Home-page: https://mp.weixin.qq.com/s/
|
|
3
|
+
Version: 0.4.2
|
|
4
|
+
Summary: This module simplifies Python package management, script execution, file handling, web scraping, and multimedia downloads. The module supports (LLM-based) NLP tasks such as OCR, tokenization, lemmatization, POS tagging, NER, keywords extraction, dependency parsing, MDD, WSD, LIWC, and MIP analysis. It also generates word lists, and plots data, aiding literary students. Ideal for scraping data, cleaning text, and analyzing language, it offers user-friendly tools to streamline workflows.
|
|
5
|
+
Home-page: https://mp.weixin.qq.com/s/lWMkYDWQMjBJNKY2vMYTpw
|
|
6
6
|
Author: Pan Guisheng
|
|
7
7
|
Author-email: 895284504@qq.com
|
|
8
8
|
License: Educational free
|
|
@@ -34,7 +34,7 @@ Key Features:
|
|
|
34
34
|
4. **Data Storage:** Write and append data to text files, Excel, JSON, and JSON lines.
|
|
35
35
|
5. **File and Folder Processing:** Manage file paths, create directories, move or copy files, and search for files with specific keywords.
|
|
36
36
|
6. **Data Cleaning:** Clean text, handle punctuation, remove stopwords, and prepare data for analysis, utilizing valuable corpora and dictionaries such as CET-4/6 vocabulary and BNC-COCA word lists.
|
|
37
|
-
7. **NLP:** Perform word tokenization, lemmatization, POS tagging, NER, dependency parsing, keywords extraction, MDD, WSD, LIWC, and MIP analysis using prepared LLM prompts.
|
|
37
|
+
7. **NLP:** Perform OCR, word tokenization, lemmatization, POS tagging, NER, dependency parsing, keywords extraction, MDD, WSD, LIWC, and MIP analysis using prepared LLM prompts.
|
|
38
38
|
8. **Math Operations:** Format numbers, convert decimals to percentages, and validate data.
|
|
39
39
|
9. **Visualization:** Process images (e.g., make white pixels transparent, resize images) and manage fonts for rendering text.
|
|
40
40
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
PgsFile/PgsFile.py,sha256=
|
|
2
|
-
PgsFile/__init__.py,sha256=
|
|
1
|
+
PgsFile/PgsFile.py,sha256=piiVyMNNZoXLSuxbcg-5wXJYUgnEAirQUs4GEq7Neaw,165583
|
|
2
|
+
PgsFile/__init__.py,sha256=Qg7xt9wR83ySrS8z0Fy6xn5JttDLoEPLIhpnj7sWGdc,3553
|
|
3
3
|
PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
|
|
4
4
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
|
|
5
5
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
|
|
@@ -2571,7 +2571,7 @@ PgsFile/Corpora/Stopwords/turkish.txt,sha256=uGUvjEm2GR8PuVY_JeHNxhD7cWlNlF7vc3V
|
|
|
2571
2571
|
PgsFile/Corpora/Stopwords/ukrainian.txt,sha256=fEzWLTwnWJriILkO-5jSfE2SpqY-GPf_kR4zid3MFUI,4131
|
|
2572
2572
|
PgsFile/Corpora/Stopwords/vietnamese.txt,sha256=88yRtVMaRSFqas1iGGa6kOGDCZTgtzRPmR3q9dHshdc,20485
|
|
2573
2573
|
PgsFile/Corpora/Terminology/Chinese_Thought.json,sha256=CdkuF2wLaDC5V3sRefcU1RZwXm4-wTZ-Qfk8r7gsu8I,2301866
|
|
2574
|
-
PgsFile/models/NLPIR.user,sha256=
|
|
2574
|
+
PgsFile/models/NLPIR.user,sha256=jLOqi6EhYj9p7eGcTWECcmElwsvNWaUBTSkMeBsO_S4,3356
|
|
2575
2575
|
PgsFile/models/model_reviews2.2.bin,sha256=D6uL8KZIxD0rfWjH0kYEb7z_HE4aTJXpj82HzsCOpuk,1943196
|
|
2576
2576
|
PgsFile/models/model_reviews_ReadMe.txt,sha256=Q9uLJwudMmsTKfd11l1tOcIP8lwsemIwnAVJG_3SYjU,11433
|
|
2577
2577
|
PgsFile/models/dics/BNC_COCA_lists.xlsx,sha256=ua5iQzEf5UQpsCezbsliNF6e_PYHIHGSJUjn9MyEEks,1229313
|
|
@@ -2586,8 +2586,10 @@ PgsFile/models/fonts/博洋行书3500.TTF,sha256=VrgeHr8cgOL6JD05QyuD9ZSyw4J2aIV
|
|
|
2586
2586
|
PgsFile/models/fonts/陆柬之行书字体.ttf,sha256=Zpd4Z7E9w-Qy74yklXHk4vM7HOtHuQgllvygxZZ1Hvs,1247288
|
|
2587
2587
|
PgsFile/models/prompts/1. MIP prompt.txt,sha256=4lHlHmleayRytqr1n9jtt6vn1rQvyf4BKeThpbwI8o8,1638
|
|
2588
2588
|
PgsFile/models/prompts/2. WSD prompt.txt,sha256=o-ZFtCRUCDrXgm040WTQch9v2Y_r2SIlrZaquilJjgQ,2348
|
|
2589
|
-
PgsFile
|
|
2590
|
-
PgsFile
|
|
2591
|
-
PgsFile-0.4.
|
|
2592
|
-
PgsFile-0.4.
|
|
2593
|
-
PgsFile-0.4.
|
|
2589
|
+
PgsFile/models/prompts/3. ICTCLAS Prompt.txt,sha256=VFn6N_JViAbyy9NazA8gjX6SGo5mgBcZOf95aC9JB84,592
|
|
2590
|
+
PgsFile/models/prompts/4. OCR prompt.txt,sha256=YxUQ2IlE52k0fcBnGsuOHqWAmfiEmIu6iRz5zecQ8dk,260
|
|
2591
|
+
PgsFile-0.4.2.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
|
|
2592
|
+
PgsFile-0.4.2.dist-info/METADATA,sha256=ETpekk4cnT1bC3ZCTlCT7U7mldyOUNO1AW73-L1CUZw,2967
|
|
2593
|
+
PgsFile-0.4.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
2594
|
+
PgsFile-0.4.2.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
|
|
2595
|
+
PgsFile-0.4.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|