PgsFile 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic. Click here for more details.
- PgsFile/PgsFile.py +58 -0
- PgsFile/__init__.py +2 -1
- {PgsFile-0.2.0.dist-info → PgsFile-0.2.1.dist-info}/METADATA +3 -2
- {PgsFile-0.2.0.dist-info → PgsFile-0.2.1.dist-info}/RECORD +7 -7
- {PgsFile-0.2.0.dist-info → PgsFile-0.2.1.dist-info}/LICENSE +0 -0
- {PgsFile-0.2.0.dist-info → PgsFile-0.2.1.dist-info}/WHEEL +0 -0
- {PgsFile-0.2.0.dist-info → PgsFile-0.2.1.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
|
@@ -591,6 +591,24 @@ def remove_empty_folders(folder_path):
|
|
|
591
591
|
print(delet_root)
|
|
592
592
|
print("Folders removed: ",len(delet_root))
|
|
593
593
|
|
|
594
|
+
def concatenate_excel_files(directory_path, output_file):
    """Combine the Excel workbooks found in a folder into one Excel file.

    Every ``.xlsx``/``.xls`` file directly inside ``directory_path`` is read
    with ``pd.read_excel`` using its default arguments, the resulting
    DataFrames are stacked row-wise with a fresh index, and the result is
    written to ``output_file`` without the index column.

    Parameters
    ----------
    directory_path : str
        Folder to scan. Sub-folders are not searched.
    output_file : str
        Path of the combined workbook to create.

    Raises
    ------
    ValueError
        If the folder contains no Excel file to combine.
    """
    output_abs = os.path.abspath(output_file)
    excel_paths = []

    # Sort the names so the row order of the result is reproducible;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(directory_path)):
        # Extension match is case-insensitive so "REPORT.XLSX" is picked up.
        if not filename.lower().endswith(('.xlsx', '.xls')):
            continue
        # "~$name.xlsx" is the lock file Excel leaves next to an open
        # workbook; it is not a readable workbook.
        if filename.startswith('~$'):
            continue
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue
        # When the output lives in the scanned folder, a second run must not
        # fold the previous result back into itself.
        if os.path.abspath(file_path) == output_abs:
            continue
        excel_paths.append(file_path)

    # pd.concat fails with an unhelpful "No objects to concatenate" on an
    # empty list, so report the real cause instead.
    if not excel_paths:
        raise ValueError(
            f"No Excel files (.xlsx/.xls) found in {directory_path}"
        )

    dataframes = [pd.read_excel(path) for path in excel_paths]

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Write the combined DataFrame to a new Excel file
    combined_df.to_excel(output_file, index=False)
    print(f"Combined Excel file saved as {output_file}")
|
|
611
|
+
|
|
594
612
|
def remove_empty_lines(folder_path):
|
|
595
613
|
files=FilePath(folder_path)
|
|
596
614
|
for file in files:
|
|
@@ -735,6 +753,46 @@ def cs1(text):
|
|
|
735
753
|
sentences=sentences
|
|
736
754
|
return sentences
|
|
737
755
|
|
|
756
|
+
def word_tokenize(text, pos_tagged=False):
    '''
    Segment text into words with the NLPIR/ICTCLAS tokenizer.

    The ``nlpir`` package is imported on first use; if that import fails the
    function tries to install ``nlpir-python`` through
    ``PgsFile.install_package`` and imports it again. When the tokenizer
    reports an expired license, the license is refreshed with
    ``nlpir.tools.update_license()`` and segmentation is attempted once more.
    Any other failure is printed and ``None`` is returned.

    Parameters
    ----------
    text : str
        Text to segment, e.g.
        "无独有偶,这个消息如晴天霹雳,霍尔姆斯听到后不知所措。中国电影家协会和中国作家协会,中国翻译协会是做慈善的。"
    pos_tagged : bool, optional
        Forwarded to ``ictclas.segment`` as ``pos_tagged``. The default is
        False.

    Returns
    -------
    words : list or None
        Whatever ``ictclas.segment`` returns, e.g. for the text above
        ['无独有偶', ',', '这个', '消息', '如', '晴天霹雳', ',', '霍尔姆斯', '听到', '后', '不知所措', '。', '中国', '电影', '家', '协会', '和', '中国', '作家', '协会', ',', '中国', '翻译', '协会', '是', '做', '慈善', '的', '。', ''].
        ``None`` when the tokenizer could not be loaded or run.

    '''
    # Normalise once so falsy values such as 0 or None mean "no tags".
    want_tags = bool(pos_tagged)
    words = None

    # At most two attempts: the second one only happens after the license
    # has been refreshed successfully.
    for attempt in range(2):
        try:
            try:
                # ICTCLAS tokenizer from the Chinese Academy of Sciences
                from nlpir import ictclas
            except Exception:
                print("installing nlpir/ICTCLAS...")
                from PgsFile import install_package as ip
                ip("nlpir-python")

                from nlpir import ictclas
            words = ictclas.segment(text, pos_tagged=want_tags)
            break
        except Exception as err:
            if "expired" not in str(err) or attempt == 1:
                print(err)
                break
            try:
                from nlpir import tools
                tools.update_license()
            except Exception as err2:
                print("You need a VPN to try this service!", err2)
                break
            # NOTE(review): assumes nlpir picks up the refreshed license on
            # the next segment() call within the same process - confirm.
    return words
|
|
795
|
+
|
|
738
796
|
def pad_sequence(
|
|
739
797
|
sequence,
|
|
740
798
|
n,
|
PgsFile/__init__.py
CHANGED
|
@@ -23,6 +23,7 @@ from .PgsFile import get_subfolder_path
|
|
|
23
23
|
from .PgsFile import makedirec, makefile
|
|
24
24
|
from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
|
|
25
25
|
from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
|
|
26
|
+
from .PgsFile import concatenate_excel_files
|
|
26
27
|
|
|
27
28
|
# 6. Data cleaning
|
|
28
29
|
from .PgsFile import BigPunctuation, StopTags, Special, yhd
|
|
@@ -38,7 +39,7 @@ from .PgsFile import extract_chinese_punctuation, generate_password, sort_string
|
|
|
38
39
|
from .PgsFile import strQ2B_raw, strQ2B_words
|
|
39
40
|
from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
|
|
40
41
|
from .PgsFile import word_list, batch_word_list
|
|
41
|
-
from .PgsFile import cs, cs1, sent_tokenize
|
|
42
|
+
from .PgsFile import cs, cs1, sent_tokenize, word_tokenize
|
|
42
43
|
|
|
43
44
|
# 8. Maths
|
|
44
45
|
from .PgsFile import len_rows, check_empty_cells
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PgsFile
|
|
3
|
-
Version: 0.2.0
|
|
4
|
-
Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, NLP tasks, and word list generation for literary students, making it more accessible and convenient to use.
|
|
5
5
|
Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
|
|
6
6
|
Author: Pan Guisheng
|
|
7
7
|
Author-email: 895284504@qq.com
|
|
@@ -20,6 +20,7 @@ Requires-Dist: fake-useragent
|
|
|
20
20
|
Requires-Dist: lxml
|
|
21
21
|
Requires-Dist: pimht
|
|
22
22
|
Requires-Dist: pysbd
|
|
23
|
+
Requires-Dist: nlpir-python
|
|
23
24
|
|
|
24
25
|
Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
|
|
25
26
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
PgsFile/PgsFile.py,sha256=
|
|
2
|
-
PgsFile/__init__.py,sha256
|
|
1
|
+
PgsFile/PgsFile.py,sha256=o6J3tipdCBkA0JvUz6vckZH_YlAgQmlAGQOMKdXb95M,82975
|
|
2
|
+
PgsFile/__init__.py,sha256=-Vy1SIh-BYopiEan-EjBtwqZsNteNrOqkws7hUj1d2w,2378
|
|
3
3
|
PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
|
|
4
4
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
|
|
5
5
|
PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
|
|
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
|
|
|
2618
2618
|
PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
|
|
2619
2619
|
PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
|
|
2620
2620
|
PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
|
|
2621
|
-
PgsFile-0.2.
|
|
2622
|
-
PgsFile-0.2.
|
|
2623
|
-
PgsFile-0.2.
|
|
2624
|
-
PgsFile-0.2.
|
|
2625
|
-
PgsFile-0.2.
|
|
2621
|
+
PgsFile-0.2.1.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
|
|
2622
|
+
PgsFile-0.2.1.dist-info/METADATA,sha256=PCrjMATNQrsqPfsVVC15cmOinp-o3HYR88kLMcsn2lA,4999
|
|
2623
|
+
PgsFile-0.2.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
2624
|
+
PgsFile-0.2.1.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
|
|
2625
|
+
PgsFile-0.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|