PgsFile 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of PgsFile might be problematic. Click here for more details.

PgsFile/PgsFile.py CHANGED
@@ -591,6 +591,24 @@ def remove_empty_folders(folder_path):
591
591
  print(delet_root)
592
592
  print("Folders removed: ",len(delet_root))
593
593
 
594
def concatenate_excel_files(directory_path, output_file):
    """
    Combine every Excel workbook found in a folder into one Excel file.

    Parameters
    ----------
    directory_path : str
        Folder that is scanned (non-recursively) for ``.xlsx`` / ``.xls``
        files. The extension check is case-insensitive.
    output_file : str
        Path of the combined workbook to write.

    Returns
    -------
    None
        The result is written to ``output_file`` and a confirmation is printed.

    Raises
    ------
    ValueError
        If the folder contains no Excel file to combine.

    Notes
    -----
    Each workbook is loaded with ``pd.read_excel`` using its defaults, so only
    the first sheet of every file is read.
    """
    output_abs = os.path.normcase(os.path.abspath(output_file))
    dataframes = []

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the combined file reproducible between runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith(('.xlsx', '.xls')):
            continue
        # "~$<name>.xlsx" are lock files Excel creates while a workbook is
        # open; they are not readable workbooks.
        if filename.startswith('~$'):
            continue
        file_path = os.path.join(directory_path, filename)
        # A combined file left in this folder by an earlier run must not be
        # fed back into itself, otherwise rows are duplicated on every run.
        if os.path.normcase(os.path.abspath(file_path)) == output_abs:
            continue
        dataframes.append(pd.read_excel(file_path))

    if not dataframes:
        # pd.concat([]) would fail with the opaque "No objects to concatenate".
        raise ValueError(f"No Excel files (.xlsx/.xls) found in {directory_path}")

    # Stack all DataFrames and renumber the index from 0.
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Write the combined DataFrame to a new Excel file (without the index column).
    combined_df.to_excel(output_file, index=False)
    print(f"Combined Excel file saved as {output_file}")
611
+
594
612
  def remove_empty_lines(folder_path):
595
613
  files=FilePath(folder_path)
596
614
  for file in files:
@@ -735,6 +753,46 @@ def cs1(text):
735
753
  sentences=sentences
736
754
  return sentences
737
755
 
756
def word_tokenize(text, pos_tagged=False):
    '''
    Segment Chinese text into words with the NLPIR/ICTCLAS tokenizer.

    Parameters
    ----------
    text : str
        Text to segment, like: "无独有偶,这个消息如晴天霹雳,霍尔姆斯听到后不知所措。中国电影家协会和中国作家协会,中国翻译协会是做慈善的。"
    pos_tagged : bool, optional
        Forwarded to ``ictclas.segment``. Any value other than the literal
        ``False`` requests tagged output. The default is False.

    Returns
    -------
    words : list or None
        Whatever ``ictclas.segment`` returns; for ``pos_tagged=False`` a list like:
        ['无独有偶', ',', '这个', '消息', '如', '晴天霹雳', ',', '霍尔姆斯', '听到', '后', '不知所措', '。', '中国', '电影', '家', '协会', '和', '中国', '作家', '协会', ',', '中国', '翻译', '协会', '是', '做', '慈善', '的', '。', '']
        (the tagged shape is presumably (word, tag) pairs -- confirm against
        the nlpir-python documentation).
        ``None`` when the tokenizer could not be loaded or segmentation
        failed; the reason is printed instead of raised.

    '''
    words = None
    try:
        try:
            # Use the Chinese Academy of Sciences tokenizer ICTCLAS.
            from nlpir import ictclas
        except ImportError:
            # Only a missing package can be cured by installing it; any other
            # failure goes straight to the handler below.
            print("installing nlpir/ICTCLAS...")
            from PgsFile import install_package as ip
            ip("nlpir-python")
            from nlpir import ictclas

        # Keep the historical rule: only the literal False disables tagging.
        words = ictclas.segment(text, pos_tagged=pos_tagged is not False)
    except Exception as err:
        if "expired" in str(err):
            # The bundled NLPIR license has run out; try to download a new one.
            try:
                from nlpir import tools
                tools.update_license()
            except Exception as err2:
                print("You need a VPN to try this service!", err2)
            else:
                # Without this message the caller just gets None and has no
                # way to know that a second call is expected to work.
                print("NLPIR license updated. Please run word_tokenize again; "
                      "if the error persists, restart Python first.")
        else:
            print(err)
    return words
795
+
738
796
  def pad_sequence(
739
797
  sequence,
740
798
  n,
PgsFile/__init__.py CHANGED
@@ -23,6 +23,7 @@ from .PgsFile import get_subfolder_path
23
23
  from .PgsFile import makedirec, makefile
24
24
  from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
25
25
  from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
26
+ from .PgsFile import concatenate_excel_files
26
27
 
27
28
  # 6. Data cleaning
28
29
  from .PgsFile import BigPunctuation, StopTags, Special, yhd
@@ -38,7 +39,7 @@ from .PgsFile import extract_chinese_punctuation, generate_password, sort_string
38
39
  from .PgsFile import strQ2B_raw, strQ2B_words
39
40
  from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
40
41
  from .PgsFile import word_list, batch_word_list
41
- from .PgsFile import cs, cs1, sent_tokenize
42
+ from .PgsFile import cs, cs1, sent_tokenize, word_tokenize
42
43
 
43
44
  # 8. Maths
44
45
  from .PgsFile import len_rows, check_empty_cells
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PgsFile
3
- Version: 0.2.0
4
- Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
3
+ Version: 0.2.1
4
+ Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, NLP tasks, and word list generation for literary students, making it more accessible and convenient to use.
5
5
  Home-page: https://mp.weixin.qq.com/s/12-KVLfaPszoZkCxuRd-nQ?token=1589547443&lang=zh_CN
6
6
  Author: Pan Guisheng
7
7
  Author-email: 895284504@qq.com
@@ -20,6 +20,7 @@ Requires-Dist: fake-useragent
20
20
  Requires-Dist: lxml
21
21
  Requires-Dist: pimht
22
22
  Requires-Dist: pysbd
23
+ Requires-Dist: nlpir-python
23
24
 
24
25
  Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
25
26
 
@@ -1,5 +1,5 @@
1
- PgsFile/PgsFile.py,sha256=1-PR4NO2FF7lO8_lQmSP_VLVs6pV8jC_5_nOdGvwuhk,80684
2
- PgsFile/__init__.py,sha256=Tbr3MaFP7ZqhwVaYAnBJx7UBJhM4c884F8sFMQjfzXU,2317
1
+ PgsFile/PgsFile.py,sha256=o6J3tipdCBkA0JvUz6vckZH_YlAgQmlAGQOMKdXb95M,82975
2
+ PgsFile/__init__.py,sha256=-Vy1SIh-BYopiEan-EjBtwqZsNteNrOqkws7hUj1d2w,2378
3
3
  PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
4
4
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
5
5
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
2618
2618
  PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
2619
2619
  PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
2620
2620
  PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
2621
- PgsFile-0.2.0.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
- PgsFile-0.2.0.dist-info/METADATA,sha256=u-nzDLhOIJYZ-nOp9FpE5EFWsjW3683viOehOfqQIvs,4959
2623
- PgsFile-0.2.0.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
- PgsFile-0.2.0.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
- PgsFile-0.2.0.dist-info/RECORD,,
2621
+ PgsFile-0.2.1.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
+ PgsFile-0.2.1.dist-info/METADATA,sha256=PCrjMATNQrsqPfsVVC15cmOinp-o3HYR88kLMcsn2lA,4999
2623
+ PgsFile-0.2.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
+ PgsFile-0.2.1.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
+ PgsFile-0.2.1.dist-info/RECORD,,