PgsFile 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of PgsFile might be problematic. Click here for more details.

PgsFile/PgsFile.py CHANGED
@@ -358,6 +358,20 @@ def write_to_json_lines(json_path,my_json_data):
358
358
  file.write(json_str + '\n')
359
359
  file.close()
360
360
 
361
+
362
+ # Function to append a dictionary to a JSON file
363
+ def append_dict_to_json(file_path, data_dict):
364
+ try:
365
+ import json
366
+ with open(file_path, 'a', encoding="utf-8") as file:
367
+ json_string = json.dumps(data_dict, ensure_ascii=False)
368
+ file.write(json_string + '\n')
369
+ # print(f"Dictionary appended to {file_path}")
370
+ except IOError as e:
371
+ print(f"An I/O error occurred: {e}")
372
+ except Exception as e:
373
+ print(f"An error occurred: {e}")
374
+
361
375
  def FilePath(root):
362
376
  '''读取所有文件,列出每个文件的路径'''
363
377
  import os
@@ -553,19 +567,6 @@ def next_folder_names(folder):
553
567
  folder_namelist=next(os.walk(folder))[1]
554
568
  return folder_namelist
555
569
 
556
- def get_package_path(package_name):
557
- import site
558
- import os
559
- package_paths=site.getsitepackages()
560
- package_path=None
561
- for path in package_paths:
562
- if os.path.exists(os.path.join(path, package_name)):
563
- package_path=os.path.join(path, package_name)
564
- break
565
-
566
- if package_path is None:
567
- raise ModuleNotFoundError(f"Package '{package_name}' not found.")
568
- return package_path
569
570
 
570
571
  def remove_empty_txts(folder_path):
571
572
  import os
@@ -631,77 +632,6 @@ def remove_empty_last_line(folder_path):
631
632
  f2.write(lines[i])
632
633
  f2.close()
633
634
  print(end_empty_files,str(len(end_empty_files))+" files found with last line empty!")
634
-
635
- corpus_root=get_package_path('PgsFile')+"/Corpora"
636
- def extract_stopwords(lang=None):
637
- '''
638
- Parameters
639
- ----------
640
- lang : TYPE, optional string
641
- DESCRIPTION. The default is None.
642
- lang="english"; lang="chinese" etc.
643
-
644
- Returns
645
- -------
646
- contents : TYPE list
647
- DESCRIPTION. ["'ll", "'tis", "'twas", "'ve", '10', '39', 'a', "a's", 'able', 'ableabout', 'about', 'above', 'abroad', 'abst', 'accordance', 'according']
648
-
649
- '''
650
- import os
651
- # Check if the folder exists
652
- if not os.path.isdir(corpus_root):
653
- print(f"Error: The folder '{corpus_root}' does not exist.")
654
- return None
655
-
656
- if lang is None:
657
- language="english"
658
- else:
659
- language=lang
660
- file_name=language+".txt"
661
-
662
- # Traverse the folder recursively
663
- for root, dirs, files in os.walk(corpus_root):
664
- # Check if the text file exists in the current folder
665
- if file_name in files:
666
- # Construct the full path to the text file
667
- file_path=os.path.join(root, file_name)
668
- # Read the contents of the text file
669
- contents=[line.strip() for line in get_data_lines(file_path)]
670
- return contents
671
-
672
- # If the text file doesn't exist in any folder, print an error message
673
- print(f"Error: The file '{file_name}' does not exist in the folder '{corpus_root}' or its sub-folders.")
674
- return None
675
-
676
- pickle_root=get_package_path('PgsFile')+"/models"
677
- def load_pickle_data(lang=None):
678
- '''
679
- Parameters
680
- ----------
681
- lang : TYPE, optional
682
- DESCRIPTION. The default is None.
683
- lang="english"; lang="chinese" etc.
684
- Returns
685
- -------
686
- data : TYPE
687
- DESCRIPTION.
688
-
689
- '''
690
- import pickle
691
- files=FilePath(pickle_root)
692
- if lang is None:
693
- language="english"
694
- else:
695
- language=lang
696
- file_path=""
697
- for file in files:
698
- if language in FileName(file):
699
- file_path=file
700
- with open(file_path, 'rb') as handle:
701
- data=pickle.load(handle)
702
- return data
703
-
704
-
705
635
 
706
636
  def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
707
637
  """
@@ -731,8 +661,14 @@ def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
731
661
 
732
662
  # Standard sentence tokenizer.
733
663
  def sent_tokenize(text, lang=None):
734
- tokenizer=load_pickle_data(lang)
735
- return tokenizer.tokenize(text)
664
+ import pysbd
665
+ if lang is None:
666
+ lang="en"
667
+ else:
668
+ lang=lang
669
+ seg = pysbd.Segmenter(language=lang, clean=False)
670
+ sent_list = seg.segment(text)
671
+ return sent_list
736
672
 
737
673
  def cs(para):
738
674
  """
@@ -757,18 +693,7 @@ def cs(para):
757
693
  return paras
758
694
 
759
695
 
760
- def cs1(para): #英文分句
761
- """
762
- #英文分句
763
- using nltk model
764
- ---------
765
- Returns
766
- list
767
- """
768
- return sent_tokenize(para)
769
-
770
-
771
- def cs2(text):
696
+ def cs1(text):
772
697
  """
773
698
  #英文分句
774
699
  using regular expression
@@ -811,7 +736,7 @@ def cs2(text):
811
736
  sentences=sentences[:-1]
812
737
  sentences=[s.strip() for s in sentences]
813
738
  if len(sentences)==0:
814
- sentences=cs1(text)
739
+ sentences=sent_tokenize(text)
815
740
  else:
816
741
  sentences=sentences
817
742
  return sentences
PgsFile/__init__.py CHANGED
@@ -12,18 +12,18 @@ from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspe
12
12
  from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
13
13
 
14
14
  # 4. Text data storage
15
- from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, save_dict_to_excel
15
+ from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
16
16
 
17
17
  # 5. File/folder process
18
18
  from .PgsFile import FilePath, FileName, DirList
19
- from .PgsFile import get_subfolder_path, get_package_path
19
+ from .PgsFile import get_subfolder_path
20
20
  from .PgsFile import makedirec, makefile
21
- from .PgsFile import source_path, next_folder_names, corpus_root, get_directory_tree_with_meta, find_txt_files_with_keyword
21
+ from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
22
22
  from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
23
23
 
24
24
  # 6. Data cleaning
25
25
  from .PgsFile import BigPunctuation, StopTags, Special, yhd
26
- from .PgsFile import ZhStopWords, EnPunctuation, extract_stopwords
26
+ from .PgsFile import ZhStopWords, EnPunctuation
27
27
  from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
28
28
  from .PgsFile import check_contain_chinese, check_contain_number
29
29
  from .PgsFile import replace_chinese_punctuation_with_english
@@ -35,7 +35,7 @@ from .PgsFile import extract_chinese_punctuation, generate_password, sort_string
35
35
  from .PgsFile import strQ2B_raw, strQ2B_words
36
36
  from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
37
37
  from .PgsFile import word_list, batch_word_list
38
- from .PgsFile import cs, cs1, cs2
38
+ from .PgsFile import cs, cs1, sent_tokenize
39
39
 
40
40
  # 8. Maths
41
41
  from .PgsFile import len_rows, check_empty_cells
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PgsFile
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
5
5
  Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
6
6
  Author: Pan Guisheng
@@ -19,6 +19,7 @@ Requires-Dist: pip
19
19
  Requires-Dist: requests
20
20
  Requires-Dist: lxml
21
21
  Requires-Dist: pimht
22
+ Requires-Dist: pysbd
22
23
 
23
24
  Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
24
25
 
@@ -1,5 +1,5 @@
1
- PgsFile/PgsFile.py,sha256=jmSiczDE5cV47tHpCGDwLn19C90NGQtQ2vEn4ys4NUg,80514
2
- PgsFile/__init__.py,sha256=EKhIRd2tktjyrvBlBPgQsIJTqU7DdLIobNG8gEiZ--0,2163
1
+ PgsFile/PgsFile.py,sha256=6CXBDn3VC4gUkigNVCkM9eVPOe4Xyww32tG0ZDeYNfI,78446
2
+ PgsFile/__init__.py,sha256=TaKrLI0pGAFm_2Bzjf_cGnog_URzaAgHRW5myzY0Lz8,2144
3
3
  PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
4
4
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
5
5
  PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
2618
2618
  PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
2619
2619
  PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
2620
2620
  PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
2621
- PgsFile-0.1.6.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
- PgsFile-0.1.6.dist-info/METADATA,sha256=T0mBPq7PnljEcGjLItIJ3RIcZk7veOuy0vVgLuo31lo,4902
2623
- PgsFile-0.1.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
- PgsFile-0.1.6.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
- PgsFile-0.1.6.dist-info/RECORD,,
2621
+ PgsFile-0.1.7.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
2622
+ PgsFile-0.1.7.dist-info/METADATA,sha256=0HAA5A68yHiB-LVlNuF-pkKo_lawzwTU-Thf-i2FiUY,4924
2623
+ PgsFile-0.1.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
2624
+ PgsFile-0.1.7.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
2625
+ PgsFile-0.1.7.dist-info/RECORD,,