PgsFile-0.1.6-py3-none-any.whl → PgsFile-0.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of PgsFile might be problematic.

PgsFile/PgsFile.py CHANGED
@@ -358,6 +358,20 @@ def write_to_json_lines(json_path,my_json_data):
         file.write(json_str + '\n')
     file.close()
 
+
+# Function to append a dictionary to a JSON Lines file
+def append_dict_to_json(file_path, data_dict):
+    try:
+        import json
+        with open(file_path, 'a', encoding="utf-8") as file:
+            json_string = json.dumps(data_dict, ensure_ascii=False)
+            file.write(json_string + '\n')
+        # print(f"Dictionary appended to {file_path}")
+    except IOError as e:
+        print(f"An I/O error occurred: {e}")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
 def FilePath(root):
     '''Read all files and list the path of each one'''
     import os
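The new append_dict_to_json writes one JSON object per call, each on its own line, so the output is effectively a JSON Lines file. A minimal usage sketch (file name and records here are hypothetical):

    from PgsFile import append_dict_to_json

    records = [{"word": "serendipity", "freq": 3}, {"word": "halcyon", "freq": 1}]
    for record in records:
        # Each call appends one line to wordlist.jsonl
        append_dict_to_json("wordlist.jsonl", record)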
@@ -553,19 +567,6 @@ def next_folder_names(folder):
     folder_namelist=next(os.walk(folder))[1]
     return folder_namelist
 
-def get_package_path(package_name):
-    import site
-    import os
-    package_paths=site.getsitepackages()
-    package_path=None
-    for path in package_paths:
-        if os.path.exists(os.path.join(path, package_name)):
-            package_path=os.path.join(path, package_name)
-            break
-
-    if package_path is None:
-        raise ModuleNotFoundError(f"Package '{package_name}' not found.")
-    return package_path
 
 def remove_empty_txts(folder_path):
     import os
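get_package_path is removed in 0.1.8, along with the corpus and pickle helpers built on it (next hunk). For code that still needs a package's install location, a rough stand-in using only the standard library might look like this (a sketch, not part of PgsFile):

    import importlib.util
    import os

    def package_path(package_name):
        # Locate the package via the import system instead of scanning
        # site-packages; mirrors the old ModuleNotFoundError behaviour.
        spec = importlib.util.find_spec(package_name)
        if spec is None or spec.origin is None:
            raise ModuleNotFoundError(f"Package '{package_name}' not found.")
        return os.path.dirname(spec.origin)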
@@ -631,77 +632,6 @@ def remove_empty_last_line(folder_path):
             f2.write(lines[i])
         f2.close()
     print(end_empty_files,str(len(end_empty_files))+" files found with last line empty!")
-
-corpus_root=get_package_path('PgsFile')+"/Corpora"
-def extract_stopwords(lang=None):
-    '''
-    Parameters
-    ----------
-    lang : TYPE, optional string
-        DESCRIPTION. The default is None.
-        lang="english"; lang="chinese" etc.
-
-    Returns
-    -------
-    contents : TYPE list
-        DESCRIPTION. ["'ll", "'tis", "'twas", "'ve", '10', '39', 'a', "a's", 'able', 'ableabout', 'about', 'above', 'abroad', 'abst', 'accordance', 'according']
-
-    '''
-    import os
-    # Check if the folder exists
-    if not os.path.isdir(corpus_root):
-        print(f"Error: The folder '{corpus_root}' does not exist.")
-        return None
-
-    if lang is None:
-        language="english"
-    else:
-        language=lang
-    file_name=language+".txt"
-
-    # Traverse the folder recursively
-    for root, dirs, files in os.walk(corpus_root):
-        # Check if the text file exists in the current folder
-        if file_name in files:
-            # Construct the full path to the text file
-            file_path=os.path.join(root, file_name)
-            # Read the contents of the text file
-            contents=[line.strip() for line in get_data_lines(file_path)]
-            return contents
-
-    # If the text file doesn't exist in any folder, print an error message
-    print(f"Error: The file '{file_name}' does not exist in the folder '{corpus_root}' or its sub-folders.")
-    return None
-
-pickle_root=get_package_path('PgsFile')+"/models"
-def load_pickle_data(lang=None):
-    '''
-    Parameters
-    ----------
-    lang : TYPE, optional
-        DESCRIPTION. The default is None.
-        lang="english"; lang="chinese" etc.
-    Returns
-    -------
-    data : TYPE
-        DESCRIPTION.
-
-    '''
-    import pickle
-    files=FilePath(pickle_root)
-    if lang is None:
-        language="english"
-    else:
-        language=lang
-    file_path=""
-    for file in files:
-        if language in FileName(file):
-            file_path=file
-    with open(file_path, 'rb') as handle:
-        data=pickle.load(handle)
-    return data
-
-
 
 def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
     """
@@ -731,8 +661,14 @@ def find_txt_files_with_keyword(root_folder, keyword, case_sensitive=None):
 
 # Standard sentence tokenizer.
 def sent_tokenize(text, lang=None):
-    tokenizer=load_pickle_data(lang)
-    return tokenizer.tokenize(text)
+    import pysbd
+    if lang is None:
+        lang="en"
+    else:
+        lang=lang
+    seg = pysbd.Segmenter(language=lang, clean=False)
+    sent_list = seg.segment(text)
+    return sent_list
 
 def cs(para):
     """
@@ -757,18 +693,7 @@ def cs(para):
     return paras
 
 
-def cs1(para): # English sentence segmentation
-    """
-    # English sentence segmentation
-    using nltk model
-    ---------
-    Returns
-    list
-    """
-    return sent_tokenize(para)
-
-
-def cs2(text):
+def cs1(text):
     """
     # English sentence segmentation
     using regular expression
@@ -811,7 +736,7 @@ def cs2(text):
     sentences=sentences[:-1]
     sentences=[s.strip() for s in sentences]
     if len(sentences)==0:
-        sentences=cs1(text)
+        sentences=sent_tokenize(text)
     else:
         sentences=sentences
     return sentences
@@ -1246,6 +1171,11 @@ import requests
 from lxml import html, etree
 import pandas as pd
 my_headers={"User-Agent": random.choice(yhd)}
+
+from fake_useragent import UserAgent
+ua = UserAgent()
+headers = {"User-Agent": ua.random}
+
 class PGScraper(object):
     def __init__(self):
         self.pattern=[]
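Note that headers is built once at import time, so every call that reuses it sends the same randomized User-Agent for the life of the process. To rotate per request, draw ua.random again (a sketch; the URLs are placeholders):

    import requests
    from fake_useragent import UserAgent

    ua = UserAgent()
    for url in ["https://example.com/a", "https://example.com/b"]:
        # A fresh ua.random per request, instead of the import-time value
        r = requests.get(url, headers={"User-Agent": ua.random})
        print(url, r.status_code)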
@@ -1409,8 +1339,8 @@ Showing download progress and speed when audio-visual files like MP4, MP3, JPG e
 import time
 from contextlib import closing
 
-def audiovisual_downloader(url, path):
-    with closing(requests.get(url, stream=True, headers=my_headers)) as r:
+def audiovisual_downloader(url, path, headers=None):
+    with closing(requests.get(url, stream=True, headers=headers)) as r:
         chunk_size=1024*10
         content_size=int(r.headers['content-length'])
         print('Initiating download...')
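Dropping the my_headers default means a bare call now sends requests' stock User-Agent; callers who relied on the old behaviour should pass headers explicitly (URL and file name below are placeholders):

    from PgsFile import audiovisual_downloader, headers

    # Pass the package's fake-useragent headers; omitting the argument
    # sends requests' default User-Agent.
    audiovisual_downloader("https://example.com/clip.mp4", "clip.mp4", headers=headers)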
@@ -1697,7 +1627,7 @@ def get_data_html_online(url, html=True, timeout=None, headers=None, cookies=Non
         r.encoding="utf-8"
         data=r.text
         html=etree.HTML(data)
-        return html
+        return html, data
     else:
         print(r.status_code, "Can not find the page!")
         return None
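Returning the raw text alongside the parsed tree is a breaking change: existing callers must now unpack two values (or index the tuple). A sketch with a placeholder URL:

    result = get_data_html_online("https://example.com")
    if result is not None:
        tree, raw_html = result          # previously just: tree = get_data_html_online(...)
        print(tree.xpath("//title/text()"))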
@@ -1713,8 +1643,9 @@ def find_table_with_most_rows(tables):
             max_table_index=i
     return max_table_index, max_rows if max_table_index!= -1 else None
 
-def get_data_table(url, output_file, most_rows=True):
+def get_data_table_url(url, output_file, most_rows=True):
     try:
+        # Read every table on the page straight from the URL
         tables=pd.read_html(url)
         if most_rows==False:
             # 1. default: the first table
@@ -1724,6 +1655,26 @@ def get_data_table(url, output_file, most_rows=True):
             target_table=find_table_with_most_rows(tables)[0] # (1, 32)
             df=tables[target_table]
 
+        df.to_excel(output_file, index=False)
+        print(f"Data has been saved to {output_file}")
+    except Exception as err:
+        print(f"Errors found! {err}")
+        return None
+
+def get_data_table_html_string(html_string, output_file, most_rows=True):
+    try:
+        # Wrap the HTML string in a StringIO object
+        from io import StringIO
+        html_io = StringIO(html_string)
+        tables=pd.read_html(html_io)
+        if most_rows==False:
+            # 1. default: the first table
+            df=tables[0]
+        else:
+            # 2. get the table with most rows
+            target_table=find_table_with_most_rows(tables)[0] # (1, 32)
+            df=tables[target_table]
+
         df.to_excel(output_file, index=False)
         print(f"Data has been saved to {output_file}")
     except Exception as err:
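The raw string returned by get_data_html_online feeds straight into get_data_table_html_string; the StringIO wrapper exists because pandas deprecated passing literal HTML to read_html. A combined sketch (URL and output path are placeholders):

    from PgsFile import get_data_html_online, get_data_table_html_string

    tree, raw_html = get_data_html_online("https://example.com/stats")
    # Saves the table with the most rows on the page to Excel
    get_data_table_html_string(raw_html, "stats.xlsx", most_rows=True)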
PgsFile/__init__.py CHANGED
@@ -1,6 +1,7 @@
 # 1. Web scraping
 from .PgsFile import PGScraper
 from .PgsFile import audiovisual_downloader
+from .PgsFile import headers
 
 # 2. Package/library management
 from .PgsFile import install_package, uninstall_package
@@ -9,21 +10,22 @@ from .PgsFile import run_script, run_command
 # 3. Text data retrieval
 from .PgsFile import get_data_text, get_data_lines, get_json_lines, get_tsv_lines
 from .PgsFile import get_data_excel, get_data_json, get_data_tsv, extract_misspelled_words_from_docx
-from .PgsFile import get_data_html_online, get_data_html_offline, get_data_table
+from .PgsFile import get_data_html_online, get_data_html_offline
+from .PgsFile import get_data_table_url, get_data_table_html_string
 
 # 4. Text data storage
-from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, save_dict_to_excel
+from .PgsFile import write_to_txt, write_to_excel, write_to_json, write_to_json_lines, append_dict_to_json, save_dict_to_excel
 
 # 5. File/folder process
 from .PgsFile import FilePath, FileName, DirList
-from .PgsFile import get_subfolder_path, get_package_path
+from .PgsFile import get_subfolder_path
 from .PgsFile import makedirec, makefile
-from .PgsFile import source_path, next_folder_names, corpus_root, get_directory_tree_with_meta, find_txt_files_with_keyword
+from .PgsFile import source_path, next_folder_names, get_directory_tree_with_meta, find_txt_files_with_keyword
 from .PgsFile import remove_empty_folders, remove_empty_txts, remove_empty_lines, remove_empty_last_line, move_file
 
 # 6. Data cleaning
 from .PgsFile import BigPunctuation, StopTags, Special, yhd
-from .PgsFile import ZhStopWords, EnPunctuation, extract_stopwords
+from .PgsFile import ZhStopWords, EnPunctuation
 from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
 from .PgsFile import check_contain_chinese, check_contain_number
 from .PgsFile import replace_chinese_punctuation_with_english
@@ -35,7 +37,7 @@ from .PgsFile import extract_chinese_punctuation, generate_password, sort_string
 from .PgsFile import strQ2B_raw, strQ2B_words
 from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
 from .PgsFile import word_list, batch_word_list
-from .PgsFile import cs, cs1, cs2
+from .PgsFile import cs, cs1, sent_tokenize
 
 # 8. Maths
 from .PgsFile import len_rows, check_empty_cells
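Taken together, the export changes are the visible API break for downstream scripts; a before/after sketch:

    # 0.1.6
    # from PgsFile import get_data_table, cs2, extract_stopwords, get_package_path

    # 0.1.8 (extract_stopwords and get_package_path have no replacement)
    from PgsFile import get_data_table_url, get_data_table_html_string  # split of get_data_table
    from PgsFile import cs, cs1, sent_tokenize                          # cs2 renamed to cs1; old nltk-based cs1 removed
    from PgsFile import append_dict_to_json, headers                    # new exports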
PgsFile-0.1.8.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: PgsFile
-Version: 0.1.6
+Version: 0.1.8
 Summary: This module aims to simplify Python package management, script execution, file handling, web scraping, multimedia download, data cleaning, and word list generation for literary students, making it more accessible and convenient to use.
 Home-page: https://mp.weixin.qq.com/s/F94jyCBOQ3VmiPmSjv6ZAw
 Author: Pan Guisheng
@@ -17,8 +17,10 @@ Requires-Dist: pandas
 Requires-Dist: python-docx
 Requires-Dist: pip
 Requires-Dist: requests
+Requires-Dist: fake-useragent
 Requires-Dist: lxml
 Requires-Dist: pimht
+Requires-Dist: pysbd
 
 Purpose: This module aims to assist Python beginners, particularly instructors and students of foreign languages and literature, by providing a convenient way to manage Python packages, run Python scripts, and perform operations on various file types such as txt, xlsx, json, tsv, html, mhtml, and docx. It also includes functionality for data scraping, cleaning and generating word lists.
 
PgsFile-0.1.8.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
-PgsFile/PgsFile.py,sha256=jmSiczDE5cV47tHpCGDwLn19C90NGQtQ2vEn4ys4NUg,80514
-PgsFile/__init__.py,sha256=EKhIRd2tktjyrvBlBPgQsIJTqU7DdLIobNG8gEiZ--0,2163
+PgsFile/PgsFile.py,sha256=70VLJybFPPbaxPxb5z_vnbEZG9fVgkO7nxcxI77auUI,79354
+PgsFile/__init__.py,sha256=vJqj1rfxK-2UrS4m3nCg0yglqsF_8P-DPAnDgxTdAj8,2227
 PgsFile/Corpora/Idioms/English_Idioms_8774.txt,sha256=qlsP0yI_XGECBRiPZuLkGZpdasc77sWSKexANu7v8_M,175905
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000000.txt,sha256=SLGGSMSb7Ff1RoBstsTW3yX2wNZpqEUchFNpcI-mrR4,1513
 PgsFile/Corpora/Monolingual/Chinese/People's Daily 20130605/Raw/00000001.txt,sha256=imOa6UoCOIZoPXT4_HNHgCUJtd4FTIdk2FZNHNBgJyg,3372
@@ -2618,8 +2618,8 @@ PgsFile/models/slovene.pickle,sha256=faxlAhKzeHs5mWwBvSCEEVST5vbsOQurYfdnUlsIuOo
 PgsFile/models/spanish.pickle,sha256=Jx3GAnxKrgVvcqm_q1ZFz2fhmL9PlyiVhE5A9ZiczcM,597831
 PgsFile/models/swedish.pickle,sha256=QNUOva1sqodxXy4wCxIX7JLELeIFpUPMSlaQO9LJrPo,1034496
 PgsFile/models/turkish.pickle,sha256=065H12UB0CdpiAnRLnUpLJw5KRBIhUM0KAL5Xbl2XMw,1225013
-PgsFile-0.1.6.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
-PgsFile-0.1.6.dist-info/METADATA,sha256=T0mBPq7PnljEcGjLItIJ3RIcZk7veOuy0vVgLuo31lo,4902
-PgsFile-0.1.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-PgsFile-0.1.6.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
-PgsFile-0.1.6.dist-info/RECORD,,
+PgsFile-0.1.8.dist-info/LICENSE,sha256=cE5c-QToSkG1KTUsU8drQXz1vG0EbJWuU4ybHTRb5SE,1138
+PgsFile-0.1.8.dist-info/METADATA,sha256=oSsVQL9ZZBLhk83DN22SrtrSh3r_V8D7ILYfHv1SJbo,4955
+PgsFile-0.1.8.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+PgsFile-0.1.8.dist-info/top_level.txt,sha256=028hCfwhF3UpfD6X0rwtWpXI1RKSTeZ1ALwagWaSmX8,8
+PgsFile-0.1.8.dist-info/RECORD,,