PgsFile 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of PgsFile might be problematic. Click here for more details.

PgsFile/PgsFile.py CHANGED
@@ -3953,6 +3953,43 @@ def extract_keywords_en(target_text, top_n=10):
3953
3953
  # Return top N keywords
3954
3954
  return keyword_scores[:top_n]
3955
3955
 
3956
def extract_keywords_en_be21(target_text, top_n=10):
    """Extract keywords from target text using log-likelihood against the BE21 word list.

    The text is tokenized with ``word_tokenize2``; tokens whose lower-cased
    form is in ``BigPunctuation`` or in the stop-word collection returned by
    ``get_stopwords()`` are discarded. Each remaining lower-cased word is
    scored with ``calculate_log_likelihood`` using its count in the target
    text, its count in the reference frequency table loaded from
    ``BE21.json`` (0 when absent), and the two corpus totals.

    :param target_text: Text to extract keywords from.
    :param top_n: Maximum number of keywords to return (default 10).
    :return: A list of ``(word, count, relative_frequency, log_likelihood)``
        tuples sorted by log-likelihood in descending order and truncated to
        ``top_n``. ``word`` uses the casing of its first occurrence in the
        text; ``relative_frequency`` is ``count`` divided by the number of
        tokens kept after filtering.
    """
    # Reference frequencies (word -> absolute count) shipped with the package.
    my_dic_path = get_library_location("PgsFile") + "/PgsFile/models/dics/BE21.json"  # BE21_wordlist
    reference_freq = get_data_json(my_dic_path)
    total_reference = sum(reference_freq.values())

    # Fetch the stop words once instead of once per token.
    stopwords = get_stopwords()

    # Single pass over the tokens: filter, count, and remember the first
    # original-case spelling seen for every kept lower-cased word.
    target_word_freq = defaultdict(int)
    word_case_mapping = {}
    total_target = 0
    for orig_word in word_tokenize2(target_text):
        lower_word = orig_word.lower()
        if lower_word in BigPunctuation or lower_word in stopwords:
            continue
        total_target += 1
        target_word_freq[lower_word] += 1
        if lower_word not in word_case_mapping:
            word_case_mapping[lower_word] = orig_word

    # Score every distinct kept word. When nothing survives filtering the
    # loop body never runs, so total_target == 0 cannot cause a division.
    keyword_scores = []
    for word, target_count in target_word_freq.items():
        reference_count = reference_freq.get(word, 0)
        ll = calculate_log_likelihood(target_count, reference_count, total_target, total_reference)
        relative_freq = target_count / total_target
        original_word = word_case_mapping.get(word, word)
        keyword_scores.append((original_word, target_count, relative_freq, ll))

    # Highest log-likelihood first; the sort is stable, so ties keep
    # first-occurrence order.
    keyword_scores.sort(key=lambda x: x[3], reverse=True)

    return keyword_scores[:top_n]
3992
+
3956
3993
  def resize_image(input_image_path, output_image_path, max_size_kb):
3957
3994
  '''
3958
3995
  # Example 1: Resizing a JPG image
@@ -4000,4 +4037,30 @@ def resize_image(input_image_path, output_image_path, max_size_kb):
4000
4037
  if size <= max_size_kb:
4001
4038
  print(f"Image resized successfully to {size} KB.")
4002
4039
  else:
4003
- print("Could not reduce the image size below 2MB.")
4040
+ print("Could not reduce the image size below 2MB.")
4041
+
4042
+ import base64
4043
def convert_image_to_url(image_path: str) -> str:
    """
    Convert an image file to a base64 encoded data URL.

    The media type is looked up from the file name with
    ``mimetypes.guess_type`` so that, for example, ``.jpg`` yields the
    registered type ``image/jpeg`` rather than ``image/jpg``. When no image
    type is known for the extension, the lower-cased extension itself is
    used as the subtype (``image/<ext>``).

    :param image_path: Path to the image file.
    :return: A string of the form ``data:<media type>;base64,<payload>``.
    :raises FileNotFoundError: If ``image_path`` is not an existing file.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    import mimetypes

    # Check if the file exists
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"The file {image_path} does not exist.")

    # Open and read the image file in binary mode
    with open(image_path, "rb") as f:
        image_data = f.read()

    # Resolve the media type; only trust the guess when it is an image type,
    # otherwise fall back to the extension-based form.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        file_extension = os.path.splitext(image_path)[1][1:].lower()
        mime_type = f"image/{file_extension}"

    base64_image_data = base64.b64encode(image_data).decode('utf-8')

    # Create the image URL
    return f"data:{mime_type};base64,{base64_image_data}"
4066
+
PgsFile/__init__.py CHANGED
@@ -49,7 +49,7 @@ from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity,
49
49
  from .PgsFile import word_list, batch_word_list
50
50
  from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2
51
51
  from .PgsFile import word_lemmatize, word_POS, word_NER
52
- from .PgsFile import extract_noun_phrases, get_LLMs_prompt, extract_keywords_en
52
+ from .PgsFile import extract_noun_phrases, get_LLMs_prompt, extract_keywords_en, extract_keywords_en_be21
53
53
  from .PgsFile import extract_dependency_relations, extract_dependency_relations_full
54
54
  from .PgsFile import predict_category
55
55
 
@@ -64,5 +64,6 @@ from .PgsFile import timeit
64
64
  from .PgsFile import replace_white_with_transparency
65
65
  from .PgsFile import simhei_default_font_path_MacOS_Windows
66
66
  from .PgsFile import get_font_path, resize_image
67
+ from .PgsFile import convert_image_to_url
67
68
 
68
69
  name = "PgsFile"
PgsFile/models/NLPIR.user CHANGED
Binary file