PgsFile 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic. Click here for more details.
- PgsFile/PgsFile.py +64 -1
- PgsFile/__init__.py +2 -1
- PgsFile/models/NLPIR.user +0 -0
- PgsFile/models/dics/BE21.json +1 -0
- PgsFile/models/prompts/4. OCR prompt.txt +1 -0
- {PgsFile-0.4.1.dist-info → PgsFile-0.4.3.dist-info}/METADATA +6 -6
- {PgsFile-0.4.1.dist-info → PgsFile-0.4.3.dist-info}/RECORD +10 -8
- {PgsFile-0.4.1.dist-info → PgsFile-0.4.3.dist-info}/LICENSE +0 -0
- {PgsFile-0.4.1.dist-info → PgsFile-0.4.3.dist-info}/WHEEL +0 -0
- {PgsFile-0.4.1.dist-info → PgsFile-0.4.3.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
|
@@ -3953,6 +3953,43 @@ def extract_keywords_en(target_text, top_n=10):
|
|
|
3953
3953
|
# Return top N keywords
|
|
3954
3954
|
return keyword_scores[:top_n]
|
|
3955
3955
|
|
|
3956
|
+
def extract_keywords_en_be21(target_text, top_n=10):
    """Extract keywords from target text using log-likelihood with absolute
    reference frequencies taken from the bundled BE21 wordlist.

    :param target_text: English text to extract keywords from.
    :param top_n: Number of top-scoring keywords to return.
    :return: List of ``(original_word, target_count, relative_freq, log_likelihood)``
             tuples sorted by log-likelihood, highest first. Empty list when the
             text contains no countable tokens after filtering.
    """
    # Load the BE21 reference frequency dictionary shipped with the package.
    my_dic_path = get_library_location("PgsFile") + "/PgsFile/models/dics/BE21.json"  # BE21_wordlist
    reference_freq = get_data_json(my_dic_path)

    # Tokenize once and hoist the punctuation/stopword filters into sets:
    # the original called get_stopwords() for every token and did O(n) list
    # membership tests, which made this quadratic in text length.
    original_words = word_tokenize2(target_text)
    punctuation = set(BigPunctuation)
    stopwords = set(get_stopwords())

    # Single pass: count filtered lowercase tokens and remember the first
    # original-case spelling of each distinct word (same first-wins rule as before).
    target_word_freq = defaultdict(int)
    word_case_mapping = {}
    total_target = 0
    for orig_word in original_words:
        lower_word = orig_word.lower()
        if lower_word in punctuation or lower_word in stopwords:
            continue
        total_target += 1
        target_word_freq[lower_word] += 1
        word_case_mapping.setdefault(lower_word, orig_word)

    # Guard: empty or fully-filtered input would otherwise divide by zero below.
    if total_target == 0:
        return []

    # Total token count of the reference corpus.
    total_reference = sum(reference_freq.values())

    # Score every target word against the reference distribution.
    keyword_scores = []
    for word, target_count in target_word_freq.items():
        reference_count = reference_freq.get(word, 0)
        ll = calculate_log_likelihood(target_count, reference_count, total_target, total_reference)
        relative_freq = target_count / total_target
        original_word = word_case_mapping.get(word, word)
        keyword_scores.append((original_word, target_count, relative_freq, ll))

    # Sort keywords by log-likelihood score, highest first.
    keyword_scores.sort(key=lambda x: x[3], reverse=True)

    # Return top N keywords
    return keyword_scores[:top_n]
|
3956
3993
|
def resize_image(input_image_path, output_image_path, max_size_kb):
|
|
3957
3994
|
'''
|
|
3958
3995
|
# Example 1: Resizing a JPG image
|
|
@@ -4000,4 +4037,30 @@ def resize_image(input_image_path, output_image_path, max_size_kb):
|
|
|
4000
4037
|
if size <= max_size_kb:
|
|
4001
4038
|
print(f"Image resized successfully to {size} KB.")
|
|
4002
4039
|
else:
|
|
4003
|
-
print("Could not reduce the image size below 2MB.")
|
|
4040
|
+
print("Could not reduce the image size below 2MB.")
|
|
4041
|
+
|
|
4042
|
+
import base64

def convert_image_to_url(image_path: str) -> str:
    """
    Convert an image file to a base64-encoded data URL.

    :param image_path: Path to the image file.
    :return: A ``data:image/<subtype>;base64,<payload>`` string embedding the image.
    :raises FileNotFoundError: If ``image_path`` does not point to an existing file.
    """
    # Check if the file exists
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"The file {image_path} does not exist.")

    # Open and read the image file in binary mode
    with open(image_path, "rb") as f:
        image_data = f.read()

    # Derive the MIME subtype from the extension. Normalise case and map the
    # common "jpg" extension to its registered MIME subtype "jpeg" — the
    # original emitted the non-standard "image/jpg" (and e.g. "image/PNG").
    file_extension = os.path.splitext(image_path)[1][1:].lower()
    if file_extension == "jpg":
        file_extension = "jpeg"
    base64_image_data = base64.b64encode(image_data).decode('utf-8')

    # Create the image URL
    image_url = f"data:image/{file_extension};base64,{base64_image_data}"

    return image_url
|
|
4066
|
+
|
PgsFile/__init__.py
CHANGED
|
@@ -49,7 +49,7 @@ from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity,
|
|
|
49
49
|
from .PgsFile import word_list, batch_word_list
|
|
50
50
|
from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2
|
|
51
51
|
from .PgsFile import word_lemmatize, word_POS, word_NER
|
|
52
|
-
from .PgsFile import extract_noun_phrases, get_LLMs_prompt, extract_keywords_en
|
|
52
|
+
from .PgsFile import extract_noun_phrases, get_LLMs_prompt, extract_keywords_en, extract_keywords_en_be21
|
|
53
53
|
from .PgsFile import extract_dependency_relations, extract_dependency_relations_full
|
|
54
54
|
from .PgsFile import predict_category
|
|
55
55
|
|
|
@@ -64,5 +64,6 @@ from .PgsFile import timeit
|
|
|
64
64
|
from .PgsFile import replace_white_with_transparency
|
|
65
65
|
from .PgsFile import simhei_default_font_path_MacOS_Windows
|
|
66
66
|
from .PgsFile import get_font_path, resize_image
|
|
67
|
+
from .PgsFile import convert_image_to_url
|
|
67
68
|
|
|
68
69
|
name = "PgsFile"
|
PgsFile/models/NLPIR.user
CHANGED
|
Binary file
|