PgsFile 0.2.7-py3-none-any.whl → 0.2.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- PgsFile/PgsFile.py +241 -0
- PgsFile/__init__.py +6 -2
- PgsFile/models/dics/BNC_COCA_lists.xlsx +0 -0
- PgsFile/models/dics/CET-4 2016 (6314).txt +6314 -0
- PgsFile/models/dics/CET-6 2016 (1726).txt +1726 -0
- PgsFile/models/prompts/1. MIP prompt.txt +34 -0
- PgsFile/models/prompts/2. WSD prompt.txt +65 -0
- PgsFile-0.2.8.dist-info/METADATA +41 -0
- {PgsFile-0.2.7.dist-info → PgsFile-0.2.8.dist-info}/RECORD +12 -7
- PgsFile-0.2.7.dist-info/METADATA +0 -41
- {PgsFile-0.2.7.dist-info → PgsFile-0.2.8.dist-info}/LICENSE +0 -0
- {PgsFile-0.2.7.dist-info → PgsFile-0.2.8.dist-info}/WHEEL +0 -0
- {PgsFile-0.2.7.dist-info → PgsFile-0.2.8.dist-info}/top_level.txt +0 -0
PgsFile/PgsFile.py
CHANGED
@@ -2331,3 +2331,244 @@ def delete_permanent_environment_variable(variable_name, system_wide=False):
     except Exception as e:
         print(f'An error occurred: {e}')
 
+
+def calculate_mean_dependency_distance(spacy_doc):
+    """
+    Calculate the mean dependency distance for tokens in a spaCy Doc object.
+
+    The dependency distance is the absolute difference in positions between a token
+    and its syntactic head. This function computes the average of these distances
+    for all tokens in the Doc object, excluding punctuation and the root token.
+
+    Parameters:
+        spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+    Returns:
+        float: The mean dependency distance. Returns 0 if there are no valid tokens to analyze.
+    """
+
+    doc=spacy_doc
+    total_distance = 0
+    count = 0
+
+    for token in doc:
+        if token.dep_ not in ("punct", "ROOT"):
+            distance = abs(list(doc).index(token.head) - list(doc).index(token))
+            total_distance += distance
+            count += 1
+
+    if count == 0:
+        return 0
+
+    mean_distance = total_distance / count
+    return mean_distance
+
+def word_lemmatize(spacy_doc):
+    """
+    Lemmatize the words in a spaCy Doc object and return the lemmatized text.
+
+    This function processes each token in the Doc object, replacing it with its lemma
+    unless the lemma is '-PRON-', in which case the original text of the token is used.
+    The resulting lemmatized words are joined into a single string.
+
+    Parameters:
+        spacy_doc (spacy.tokens.Doc): The spaCy Doc object to lemmatize.
+
+    Returns:
+        str: The lemmatized text as a single string.
+    """
+    doc = spacy_doc
+    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
+    return text
+
+def word_NER(spacy_doc):
+    """
+    Extract Named Entities from a spaCy Doc object.
+
+    This function processes the Doc object to identify and extract named entities,
+    returning a list of tuples where each tuple contains the entity text and its label.
+
+    Parameters:
+        spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+    Returns:
+        list of tuples: A list of tuples where each tuple contains the entity text and its label.
+    """
+    doc = spacy_doc
+    entities = [(ent.text, ent.label_) for ent in doc.ents]
+    return entities
+
+def word_POS(spacy_doc):
+    """
+    Extract Part-Of-Speech (POS) tags from a spaCy Doc object.
+
+    This function processes the Doc object to identify and extract the POS tags for each token,
+    returning a list of tuples where each tuple contains the token text and its corresponding POS tag.
+
+    Parameters:
+        spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+    Returns:
+        list of tuples: A list of tuples where each tuple contains the token text and its POS tag.
+    """
+    doc = spacy_doc
+    pos_tags = [(token.text, token.pos_) for token in doc]
+    return pos_tags
+
+def extract_noun_phrases(spacy_doc):
+    """
+    Extract noun phrases from a spaCy Doc object.
+
+    This function processes the Doc object to identify and extract noun phrases,
+    returning a list of strings where each string is a noun phrase.
+
+    Parameters:
+        spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+    Returns:
+        list of str: A list of noun phrases extracted from the Doc object.
+    """
+    doc = spacy_doc
+    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
+    return noun_phrases
+
+def extract_dependency_relations(spacy_doc):
+    """
+    Extract the dependency relations for each word in a spaCy Doc object.
+
+    This function processes the Doc object to identify and extract the dependency relations
+    for each token, returning a list of tuples where each tuple contains the token text,
+    its dependency relation, and the text of its syntactic head.
+
+    Parameters:
+        spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+    Returns:
+        list of tuples: A list of tuples where each tuple contains the token text,
+        its dependency relation, and the text of its syntactic head.
+    """
+    doc = spacy_doc
+    dependency_relations = [(token.text, token.dep_, token.head.text) for token in doc]
+    return dependency_relations
+
+def extract_dependency_relations_full(spacy_doc):
+    """
+    Extract comprehensive dependency relations for each word in a spaCy Doc object.
+
+    This function processes the Doc object to identify and extract detailed dependency relations
+    for each token. It returns a list of tuples where each tuple contains the token text,
+    its lemmatized form, its part-of-speech (POS) tag, its dependency relation, the text of its
+    syntactic head, and a list of its child tokens.
+
+    Parameters:
+        spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+    Returns:
+        list of tuples: A list of tuples where each tuple contains:
+            - The token text
+            - The lemmatized form of the token
+            - The POS tag of the token
+            - The dependency relation of the token
+            - The text of the token's syntactic head
+            - A list of the text of the token's child tokens
+    """
+    doc = spacy_doc
+    dependency_relations = [(token.text, token.lemma_, token.pos_, token.dep_, token.head.text, [child.text for child in token.children]) for token in doc]
+    return dependency_relations
+
+usua_tag_set = {
+    "A": "General & Abstract Terms",
+    "B": "The Body & the Individual",
+    "C": "Arts & Crafts",
+    "E": "Emotional Actions, States & Processes",
+    "F": "Food & Farming",
+    "G": "Government & the Public Domain",
+    "H": "Architecture, Building, Houses & the Home",
+    "I": "Money & Commerce",
+    "K": "Entertainment, Sports & Games",
+    "L": "Life & Living Things",
+    "M": "Movement, Location, Travel & Transport",
+    "N": "Numbers & Measurement",
+    "O": "Substances, Materials, Objects & Equipment",
+    "P": "Education",
+    "Q": "Linguistic Actions, States & Processes",
+    "S": "Social Actions, States & Processes",
+    "T": "Time",
+    "W": "The World & Our Environment",
+    "X": "Psychological Actions, States & Processes",
+    "Y": "Science & Technology",
+    "Z": "Names & Grammatical Words"
+}
+
+
+def get_CET_dics(name=None):
+    '''
+    Parameters
+    ----------
+    name : TYPE, string: like 'CET-4', 'CET-6', etc.
+        DESCRIPTION. The default is None.
+
+    Returns
+    -------
+    TYPE, list: like ['a', 'an', 'abandon', 'able', 'ability', 'aboard', 'abolish', 'abolition', 'about', 'above', 'abroad', 'absent', 'absence', 'absolute', 'absorb']
+        DESCRIPTION. The default will return a list of English CET (China's College English Test band 4 & 6) words.
+    '''
+
+    dic_path=get_library_location("PgsFile")+"/PgsFile/models/dics"
+    if name is None:
+        cet_words=get_data_lines(find_txt_files_with_keyword(dic_path, "cet-4")[0])
+        return cet_words
+    else:
+        cet_words=get_data_lines(find_txt_files_with_keyword(dic_path, name)[0])
+        return cet_words
+
+def get_BNC_dic():
+    '''
+    Returns
+    -------
+    TYPE, pandas dataframe:
+              List  ...  Total frequency
+        0       1k  ...          2525253
+        1       1k  ...            47760
+        2       1k  ...           192168
+        3       1k  ...            25370
+        4       1k  ...             9284
+               ...  ...              ...
+        24997  25k  ...                0
+        24998  25k  ...                0
+        24999  25k  ...                9
+        25000  25k  ...                4
+        25001  25k  ...                9
+
+        [25002 rows x 4 columns]
+        DESCRIPTION. The default will return a dataframe of the most commonly used English word list based on the BNC-COCA corpus.
+    '''
+    import pandas as pd
+    inter=get_library_location("PgsFile")+"/PgsFile/models/dics"
+    dic_path=get_full_path(inter, "BNC_COCA_lists.xlsx")
+    print(dic_path)
+    df=pd.read_excel(dic_path)
+    return df
+
+def get_LLMs_prompt(task=None):
+    '''
+    Parameters
+    ----------
+    task : TYPE, string: like 'MIP', 'WSD', etc.
+        DESCRIPTION. The default is None.
+
+    Returns
+    -------
+    TYPE, string: like LLM Prompt for Metaphor Analysis Task:
+        Identify all metaphorical expressions in the provided text using MIP (Metaphor Identification Procedure).
+        Categorize each metaphor into one of the following CDA sub-types (with brief justification).
+        DESCRIPTION. The default will return a text prompt for a specific LLM task.
+    '''
+
+    dic_path=get_library_location("PgsFile")+"/PgsFile/models/prompts"
+    if task is None:
+        user_prompt=get_data_text(find_txt_files_with_keyword(dic_path, "mip")[0])
+        return user_prompt
+    else:
+        user_prompt=get_data_text(find_txt_files_with_keyword(dic_path, task)[0])
+        return user_prompt
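
For orientation, here is a minimal usage sketch of the new spaCy-based helpers added above. It assumes spaCy is installed and that an English pipeline such as en_core_web_sm is available; neither dependency is declared in this diff, and the sample sentence is only illustrative.

    import spacy
    from PgsFile import (word_lemmatize, word_POS, word_NER, extract_noun_phrases,
                         extract_dependency_relations, calculate_mean_dependency_distance)

    nlp = spacy.load("en_core_web_sm")  # assumed model; any English pipeline with a parser works
    doc = nlp("Apple is looking at buying a U.K. startup for $1 billion.")

    print(word_lemmatize(doc))                      # lemmatized text as a single string
    print(word_POS(doc))                            # [(token text, POS tag), ...]
    print(word_NER(doc))                            # [(entity text, entity label), ...]
    print(extract_noun_phrases(doc))                # noun chunks as plain strings
    print(extract_dependency_relations(doc))        # [(token, dependency relation, head), ...]
    print(calculate_mean_dependency_distance(doc))  # mean |position(head) - position(token)|

Note that the '-PRON-' check in word_lemmatize only has an effect with spaCy v2 pipelines, whose lemmatizer emitted that placeholder for pronouns; v3 lemmatizers simply return the pronoun's own lemma.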
PgsFile/__init__.py
CHANGED
@@ -32,8 +32,8 @@ from .PgsFile import get_env_variable, get_all_env_variables
 
 # 6. Data cleaning
 from .PgsFile import BigPunctuation, StopTags, Special, yhd
-from .PgsFile import ZhStopWords, EnPunctuation, get_stopwords
-from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
+from .PgsFile import ZhStopWords, EnPunctuation, get_stopwords, get_CET_dics, get_BNC_dic
+from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words, usua_tag_set
 from .PgsFile import check_contain_chinese, check_contain_number
 from .PgsFile import replace_chinese_punctuation_with_english
 from .PgsFile import replace_english_punctuation_with_chinese
@@ -45,11 +45,15 @@ from .PgsFile import strQ2B_raw, strQ2B_words
 from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
 from .PgsFile import word_list, batch_word_list
 from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2
+from .PgsFile import word_lemmatize, word_POS, word_NER
+from .PgsFile import extract_noun_phrases, get_LLMs_prompt
+from .PgsFile import extract_dependency_relations, extract_dependency_relations_full
 
 # 8. Maths
 from .PgsFile import len_rows, check_empty_cells
 from .PgsFile import format_float, decimal_to_percent, Percentage
 from .PgsFile import get_text_length_kb, extract_numbers
+from .PgsFile import calculate_mean_dependency_distance
 
 # 9. Visualization
 from .PgsFile import replace_white_with_transparency
PgsFile/models/dics/BNC_COCA_lists.xlsx
Binary file (no text diff shown)
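
With the re-exports above, the bundled resources can be used from the package top level. Below is a hedged sketch, assuming the data files under PgsFile/models/ ship with the wheel and that pandas plus an Excel engine such as openpyxl are installed for the BNC/COCA spreadsheet:

    from PgsFile import get_CET_dics, get_BNC_dic, get_LLMs_prompt, usua_tag_set

    cet4 = get_CET_dics()             # default: the bundled CET-4 word list
    cet6 = get_CET_dics("CET-6")      # keyword is matched against the .txt file names in models/dics
    print(len(cet4), cet4[:5])

    bnc_df = get_BNC_dic()            # pandas DataFrame of the BNC/COCA frequency lists
    print(bnc_df.head())              # get_BNC_dic also prints the resolved .xlsx path before loading

    mip_prompt = get_LLMs_prompt()        # default: the MIP metaphor-identification prompt
    wsd_prompt = get_LLMs_prompt("WSD")   # keyword lookup in models/prompts
    print(mip_prompt[:100])

    print(usua_tag_set["Q"])          # 'Linguistic Actions, States & Processes'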