PgsFile 0.2.7__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of PgsFile might be problematic.

PgsFile/PgsFile.py CHANGED
@@ -2331,3 +2331,244 @@ def delete_permanent_environment_variable(variable_name, system_wide=False):
      except Exception as e:
          print(f'An error occurred: {e}')
 
+
+ def calculate_mean_dependency_distance(spacy_doc):
+     """
+     Calculate the mean dependency distance for tokens in a spaCy Doc object.
+
+     The dependency distance is the absolute difference in positions between a token
+     and its syntactic head. This function computes the average of these distances
+     for all tokens in the Doc object, excluding punctuation and the root token.
+
+     Parameters:
+         spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+     Returns:
+         float: The mean dependency distance. Returns 0 if there are no valid tokens to analyze.
+     """
+
+     doc = spacy_doc
+     total_distance = 0
+     count = 0
+
+     for token in doc:
+         if token.dep_ not in ("punct", "ROOT"):
+             distance = abs(list(doc).index(token.head) - list(doc).index(token))
+             total_distance += distance
+             count += 1
+
+     if count == 0:
+         return 0
+
+     mean_distance = total_distance / count
+     return mean_distance
+
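A minimal usage sketch for the new mean-dependency-distance helper, assuming PgsFile 0.2.8 is installed alongside spaCy; the `en_core_web_sm` model name and the printed value are illustrative assumptions, not guaranteed output:

```python
# Sketch only: assumes PgsFile 0.2.8, spaCy, and an English model such as en_core_web_sm.
import spacy
from PgsFile import calculate_mean_dependency_distance

nlp = spacy.load("en_core_web_sm")  # model choice is an assumption
doc = nlp("The quick brown fox jumps over the lazy dog.")

# Punctuation and the root token are excluded, so the value reflects only dependent tokens.
print(calculate_mean_dependency_distance(doc))  # a float, e.g. 1.75 with a typical parse
```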
+ def word_lemmatize(spacy_doc):
+     """
+     Lemmatize the words in a spaCy Doc object and return the lemmatized text.
+
+     This function processes each token in the Doc object, replacing it with its lemma
+     unless the lemma is '-PRON-', in which case the original text of the token is used.
+     The resulting lemmatized words are joined into a single string.
+
+     Parameters:
+         spacy_doc (spacy.tokens.Doc): The spaCy Doc object to lemmatize.
+
+     Returns:
+         str: The lemmatized text as a single string.
+     """
+     doc = spacy_doc
+     text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
+     return text
+
+ def word_NER(spacy_doc):
+     """
+     Extract Named Entities from a spaCy Doc object.
+
+     This function processes the Doc object to identify and extract named entities,
+     returning a list of tuples where each tuple contains the entity text and its label.
+
+     Parameters:
+         spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+     Returns:
+         list of tuples: A list of tuples where each tuple contains the entity text and its label.
+     """
+     doc = spacy_doc
+     entities = [(ent.text, ent.label_) for ent in doc.ents]
+     return entities
+
+ def word_POS(spacy_doc):
+     """
+     Extract Part-Of-Speech (POS) tags from a spaCy Doc object.
+
+     This function processes the Doc object to identify and extract the POS tags for each token,
+     returning a list of tuples where each tuple contains the token text and its corresponding POS tag.
+
+     Parameters:
+         spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+     Returns:
+         list of tuples: A list of tuples where each tuple contains the token text and its POS tag.
+     """
+     doc = spacy_doc
+     pos_tags = [(token.text, token.pos_) for token in doc]
+     return pos_tags
+
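A quick sketch of the three token-level helpers above, again assuming spaCy plus an English model; the labels shown in the comments are typical model output, not guaranteed:

```python
# Sketch only: outputs depend on the loaded spaCy model.
import spacy
from PgsFile import word_lemmatize, word_POS, word_NER

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup for $1 billion.")

print(word_lemmatize(doc))  # space-joined lemmas, e.g. "Apple be look at buy a U.K. startup ..."
print(word_POS(doc))        # e.g. [('Apple', 'PROPN'), ('is', 'AUX'), ('looking', 'VERB'), ...]
print(word_NER(doc))        # e.g. [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]
```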
+ def extract_noun_phrases(spacy_doc):
+     """
+     Extract noun phrases from a spaCy Doc object.
+
+     This function processes the Doc object to identify and extract noun phrases,
+     returning a list of strings where each string is a noun phrase.
+
+     Parameters:
+         spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+     Returns:
+         list of str: A list of noun phrases extracted from the Doc object.
+     """
+     doc = spacy_doc
+     noun_phrases = [chunk.text for chunk in doc.noun_chunks]
+     return noun_phrases
+
+ def extract_dependency_relations(spacy_doc):
+     """
+     Extract the dependency relations for each word in a spaCy Doc object.
+
+     This function processes the Doc object to identify and extract the dependency relations
+     for each token, returning a list of tuples where each tuple contains the token text,
+     its dependency relation, and the text of its syntactic head.
+
+     Parameters:
+         spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+     Returns:
+         list of tuples: A list of tuples where each tuple contains the token text,
+         its dependency relation, and the text of its syntactic head.
+     """
+     doc = spacy_doc
+     dependency_relations = [(token.text, token.dep_, token.head.text) for token in doc]
+     return dependency_relations
+
+ def extract_dependency_relations_full(spacy_doc):
+     """
+     Extract comprehensive dependency relations for each word in a spaCy Doc object.
+
+     This function processes the Doc object to identify and extract detailed dependency relations
+     for each token. It returns a list of tuples where each tuple contains the token text,
+     its lemmatized form, its part-of-speech (POS) tag, its dependency relation, the text of its
+     syntactic head, and a list of its child tokens.
+
+     Parameters:
+         spacy_doc (spacy.tokens.Doc): The spaCy Doc object to analyze.
+
+     Returns:
+         list of tuples: A list of tuples where each tuple contains:
+             - The token text
+             - The lemmatized form of the token
+             - The POS tag of the token
+             - The dependency relation of the token
+             - The text of the token's syntactic head
+             - A list of the text of the token's child tokens
+     """
+     doc = spacy_doc
+     dependency_relations = [(token.text, token.lemma_, token.pos_, token.dep_, token.head.text, [child.text for child in token.children]) for token in doc]
+     return dependency_relations
+
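The phrase- and dependency-level helpers can be exercised the same way; a hedged sketch, where the noun chunks and relations in the comments are typical parses rather than guaranteed output:

```python
# Sketch only: assumes the same spaCy setup as the earlier examples.
import spacy
from PgsFile import (extract_noun_phrases, extract_dependency_relations,
                     extract_dependency_relations_full)

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

print(extract_noun_phrases(doc))
# e.g. ['Autonomous cars', 'insurance liability', 'manufacturers']

print(extract_dependency_relations(doc)[:3])
# e.g. [('Autonomous', 'amod', 'cars'), ('cars', 'nsubj', 'shift'), ('shift', 'ROOT', 'shift')]

# The *_full variant adds the lemma, coarse POS tag, and child tokens for each token.
for text, lemma, pos, dep, head, children in extract_dependency_relations_full(doc):
    print(text, lemma, pos, dep, head, children)
```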
+ usua_tag_set = {
+     "A": "General & Abstract Terms",
+     "B": "The Body & the Individual",
+     "C": "Arts & Crafts",
+     "E": "Emotional Actions, States & Processes",
+     "F": "Food & Farming",
+     "G": "Government & the Public Domain",
+     "H": "Architecture, Building, Houses & the Home",
+     "I": "Money & Commerce",
+     "K": "Entertainment, Sports & Games",
+     "L": "Life & Living Things",
+     "M": "Movement, Location, Travel & Transport",
+     "N": "Numbers & Measurement",
+     "O": "Substances, Materials, Objects & Equipment",
+     "P": "Education",
+     "Q": "Linguistic Actions, States & Processes",
+     "S": "Social Actions, States & Processes",
+     "T": "Time",
+     "W": "The World & Our Environment",
+     "X": "Psychological Actions, States & Processes",
+     "Y": "Science & Technology",
+     "Z": "Names & Grammatical Words"
+ }
+
+
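The `usua_tag_set` dictionary appears to follow the top-level USAS semantic fields, keyed by their letter prefix; a small illustrative lookup (the tag string below is hypothetical, not something PgsFile itself produces):

```python
# Sketch only: "T1.3" is a hypothetical USAS-style tag used for illustration.
from PgsFile import usua_tag_set

tag = "T1.3"
print(usua_tag_set[tag[0]])  # -> 'Time'
```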
+ def get_CET_dics(name=None):
+     '''
+     Parameters
+     ----------
+     name : TYPE, string: like 'CET-4', 'CET-6', etc.
+         DESCRIPTION. The default is None.
+
+     Returns
+     -------
+     TYPE, list: like ['a', 'an', 'abandon', 'able', 'ability', 'aboard', 'abolish', 'abolition', 'about', 'above', 'abroad', 'absent', 'absence', 'absolute', 'absorb']
+         DESCRIPTION. The default will return a list of English CET (China's College English Test band 4 & 6) words.
+     '''
+
+     dic_path=get_library_location("PgsFile")+"/PgsFile/models/dics"
+     if name is None:
+         cet_words=get_data_lines(find_txt_files_with_keyword(dic_path, "cet-4")[0])
+         return cet_words
+     else:
+         cet_words=get_data_lines(find_txt_files_with_keyword(dic_path, name)[0])
+         return cet_words
+
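A hedged sketch of `get_CET_dics`: it looks up bundled .txt word lists under the package's models/dics folder, so the accepted keywords and list contents depend on what ships with the wheel:

```python
# Sketch only: assumes the CET word lists are bundled with PgsFile 0.2.8.
from PgsFile import get_CET_dics

cet4 = get_CET_dics()           # default: the CET-4 list
cet6 = get_CET_dics("CET-6")    # any keyword matching a bundled .txt file name
print(len(cet4), cet4[:5])      # e.g. ['a', 'an', 'abandon', 'able', 'ability']
```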
+ def get_BNC_dic():
+     '''
+     Returns
+     -------
+     TYPE, pandas dataframe:
+                  List  ...  Total frequency
+         0          1k  ...          2525253
+         1          1k  ...            47760
+         2          1k  ...           192168
+         3          1k  ...            25370
+         4          1k  ...             9284
+         ...       ...  ...              ...
+         24997     25k  ...                0
+         24998     25k  ...                0
+         24999     25k  ...                9
+         25000     25k  ...                4
+         25001     25k  ...                9
+
+         [25002 rows x 4 columns]
+         DESCRIPTION. The default will return a dataframe of the most commonly used English word list based on the BNC-COCA corpus.
+     '''
+     import pandas as pd
+     inter=get_library_location("PgsFile")+"/PgsFile/models/dics"
+     dic_path=get_full_path(inter, "BNC_COCA_lists.xlsx")
+     print(dic_path)
+     df=pd.read_excel(dic_path)
+     return df
+
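`get_BNC_dic` loads the bundled BNC_COCA_lists.xlsx into a pandas DataFrame (and prints the resolved file path along the way); a minimal sketch:

```python
# Sketch only: assumes pandas plus an Excel reader engine (e.g. openpyxl) are installed.
from PgsFile import get_BNC_dic

df = get_BNC_dic()   # prints the path to BNC_COCA_lists.xlsx, then returns the DataFrame
print(df.shape)      # (25002, 4) according to the docstring preview
print(df.head(3))    # frequency bands such as '1k' with their total frequencies
```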
+ def get_LLMs_prompt(task=None):
+     '''
+     Parameters
+     ----------
+     task : TYPE, string: like 'MIP', 'WSD', etc.
+         DESCRIPTION. The default is None.
+
+     Returns
+     -------
+     TYPE, string: like LLM Prompt for Metaphor Analysis Task:
+         Identify all metaphorical expressions in the provided text using MIP (Metaphor Identification Procedure).
+         Categorize each metaphor into one of the following CDA sub-types (with brief justification).
+         DESCRIPTION. The default will return a text prompt for a specific LLM task.
+     '''
+
+     dic_path=get_library_location("PgsFile")+"/PgsFile/models/prompts"
+     if task is None:
+         user_prompt=get_data_text(find_txt_files_with_keyword(dic_path, "mip")[0])
+         return user_prompt
+     else:
+         user_prompt=get_data_text(find_txt_files_with_keyword(dic_path, task)[0])
+         return user_prompt
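And a sketch of `get_LLMs_prompt`: the available task keywords depend on the prompt files bundled under models/prompts (the docstring mentions 'MIP' and 'WSD'):

```python
# Sketch only: 'WSD' is taken from the docstring; other keywords depend on the bundled files.
from PgsFile import get_LLMs_prompt

mip_prompt = get_LLMs_prompt()        # default: the MIP metaphor-identification prompt
wsd_prompt = get_LLMs_prompt("WSD")   # word-sense-disambiguation prompt, per the docstring
print(mip_prompt[:300])
```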
PgsFile/__init__.py CHANGED
@@ -32,8 +32,8 @@ from .PgsFile import get_env_variable, get_all_env_variables
 
  # 6. Data cleaning
  from .PgsFile import BigPunctuation, StopTags, Special, yhd
- from .PgsFile import ZhStopWords, EnPunctuation, get_stopwords
- from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words
+ from .PgsFile import ZhStopWords, EnPunctuation, get_stopwords, get_CET_dics, get_BNC_dic
+ from .PgsFile import nltk_en_tags, nltk_tag_mapping, thulac_tags, ICTCLAS2008, LangCodes, pgs_abbres_words, usua_tag_set
  from .PgsFile import check_contain_chinese, check_contain_number
  from .PgsFile import replace_chinese_punctuation_with_english
  from .PgsFile import replace_english_punctuation_with_chinese
@@ -45,11 +45,15 @@ from .PgsFile import strQ2B_raw, strQ2B_words
  from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity
  from .PgsFile import word_list, batch_word_list
  from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2
+ from .PgsFile import word_lemmatize, word_POS, word_NER
+ from .PgsFile import extract_noun_phrases, get_LLMs_prompt
+ from .PgsFile import extract_dependency_relations, extract_dependency_relations_full
 
  # 8. Maths
  from .PgsFile import len_rows, check_empty_cells
  from .PgsFile import format_float, decimal_to_percent, Percentage
  from .PgsFile import get_text_length_kb, extract_numbers
+ from .PgsFile import calculate_mean_dependency_distance
 
  # 9. Visualization
  from .PgsFile import replace_white_with_transparency
Binary file
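Taken together, the __init__.py changes re-export the new helpers at package level, so after upgrading to 0.2.8 they should be importable directly (a quick, hedged check):

```python
# Sketch only: verifies the names exported by the 0.2.8 __init__.py shown above.
from PgsFile import (
    calculate_mean_dependency_distance,
    word_lemmatize, word_POS, word_NER,
    extract_noun_phrases,
    extract_dependency_relations, extract_dependency_relations_full,
    get_CET_dics, get_BNC_dic, get_LLMs_prompt, usua_tag_set,
)
```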