SinaTools 0.1.35__py2.py3-none-any.whl → 0.1.36__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: SinaTools
- Version: 0.1.35
+ Version: 0.1.36
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
  Home-page: https://github.com/SinaLab/sinatools
  License: MIT license
@@ -1,10 +1,10 @@
- SinaTools-0.1.35.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
- sinatools/VERSION,sha256=cVbVTfIguj1zWCurwk_MTvuyWUDhNgp0IfcGYvhdzcY,6
+ SinaTools-0.1.36.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+ sinatools/VERSION,sha256=4WO9ZLWQOVGEf7BUbcCdCnR4_2Fp3iJiMmtiLd4Vzo8,6
  sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
  sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
  sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
  sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
- sinatools/CLI/DataDownload/download_files.py,sha256=u_DFXbHcIU_4Ub5Y0cL9_p1hL8h6LLWPemn9Al-XFgc,2603
+ sinatools/CLI/DataDownload/download_files.py,sha256=EezvbukR3pZ8s6mGZnzTcjsbo3CBDlC0g6KhJWlYp1w,2686
  sinatools/CLI/morphology/ALMA_multi_word.py,sha256=rmpa72twwIJHme_kpQ1lu3_7y_Jorj70QTvOnQMJRuI,1274
  sinatools/CLI/morphology/morph_analyzer.py,sha256=HPamEKos_JRYCJv_2q6c12N--da58_JXTno9haww5Ao,3497
  sinatools/CLI/ner/corpus_entity_extractor.py,sha256=DdvigsDQzko5nJBjzUXlIDqoBMBTVzktjSo7JfEXTIA,4778
@@ -77,13 +77,11 @@ sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma
  sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
  sinatools/morphology/morph_analyzer.py,sha256=JOH2UWKNQWo5UzpWNzP9R1D3B3qLSogIiMp8n0N_56o,7177
  sinatools/ner/__init__.py,sha256=59kLMX6UQhF6JpE10RhaDYC3a2_jiWOIVPuejsoflFE,1050
- sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
  sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
  sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
  sinatools/ner/entity_extractor.py,sha256=O2epRwRFUUcQs3SnFIYHVBI4zVhr8hRcj0XJYeby4ts,3588
  sinatools/ner/helpers.py,sha256=dnOoDY5JMyOLTUWVIZLMt8mBn2IbWlVaqHhQyjs1voo,2343
  sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
- sinatools/ner/relation_extractor.py,sha256=a85xGX6V72fDpJk0GKmmtlWf8S8ezY-2pm5oGc9_ESY,9750
  sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
  sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
  sinatools/ner/data/datasets.py,sha256=lcdDDenFMEKIGYQmfww2dk_9WKWrJO9HtKptaAEsRmY,5064
@@ -104,8 +102,6 @@ sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO839
  sinatools/synonyms/synonyms_generator.py,sha256=jRd0D3_kn-jYBaZzqY-7oOy0SFjSJ-mjM7JhsySzX58,9037
  sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
- sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7A,27804
- sinatools/utils/jaccard.py,sha256=kLIptPNB2VIqnemVve9auyOL1kXHIsCkKCEwxFM8yP4,10114
  sinatools/utils/parser.py,sha256=qvHdln5R5CAv_0UOJWe0mcp8JCsGqgazoeIIkoALH88,6259
  sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
  sinatools/utils/similarity.py,sha256=CgKOJpRAU5UaSjOg-sdZcACCNl9tuKDRwdFAKATCL_w,10762
@@ -115,13 +111,13 @@ sinatools/utils/tokenizer.py,sha256=nyk6lh5-p38wrU62hvh4wg7ni9ammkdqqIgcjbbBxxo,
  sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
  sinatools/utils/word_compare.py,sha256=rS2Z74sf7R-7MTXyrFj5miRi2TnSG9OdTDp_qQYuo2Y,28200
  sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
- sinatools/wsd/disambiguator.py,sha256=9ottQn_WwOFX5Trr0Rpg66-Jpaln5yJduFqP6cdOOBA,22616
+ sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
  sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
  sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
- SinaTools-0.1.35.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
- SinaTools-0.1.35.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
- SinaTools-0.1.35.dist-info/METADATA,sha256=N1gUEgccLIIpfCHthFpI-2HU01LogkZWo1C-1qANx5M,3267
- SinaTools-0.1.35.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
- SinaTools-0.1.35.dist-info/entry_points.txt,sha256=-YGM-r0_UtNPnI0C4UcK1ptrpwFZpUhxdy2qHkehNCo,1303
- SinaTools-0.1.35.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
- SinaTools-0.1.35.dist-info/RECORD,,
+ SinaTools-0.1.36.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+ SinaTools-0.1.36.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+ SinaTools-0.1.36.dist-info/METADATA,sha256=vukmjuNbUETy8EMIkA64uOOwAS5WO5WuWOOMeBoR6ps,3267
+ SinaTools-0.1.36.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+ SinaTools-0.1.36.dist-info/entry_points.txt,sha256=-YGM-r0_UtNPnI0C4UcK1ptrpwFZpUhxdy2qHkehNCo,1303
+ SinaTools-0.1.36.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+ SinaTools-0.1.36.dist-info/RECORD,,
sinatools/CLI/DataDownload/download_files.py CHANGED
@@ -52,16 +52,17 @@ def main():
  for file in args.files:
  print("file: ", file)
  if file == "wsd":
- #download_file(urls["morph"])
- #download_file(urls["ner"])
+ download_file(urls["morph"])
+ download_file(urls["ner"])
  #download_file(urls["wsd_model"])
- download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
  #download_file(urls["wsd_tokenizer"])
- #download_file(urls["one_gram"])
- #download_file(urls["five_grams"])
- #download_file(urls["four_grams"])
- #download_file(urls["three_grams"])
- #download_file(urls["two_grams"])
+ download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
+ download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02")
+ download_file(urls["one_gram"])
+ download_file(urls["five_grams"])
+ download_file(urls["four_grams"])
+ download_file(urls["three_grams"])
+ download_file(urls["two_grams"])
  elif file == "synonyms":
  download_file(urls["graph_l2"])
  download_file(urls["graph_l3"])
sinatools/VERSION CHANGED
@@ -1 +1 @@
- 0.1.35
+ 0.1.36
sinatools/wsd/disambiguator.py CHANGED
@@ -8,10 +8,6 @@ from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
  from sinatools.morphology.morph_analyzer import analyze
  from sinatools.ner.entity_extractor import extract
  from . import glosses_dic
- import time
- #import concurrent
- #import threading
- import multiprocessing
 
 
  def distill_entities(entities):
@@ -260,7 +256,7 @@ def find_named_entities(string):
  return found_entities
 
 
- def find_glosses_using_ALMA(word, glosses_dic):
+ def find_glosses_using_ALMA(word):
 
  data = analyze(word, language ='MSA', task ='full', flag="1")
  Diac_lemma = ""
@@ -306,7 +302,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
  return my_json
 
 
- def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner, glosses_dic):
+ def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner):
  output_list = []
  position = 0
  while position < len(input_sentence):
@@ -393,7 +389,7 @@ def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemm
 
  if flag == "False": # Not found in ner or in multi_word_dictionary, ASK ALMA
  word = input_sentence[position]
- word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word, glosses_dic)
+ word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word)
  my_json = {}
  my_json['word'] = word
  my_json['concept_count'] = concept_count
@@ -436,95 +432,26 @@ def disambiguate_glosses_main(word, sentence):
  glosses = word['glosses']
  Diac_lemma = word['Diac_lemma']
  Undiac_lemma = word['Undiac_lemma']
- start = time.time()
- x = disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)
- end = time.time()
- print(f"disambiguate time: {end - start}")
- return x
-
-
- def init_resources():
- global glosses_dic
-
-
- # Wrapper function for multiprocessing
- def disambiguate_glosses_in_parallel(word_and_sentence):
- word, sentence = word_and_sentence
- return disambiguate_glosses_main(word, sentence)
+ return disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)
 
  def WSD(sentence):
- start = time.time()
+
  input_sentence = simple_word_tokenize(sentence)
- end = time.time()
- print(f"tokenizer time: {end - start}")
-
- start = time.time()
+
  five_word_lemma = find_five_word_lemma(input_sentence)
- end = time.time()
- print(f"5grams time: {end - start}")
 
- start = time.time()
  four_word_lemma = find_four_word_lemma(input_sentence)
- end = time.time()
- print(f"4grams time: {end - start}")
-
- start = time.time()
+
  three_word_lemma = find_three_word_lemma(input_sentence)
- end = time.time()
- print(f"3grams time: {end - start}")
-
- start = time.time()
+
  two_word_lemma = find_two_word_lemma(input_sentence)
- end = time.time()
- print(f"2grams time: {end - start}")
-
- start = time.time()
+
  ner = find_named_entities(" ".join(input_sentence))
- end = time.time()
- print(f"ner time: {end - start}")
-
-
- start = time.time()
- output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner, glosses_dic_shared)
- end = time.time()
- print(f"lookup time: {end - start}")
-
- # for word in output_list:
- # start = time.time()
- # results.append(disambiguate_glosses_main(word, sentence))
- # end = time.time()
- # print(f"disambiguate time: {end - start}")
- # return results
-
- # with concurrent.futures.ProcessPoolExecutor() as executor:
- # results = list(executor.map(lambda word: disambiguate_glosses_main(word, sentence), output_list))
- # return results
-
- # Create and start threads
- # for word in output_list:
- # thread = threading.Thread(target=worker, args=(word, sentence))
- # threads.append(thread)
- # thread.start()
- #
- # for thread in threads:
- # thread.join()
- #
- # return threading_results
-
- # Number of CPUs
- num_cpus = multiprocessing.cpu_count()
- print("num_cpus : ", num_cpus)
-
- # Create a manager to hold shared data
- # with multiprocessing.Manager() as manager:
- # glosses_dic_shared = manager.dict(glosses_dic)
- # with multiprocessing.Pool(num_cpus) as pool:
- # arguments = [(word, sentence) for word in output_list]
- # results = pool.starmap(disambiguate_glosses_main, arguments)
-
- with multiprocessing.Pool(initializer=init_resources) as pool:
- # Map the list of words to the disambiguation function in parallel
- results = pool.map(disambiguate_glosses_in_parallel, [(word, sentence) for word in output_list])
+
+ output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner)
+ results = []
+ for word in output_list:
+ results.append(disambiguate_glosses_main(word, sentence))
  return results
 
 
@@ -570,8 +497,5 @@ def disambiguate(sentence):
  content = ["Input is too long"]
  return content
  else:
- start = time.time()
  results = WSD(sentence)
- end = time.time()
- print(f"WSD total time: {end - start}")
  return results
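
Taken together, the hunks above replace the timed, multiprocessing-based disambiguation loop with a plain sequential one and drop the glosses_dic parameter that was threaded through find_glosses and find_glosses_using_ALMA. A minimal sketch of the resulting control flow is shown below; the stub functions are placeholders for the real SinaTools helpers named in the diff and only illustrate the call order, not their actual behaviour.

    # Illustrative sketch only: stubs stand in for the SinaTools helpers named in the hunks.
    def simple_word_tokenize(sentence):
        return sentence.split()

    def find_five_word_lemma(tokens): return {}
    def find_four_word_lemma(tokens): return {}
    def find_three_word_lemma(tokens): return {}
    def find_two_word_lemma(tokens): return {}
    def find_named_entities(text): return []

    def find_glosses(tokens, two, three, four, five, ner):
        # In 0.1.36 the gloss dictionary is no longer passed in as a parameter.
        return [{"word": token} for token in tokens]

    def disambiguate_glosses_main(word, sentence):
        return word

    def WSD(sentence):
        tokens = simple_word_tokenize(sentence)
        five = find_five_word_lemma(tokens)
        four = find_four_word_lemma(tokens)
        three = find_three_word_lemma(tokens)
        two = find_two_word_lemma(tokens)
        ner = find_named_entities(" ".join(tokens))
        candidates = find_glosses(tokens, two, three, four, five, ner)
        # Sequential disambiguation replaces the multiprocessing.Pool used in 0.1.35.
        return [disambiguate_glosses_main(word, sentence) for word in candidates]

    print(WSD("جملة للتجربة"))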
sinatools/ner/data.py DELETED
@@ -1,124 +0,0 @@
1
- from torch.utils.data import DataLoader
2
- from torchtext.vocab import vocab
3
- from collections import Counter, namedtuple
4
- import logging
5
- import re
6
- import itertools
7
- from sinatools.ner.helpers import load_object
8
- from sinatools.ner.datasets import Token
9
- from sinatools.utils.tokenizers_words import simple_word_tokenize
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
- def conll_to_segments(filename):
15
- """
16
- Convert CoNLL files to segments. This return list of segments and each segment is
17
- a list of tuples (token, tag)
18
- :param filename: Path
19
- :return: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
20
- """
21
- segments, segment = list(), list()
22
-
23
- with open(filename, "r") as fh:
24
- for token in fh.read().splitlines():
25
- if not token.strip():
26
- segments.append(segment)
27
- segment = list()
28
- else:
29
- parts = token.split()
30
- token = Token(text=parts[0], gold_tag=parts[1:])
31
- segment.append(token)
32
-
33
- segments.append(segment)
34
-
35
- return segments
36
-
37
-
38
- def parse_conll_files(data_paths):
39
- """
40
- Parse CoNLL formatted files and return list of segments for each file and index
41
- the vocabs and tags across all data_paths
42
- :param data_paths: tuple(Path) - tuple of filenames
43
- :return: tuple( [[(token, tag), ...], [(token, tag), ...]], -> segments for data_paths[i]
44
- [[(token, tag), ...], [(token, tag), ...]], -> segments for data_paths[i+1],
45
- ...
46
- )
47
- List of segments for each dataset and each segment has list of (tokens, tags)
48
- """
49
- vocabs = namedtuple("Vocab", ["tags", "tokens"])
50
- datasets, tags, tokens = list(), list(), list()
51
-
52
- for data_path in data_paths:
53
- dataset = conll_to_segments(data_path)
54
- datasets.append(dataset)
55
- tokens += [token.text for segment in dataset for token in segment]
56
- tags += [token.gold_tag for segment in dataset for token in segment]
57
-
58
- # Flatten list of tags
59
- tags = list(itertools.chain(*tags))
60
-
61
- # Generate vocabs for tags and tokens
62
- tag_vocabs = tag_vocab_by_type(tags)
63
- tag_vocabs.insert(0, vocab(Counter(tags)))
64
- vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
65
- return tuple(datasets), vocabs
66
-
67
-
68
- def tag_vocab_by_type(tags):
69
- vocabs = list()
70
- c = Counter(tags)
71
- tag_names = c.keys()
72
- tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
73
-
74
- for tag_type in tag_types:
75
- r = re.compile(".*-" + tag_type)
76
- t = list(filter(r.match, tags)) + ["O"]
77
- vocabs.append(vocab(Counter(t), specials=["<pad>"]))
78
-
79
- return vocabs
80
-
81
-
82
- def text2segments(text):
83
- """
84
- Convert text to a datasets and index the tokens
85
- """
86
- #dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
87
- list_of_tokens = simple_word_tokenize(text)
88
- dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
89
- tokens = [token.text for segment in dataset for token in segment]
90
-
91
- # Generate vocabs for the tokens
92
- segment_vocab = vocab(Counter(tokens), specials=["UNK"])
93
- return dataset, segment_vocab
94
-
95
-
96
- def get_dataloaders(
97
- datasets, vocab, data_config, batch_size=32, num_workers=0, shuffle=(True, False, False)
98
- ):
99
- """
100
- From the datasets generate the dataloaders
101
- :param datasets: list - list of the datasets, list of list of segments and tokens
102
- :param batch_size: int
103
- :param num_workers: int
104
- :param shuffle: boolean - to shuffle the data or not
105
- :return: List[torch.utils.data.DataLoader]
106
- """
107
- dataloaders = list()
108
-
109
- for i, examples in enumerate(datasets):
110
- data_config["kwargs"].update({"examples": examples, "vocab": vocab})
111
- dataset = load_object("sinatools."+data_config["fn"], data_config["kwargs"])
112
-
113
- dataloader = DataLoader(
114
- dataset=dataset,
115
- shuffle=shuffle[i],
116
- batch_size=batch_size,
117
- num_workers=num_workers,
118
- collate_fn=dataset.collate_fn,
119
- )
120
-
121
- logger.info("%s batches found", len(dataloader))
122
- dataloaders.append(dataloader)
123
-
124
- return dataloaders
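
The deleted module's core helper, conll_to_segments, split a CoNLL file on blank lines into segments of (token, tags) pairs. A self-contained approximation is sketched below for reference; the Token namedtuple is a stand-in for sinatools.ner.datasets.Token, which the original imported, and the sample lines are made up for illustration.

    from collections import namedtuple

    # Stand-in for sinatools.ner.datasets.Token used by the deleted module.
    Token = namedtuple("Token", ["text", "gold_tag"])

    def conll_to_segments_sketch(lines):
        # Blank lines separate segments; each non-blank line is "token TAG [TAG ...]".
        segments, segment = [], []
        for line in lines:
            if not line.strip():
                segments.append(segment)
                segment = []
            else:
                parts = line.split()
                segment.append(Token(text=parts[0], gold_tag=parts[1:]))
        segments.append(segment)
        return segments

    sample = ["القدس B-GPE", "مدينة O", "", "فلسطين B-GPE"]
    print(conll_to_segments_sketch(sample))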
sinatools/ner/relation_extractor.py DELETED
@@ -1,201 +0,0 @@
- import torch
- import json
- from urllib.request import Request, urlopen
- from sinatools.ner.entity_extractor import extract
- from . import pipe
-
-
- # ============================ Extract entities and their types ========================
- def jsons_to_list_of_lists(json_list):
- return [[d['token'], d['tags']] for d in json_list]
-
- def entities_and_types(sentence):
- output_list = jsons_to_list_of_lists(extract(sentence))
- json_short = distill_entities(output_list)
-
- entities = {}
- for entity in json_short:
- name = entity[0]
- entity_type = entity[1]
- entities[name] = entity_type
-
- return entities
-
- def distill_entities(entities):
- # This is list that we put the output what we need
- list_output = list()
-
- # This line go to sort function and save the output to temp_entities
- temp_entities = sortTags(entities)
-
- # This list help us to make the output,
- temp_list = list()
-
- # initlize the temp_list
- temp_list.append(["", "", 0, 0])
- word_position = 0
-
- # For each entity, convert ibo to distllir list.
- for entity in temp_entities:
- # This is counter tag of this entity
- counter_tag = 0
- # For each tag
- for tag in str(entity[1]).split():
- # If the counter tag greater than or equal to lenght of templist, if yes then we will append the empty value in templist
- if counter_tag >= len(temp_list):
- temp_list.append(["", "", 0, 0])
-
- # If tag equal O and word postion of this tag is not equal zero then it will add all
- # not empty eliment of temp list in output list
- if "O" == tag and word_position != 0:
- for j in range(0, len(temp_list)):
- if temp_list[j][1] != "":
- list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
- temp_list[j][0] = ""
- temp_list[j][1] = ""
- temp_list[j][2] = word_position
- temp_list[j][3] = word_position
- # if this tag not equal O, and split by '-' the tag and check the lenght equals two and if the first eliment
- # of the split its B
- elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
- # if the temp_list of counter is not empty then it will append in output list and hten it will
- # initilize by new string and tag in templist of counter
- if temp_list[counter_tag][1] != "":
- list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
- temp_list[counter_tag][0] = str(entity[0]) + " "
- temp_list[counter_tag][1] = str(tag).split("-")[1]
- temp_list[counter_tag][2] = word_position
- temp_list[counter_tag][3] = word_position
-
- # if this tag not equal O, and split by '-' the tag and check the lenght equals two and if the first eliment
- # of the split its O
- elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
- # For each of temp_list, check if in this counter tag of templist is same tag with this.tag
- # then will complete if not it will save in output list and cheak another
- for j in range(counter_tag,len(temp_list)):
- if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
- temp_list[j][0] += str(entity[0]) + " "
- temp_list[j][3] += 1
- break
- else:
- if temp_list[j][1] != "":
- list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
- temp_list[j][0] = ""
- temp_list[j][1] = ""
- temp_list[j][2] = word_position
- temp_list[j][3] = word_position
- counter_tag += 1
- word_position += 1
- # For each temp_list, at the end of the previous loop, there will be some
- # values in this list, we should save it to the output list
- for j in range(0, len(temp_list)):
- if temp_list[j][1] != "":
- list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
- return sorted(list_output, key=lambda x: (x[2]))
-
- def sortTags(entities):
- temp_entities = entities
- temp_counter = 0
- # For each entity, this loop will sort each tag of entitiy, first it will check if the
- # previous tags has same count of this tag, second will sort the tags and check if this tags is correct
- for entity in temp_entities:
- tags = entity[1].split()
- for tag in tags:
- # if the counter is not 0 then, will complete
- if temp_counter != 0:
- # Check if this tag is equal I-, if yes then it will count how many tag in this tags and
- # count how many tag in previous tags
- if "I-" == tag[0:2]:
- counter_of_this_tag = 0
- counter_of_previous_tag = 0
- for word in tags:
- if tag.split("-")[1] in word:
- counter_of_this_tag+=1
- for word in temp_entities[temp_counter-1][1].split():
- if tag.split("-")[1] in word:
- counter_of_previous_tag+=1
- # if the counter of previous tag is bigger than counter of this tag, then we
- # need to add I-tag in this tags
- if counter_of_previous_tag > counter_of_this_tag:
- tags.append("I-"+tag.split("-")[1])
- # Sort the tags
- tags.sort()
- # Need to revers the tags because it should begins with I
- tags.reverse()
- # If the counter is not 0 then we can complete
- if temp_counter != 0:
- this_tags = tags
- previous_tags = temp_entities[temp_counter - 1][1].split()
- sorted_tags = list()
-
- # Check if the this tag is not O and previous tags is not O, then will complete,
- # if not then it will ignor this tag
- if "O" not in this_tags and "O" not in previous_tags:
- index = 0
- #For each previous tags, need sort this tag by previous tags if its I, B we can ignor
- for i in previous_tags:
- j = 0
- while this_tags and j < len(this_tags):
- if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
- sorted_tags.insert(index, this_tags.pop(j))
- break
- elif this_tags[j][0:2] == "B-":
- break
- j += 1
- index += 1
- sorted_tags += this_tags
- tags = sorted_tags
- str_tag = " "
- str_tag = str_tag.join(tags)
- str_tag = str_tag.strip()
- temp_entities[temp_counter][1] = str_tag
- temp_counter += 1
- return temp_entities
-
- # ============= Prepare Templates and Catergorize Extracted Entities ================
- temp03={'location':'مكان حدوث','agent':'أحد المتأثرين في','happened at':'تاريخ حدوث'}
- categories = {
- 'agent': ['PERS', 'NORP', 'OCC', 'ORG'],
- 'location': ['LOC', 'FAC', 'GPE'],
- 'happened at': ['DATE', 'TIME']
- }
-
- def get_entity_category(entity_type, categories):
- for category, types in categories.items():
- if entity_type in types:
- return category
- return None
-
-
- # ============ Extract entities, their types and categorize them ===============
- def relation_extraction(sentence):
- #test_sentence="صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."
- entities=entities_and_types(sentence)
-
- event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type == 'EVENT']
- arg_event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type != 'EVENT']
-
- output_list=[]
-
- for i in event_indices:
- event_entity=list(entities.keys())[i]
- for j in arg_event_indices:
- arg_name= list(entities.keys())[j]
- arg_type=entities[arg_name]
- category = get_entity_category(arg_type, categories)
-
- if category in temp03:
- relation_sentence=f"[CLS] {sentence} [SEP] {event_entity} {temp03[category]} {arg_name}"
- predicted_relation=pipe(relation_sentence)
- score = predicted_relation[0][0]['score']
- if score > 0.50:
- #print(f"Event:{event_entity} Relation:{category} Argument:{arg_name}\n")
- #output_list.append([{event_entity} ,{category}, {arg_name}])
- output_list.append(f"Event:{event_entity}, Relation:{category}, Argument:{arg_name}")
-
- else:
- #print(f"Event:{event_entity} Relation:No relation Argument:{arg_name}\n")
- #output_list.append([{event_entity} ,'No relation', {arg_name}])
- output_list.append(f"Event:{event_entity}, Relation:No relation, Argument:{arg_name}")
-
- return output_list