SinaTools 0.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
- SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
- SinaTools-0.1.1.dist-info/LICENSE +22 -0
- SinaTools-0.1.1.dist-info/METADATA +72 -0
- SinaTools-0.1.1.dist-info/RECORD +122 -0
- SinaTools-0.1.1.dist-info/WHEEL +6 -0
- SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.1.dist-info/top_level.txt +1 -0
- nlptools/CLI/DataDownload/download_files.py +71 -0
- nlptools/CLI/arabiner/bin/infer.py +117 -0
- nlptools/CLI/arabiner/bin/infer2.py +81 -0
- nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
- nlptools/CLI/morphology/morph_analyzer.py +91 -0
- nlptools/CLI/salma/salma_tools.py +68 -0
- nlptools/CLI/utils/__init__.py +0 -0
- nlptools/CLI/utils/arStrip.py +99 -0
- nlptools/CLI/utils/corpus_tokenizer.py +74 -0
- nlptools/CLI/utils/implication.py +92 -0
- nlptools/CLI/utils/jaccard.py +96 -0
- nlptools/CLI/utils/latin_remove.py +51 -0
- nlptools/CLI/utils/remove_Punc.py +53 -0
- nlptools/CLI/utils/sentence_tokenizer.py +90 -0
- nlptools/CLI/utils/text_transliteration.py +77 -0
- nlptools/DataDownload/__init__.py +0 -0
- nlptools/DataDownload/downloader.py +185 -0
- nlptools/VERSION +1 -0
- nlptools/__init__.py +5 -0
- nlptools/arabert/__init__.py +1 -0
- nlptools/arabert/arabert/__init__.py +14 -0
- nlptools/arabert/arabert/create_classification_data.py +260 -0
- nlptools/arabert/arabert/create_pretraining_data.py +534 -0
- nlptools/arabert/arabert/extract_features.py +444 -0
- nlptools/arabert/arabert/lamb_optimizer.py +158 -0
- nlptools/arabert/arabert/modeling.py +1027 -0
- nlptools/arabert/arabert/optimization.py +202 -0
- nlptools/arabert/arabert/run_classifier.py +1078 -0
- nlptools/arabert/arabert/run_pretraining.py +593 -0
- nlptools/arabert/arabert/run_squad.py +1440 -0
- nlptools/arabert/arabert/tokenization.py +414 -0
- nlptools/arabert/araelectra/__init__.py +1 -0
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
- nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
- nlptools/arabert/araelectra/configure_finetuning.py +172 -0
- nlptools/arabert/araelectra/configure_pretraining.py +143 -0
- nlptools/arabert/araelectra/finetune/__init__.py +14 -0
- nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
- nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
- nlptools/arabert/araelectra/finetune/scorer.py +54 -0
- nlptools/arabert/araelectra/finetune/task.py +74 -0
- nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
- nlptools/arabert/araelectra/flops_computation.py +215 -0
- nlptools/arabert/araelectra/model/__init__.py +14 -0
- nlptools/arabert/araelectra/model/modeling.py +1029 -0
- nlptools/arabert/araelectra/model/optimization.py +193 -0
- nlptools/arabert/araelectra/model/tokenization.py +355 -0
- nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
- nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
- nlptools/arabert/araelectra/run_finetuning.py +323 -0
- nlptools/arabert/araelectra/run_pretraining.py +469 -0
- nlptools/arabert/araelectra/util/__init__.py +14 -0
- nlptools/arabert/araelectra/util/training_utils.py +112 -0
- nlptools/arabert/araelectra/util/utils.py +109 -0
- nlptools/arabert/aragpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
- nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
- nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
- nlptools/arabert/aragpt2/grover/__init__.py +0 -0
- nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
- nlptools/arabert/aragpt2/grover/modeling.py +803 -0
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
- nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
- nlptools/arabert/aragpt2/grover/utils.py +234 -0
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
- nlptools/arabert/preprocess.py +818 -0
- nlptools/arabiner/__init__.py +0 -0
- nlptools/arabiner/bin/__init__.py +14 -0
- nlptools/arabiner/bin/eval.py +87 -0
- nlptools/arabiner/bin/infer.py +91 -0
- nlptools/arabiner/bin/process.py +140 -0
- nlptools/arabiner/bin/train.py +221 -0
- nlptools/arabiner/data/__init__.py +1 -0
- nlptools/arabiner/data/datasets.py +146 -0
- nlptools/arabiner/data/transforms.py +118 -0
- nlptools/arabiner/nn/BaseModel.py +22 -0
- nlptools/arabiner/nn/BertNestedTagger.py +34 -0
- nlptools/arabiner/nn/BertSeqTagger.py +17 -0
- nlptools/arabiner/nn/__init__.py +3 -0
- nlptools/arabiner/trainers/BaseTrainer.py +117 -0
- nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
- nlptools/arabiner/trainers/BertTrainer.py +163 -0
- nlptools/arabiner/trainers/__init__.py +3 -0
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +124 -0
- nlptools/arabiner/utils/helpers.py +151 -0
- nlptools/arabiner/utils/metrics.py +69 -0
- nlptools/environment.yml +227 -0
- nlptools/install_env.py +13 -0
- nlptools/morphology/ALMA_multi_word.py +34 -0
- nlptools/morphology/__init__.py +52 -0
- nlptools/morphology/charsets.py +60 -0
- nlptools/morphology/morph_analyzer.py +170 -0
- nlptools/morphology/settings.py +8 -0
- nlptools/morphology/tokenizers_words.py +19 -0
- nlptools/nlptools.py +1 -0
- nlptools/salma/__init__.py +12 -0
- nlptools/salma/settings.py +31 -0
- nlptools/salma/views.py +459 -0
- nlptools/salma/wsd.py +126 -0
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/corpus_tokenizer.py +73 -0
- nlptools/utils/implication.py +662 -0
- nlptools/utils/jaccard.py +247 -0
- nlptools/utils/parser.py +147 -0
- nlptools/utils/readfile.py +3 -0
- nlptools/utils/sentence_tokenizer.py +53 -0
- nlptools/utils/text_transliteration.py +232 -0
- nlptools/utils/utils.py +2 -0
nlptools/CLI/utils/latin_remove.py
ADDED
@@ -0,0 +1,61 @@
+"""
+About:
+------
+The sina_remove_latin tool removes Latin characters from the input text.
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_remove_latin --help.
+
+.. code-block:: none
+
+    Usage:
+        sina_remove_latin --text=TEXT
+        sina_remove_latin --file "path/to/your/file.txt"
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_remove_latin --text "123test"
+
+    sina_remove_latin --file "path/to/your/file.txt"
+
+Note:
+-----
+
+.. code-block:: none
+
+    - This tool is specific to Arabic text, as it focuses on Arabic linguistic elements.
+    - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
+    - This tool removes Latin characters only; if the input contains only Arabic characters or digits, the output is identical to the input.
+
+"""
+
+import argparse
+from nlptools.utils.parser import remove_latin
+from nlptools.utils.readfile import read_file
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Remove Latin characters from the text')
+
+    parser.add_argument('--text', type=str, help='The input text')
+    parser.add_argument('--file', type=str, help='File containing the text to be processed')
+    args = parser.parse_args()
+
+    # Require at least one input source, mirroring the documented usage
+    if args.text is None and args.file is None:
+        print("Either --text or --file argument must be provided.")
+        return
+
+    text_content = args.text if args.text else " ".join(read_file(args.file))
+    result = remove_latin(text_content)
+
+    print(result)
+
+if __name__ == '__main__':
+    main()
+
+#sina_remove_latin --text "123test"
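The cleaning helper can also be called directly from Python. A minimal sketch, assuming `remove_latin` maps a string to a string as the CLI wrapper above implies:

.. code-block:: python

    from nlptools.utils.parser import remove_latin

    # Assumption: remove_latin(str) -> str, stripping Latin letters only;
    # Arabic characters and digits pass through unchanged (per the Note above).
    print(remove_latin("123test"))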
nlptools/CLI/utils/remove_Punc.py
ADDED
@@ -0,0 +1,59 @@
+"""
+About:
+------
+The sina_remove_punctuation tool removes punctuation marks from the input text.
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_remove_punctuation --help.
+
+.. code-block:: none
+
+    Usage:
+        sina_remove_punctuation --text=TEXT
+        sina_remove_punctuation --file "path/to/your/file.txt"
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_remove_punctuation --text "te%s@t...!!?"
+
+    sina_remove_punctuation --file "path/to/your/file.txt"
+
+Note:
+-----
+
+.. code-block:: none
+
+    - This tool is specific to Arabic text, as it focuses on Arabic linguistic elements.
+    - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
+"""
+
+import argparse
+from nlptools.utils.parser import remove_punctuation
+from nlptools.utils.readfile import read_file
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Remove punctuation marks from the text')
+
+    parser.add_argument('--text', type=str, help='The input text')
+    parser.add_argument('--file', type=str, help='File containing the text to be processed')
+    args = parser.parse_args()
+
+    # Require at least one input source, mirroring the documented usage
+    if args.text is None and args.file is None:
+        print("Either --text or --file argument must be provided.")
+        return
+
+    text_content = args.text if args.text else " ".join(read_file(args.file))
+    result = remove_punctuation(text_content)
+
+    print(result)
+
+if __name__ == '__main__':
+    main()
+
+#sina_remove_punctuation --text "your text"
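Likewise, `remove_punctuation` is importable on its own. A minimal sketch under the same string-in, string-out assumption:

.. code-block:: python

    from nlptools.utils.parser import remove_punctuation

    # Assumption: remove_punctuation(str) -> str with punctuation marks stripped,
    # e.g. "te%s@t...!!?" should come back without the marks.
    print(remove_punctuation("te%s@t...!!?"))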
nlptools/CLI/utils/sentence_tokenizer.py
ADDED
@@ -0,0 +1,90 @@
+"""
+About:
+------
+
+The sina_sentence_tokenize tool allows you to tokenize text into sentences using the SinaTools utility. It provides
+flexibility in tokenizing at different punctuation marks, including dots, question marks, and exclamation marks. It also
+allows tokenization at new lines.
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_sentence_tokenize --help.
+
+.. code-block:: none
+
+    Usage:
+        sina_sentence_tokenize --text=TEXT [options]
+
+        sina_sentence_tokenize --file=FILE [options]
+
+.. code-block:: none
+
+    Options:
+      --text TEXT
+            Text to be tokenized into sentences.
+      --file FILE
+            File containing the text to be tokenized into sentences.
+      --dot
+            Tokenize at dots.
+      --new_line
+            Tokenize at new lines.
+      --question_mark
+            Tokenize at question marks.
+      --exclamation_mark
+            Tokenize at exclamation marks.
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_sentence_tokenize --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
+
+    sina_sentence_tokenize --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
+
+Note:
+-----
+
+.. code-block:: none
+
+    - The tokenization options allow for a customized experience. You can choose any combination of the options, or even none
+      of them, to achieve the desired sentence tokenization behavior. If no tokenization options are provided, the tool will
+      use default settings as implemented in the underlying `sent_tokenize` function of SinaTools.
+
+"""
+import argparse
+from nlptools.utils.sentence_tokenizer import sent_tokenize
+from nlptools.utils.readfile import read_file
+
+def main():
+    parser = argparse.ArgumentParser(description='Sentence Tokenization using SinaTools')
+
+    # Adding arguments for the text, file, and tokenization options
+    parser.add_argument('--text', type=str, help='Text to be tokenized into sentences')
+    parser.add_argument('--file', type=str, help='File containing the text to be tokenized into sentences')
+    parser.add_argument('--dot', action='store_true', help='Tokenize at dots')
+    parser.add_argument('--new_line', action='store_true', help='Tokenize at new lines')
+    parser.add_argument('--question_mark', action='store_true', help='Tokenize at question marks')
+    parser.add_argument('--exclamation_mark', action='store_true', help='Tokenize at exclamation marks')
+
+    args = parser.parse_args()
+
+    # Check if either text or file is provided
+    if args.text is None and args.file is None:
+        print("Either --text or --file argument must be provided.")
+        return
+
+    text_content = args.text if args.text else " ".join(read_file(args.file))
+
+    # Perform sentence tokenization
+    sentences = sent_tokenize(text_content, dot=args.dot, new_line=args.new_line,
+                              question_mark=args.question_mark, exclamation_mark=args.exclamation_mark)
+
+    # Print each sentence on its own line
+    for sentence in sentences:
+        print(sentence)
+
+if __name__ == '__main__':
+    main()
+#sina_sentence_tokenize --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
+#sina_sentence_tokenize --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
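Since the CLI is a thin wrapper over `sent_tokenize`, the same call can be made programmatically. The keyword arguments below mirror the invocation in `main()` above; only the chosen flag values are illustrative:

.. code-block:: python

    from nlptools.utils.sentence_tokenizer import sent_tokenize

    # Split at dots, question marks, and exclamation marks, but not at new lines,
    # mirroring the CLI examples shown above.
    sentences = sent_tokenize("Your text here. Does it work? Yes!",
                              dot=True, new_line=False,
                              question_mark=True, exclamation_mark=True)
    for sentence in sentences:
        print(sentence)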
nlptools/CLI/utils/text_transliteration.py
ADDED
@@ -0,0 +1,77 @@
+"""
+About:
+------
+
+The sina_transliterate tool allows you to transliterate text using the SinaTools utility. This command-line utility
+takes in a text and a desired schema, and outputs the transliterated text.
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_transliterate --help.
+
+Usage:
+------
+
+.. code-block:: none
+
+    sina_transliterate --text=TEXT --schema=SCHEMA
+
+    sina_transliterate --file=FILE --schema=SCHEMA
+
+Options:
+--------
+
+.. code-block:: none
+
+    --text TEXT
+        Text to be transliterated.
+    --schema SCHEMA
+        Transliteration schema to be used, either bw2ar or ar2bw.
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_transliterate --text "klmp" --schema "bw2ar"
+    sina_transliterate --file "path/to/your/file.txt" --schema "ar2bw"
+
+Note:
+-----
+
+.. code-block:: none
+
+    For available transliteration schemas and more details, please refer to the SinaTools documentation or the source code
+    of the function `perform_transliteration`.
+
+"""
+import argparse
+from nlptools.utils.text_transliteration import perform_transliteration
+from nlptools.utils.readfile import read_file
+
+def main():
+    parser = argparse.ArgumentParser(description='Perform text transliteration using SinaTools')
+
+    # Adding arguments for the text, file, and schema
+    parser.add_argument('--text', type=str, help='Text to be transliterated')
+    parser.add_argument('--file', type=str, help='File containing the text to be transliterated')
+    parser.add_argument('--schema', type=str, required=True, help='Transliteration schema to be used')
+
+    args = parser.parse_args()
+
+    # Check if either text or file is provided
+    if args.text is None and args.file is None:
+        print("Either --text or --file argument must be provided.")
+        return
+
+    text_content = args.text if args.text else " ".join(read_file(args.file))
+    # Perform transliteration
+    result = perform_transliteration(text_content, args.schema)
+
+    print(result)
+
+if __name__ == '__main__':
+    main()
+
+#sina_transliterate --text "example text" --schema "bw2ar"
+#sina_transliterate --file "path/to/your/file.txt" --schema "bw2ar"
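`perform_transliteration` takes the text and the schema name exactly as the CLI passes them. A short sketch; the bw2ar/ar2bw schema names come from the docstring above, and the return value is assumed to be printable as-is (the CLI simply prints it):

.. code-block:: python

    from nlptools.utils.text_transliteration import perform_transliteration

    # Buckwalter to Arabic script, as in the CLI example above.
    print(perform_transliteration("klmp", "bw2ar"))
    # And the reverse direction.
    print(perform_transliteration("مرحبا", "ar2bw"))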
nlptools/DataDownload/__init__.py
File without changes
nlptools/DataDownload/downloader.py
ADDED
@@ -0,0 +1,185 @@
+import os
+import sys
+from pathlib import Path
+import requests
+import zipfile
+from tqdm import tqdm
+import tarfile
+urls = {
+    'morph': 'https://portal.sina.birzeit.edu/ALMA27012000.pickle',
+    'ner': 'https://portal.sina.birzeit.edu/Wj27012000.tar.gz',
+    'salma_model': 'https://portal.sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
+    'salma_tokenizer': 'https://portal.sina.birzeit.edu/bert-base-arabertv02.zip',
+    'glosses_dic': 'https://portal.sina.birzeit.edu/glosses_dic.pickle',
+    'lemma_dic': 'https://portal.sina.birzeit.edu/lemmas_dic.pickle',
+    'five_grams': 'https://portal.sina.birzeit.edu/five_grams.pickle',
+    'four_grams': 'https://portal.sina.birzeit.edu/four_grams.pickle',
+    'three_grams': 'https://portal.sina.birzeit.edu/three_grams.pickle',
+    'two_grams': 'https://portal.sina.birzeit.edu/two_grams.pickle'
+}
+
+def get_appdatadir():
+    """
+    Returns the path to the directory where the application data is stored, creating the directory first if it does not exist.
+
+    Returns:
+    --------
+    Path: A pathlib.Path object representing the path to the application data directory.
+
+    Raises:
+    -------
+    None.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from nlptools.DataDownload import downloader
+
+        path = downloader.get_appdatadir()
+
+        # Windows: 'C:/Users/<Username>/AppData/Roaming/nlptools'
+        # MacOS: '/Users/<Username>/Library/Application Support/nlptools'
+        # Linux: '/home/<Username>/.nlptools'
+        # Google Colab: '/content/nlptools'
+
+    """
+    home = str(Path.home())
+    if 'google.colab' in sys.modules:
+        path = Path('/content/nlptools')
+    elif sys.platform == 'win32':
+        path = Path(home, 'AppData/Roaming/nlptools')
+    elif sys.platform == 'darwin':
+        path = Path(home, 'Library/Application Support/nlptools')
+    else:
+        path = Path(home, '.nlptools')
+
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+    return path
+
+def download_file(url='https://portal.sina.birzeit.edu/Wj27012000.tar.gz', dest_path=get_appdatadir()):
+    """
+    Downloads a file from the specified URL and saves it to the specified destination path.
+
+    Args:
+        url (:obj:`str`): The URL of the file to be downloaded.
+        dest_path (:obj:`str`): The destination path to save the downloaded file to. Defaults
+            to the user's application data directory.
+
+
+    Returns:
+        :obj:`str`: The absolute path of the downloaded file.
+
+    Raises:
+        requests.exceptions.HTTPError: If there was an HTTP error during the request.
+
+    Note:
+        This method uses the `requests` and `tqdm` libraries. It also checks whether the
+        downloaded file is compressed and extracts it accordingly.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        download_file(url='https://example.com/data.zip', dest_path='data/')
+
+    """
+    filename = os.path.basename(url)
+    file_path = os.path.join(dest_path, filename)
+
+    print(filename)
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+
+    try:
+        with requests.get(url, headers=headers, stream=True) as r:
+            r.raise_for_status()
+            with open(file_path, 'wb') as f:
+                total_size = int(r.headers.get('content-length', 0))
+                block_size = 8192
+                progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
+                for chunk in r.iter_content(chunk_size=block_size):
+                    if chunk:
+                        f.write(chunk)
+                        progress_bar.update(len(chunk))
+                progress_bar.close()
+
+        # Check the file type and extract accordingly
+        file_extension = os.path.splitext(file_path)[1]
+        extracted_folder_name = os.path.splitext(file_path)[0]
+
+        if file_extension == '.zip':
+            extract_zip(file_path, extracted_folder_name)
+        elif file_extension == '.gz':
+
+            extract_tar(file_path, extracted_folder_name)
+        elif file_extension == '.pickle':
+            print(f'Done: {file_extension}')
+
+        else:
+            print(f'Unsupported file type for extraction: {file_extension}')
+
+        return file_path
+
+    except requests.exceptions.HTTPError as e:
+        if e.response.status_code == 403:
+            print(f'Error 403: Forbidden. The requested file URL {url} could not be downloaded due to insufficient permissions. Please check the URL and try again.')
+        else:
+            print('An error occurred while downloading the file:', e)
+
+def extract_zip(file_path, extracted_folder_name):
+    """
+    Extracts the contents of a ZIP file to the specified folder.
+
+    Args:
+        file_path (str): The path to the ZIP file.
+        extracted_folder_name (str): The name of the folder where the contents will be extracted.
+
+    Returns:
+        None
+    """
+    with zipfile.ZipFile(file_path, 'r') as zip_file:
+        zip_file.extractall(extracted_folder_name)
+
+
+def extract_tar(file_path, dest_path):
+    """
+    Extracts the contents of a tar.gz file to the specified destination path.
+
+    Args:
+        file_path (str): The path to the tar.gz file.
+        dest_path (str): The destination path where the contents will be extracted.
+
+    Returns:
+        str: The path to the extracted folder if successful, or None if extraction failed.
+    """
+    try:
+        with tarfile.open(file_path, 'r:gz') as tar:
+            # Remove the extension from the file name
+            extracted_folder_name = os.path.splitext(os.path.basename(file_path))[0]
+            extracted_folder_path = os.path.join(dest_path, extracted_folder_name)
+
+            # Extract the contents to the destination path
+            tar.extractall(dest_path)
+
+            # Remove the compressed file
+            os.remove(file_path)
+
+            return extracted_folder_path
+
+    except tarfile.ReadError:
+        print(f'Failed to extract the file: {file_path}')
+        return None
+
+
+def download_files():
+    """
+    Downloads multiple files from a dictionary of URLs using the download_file() function.
+    Returns:
+        None
+    """
+    for url in urls.values():
+        download_file(url)
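For scripted setups, the downloader can be driven directly; `urls`, `download_file`, and `download_files` are all defined in the module above, so the only assumption here is that the portal URLs are still reachable:

.. code-block:: python

    from nlptools.DataDownload import downloader

    # Fetch a single resource into the default application data directory
    # (see get_appdatadir above for the per-platform location).
    downloader.download_file(downloader.urls['morph'])

    # Or download (and, where applicable, extract) every registered resource.
    downloader.download_files()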
nlptools/VERSION
ADDED
@@ -0,0 +1 @@
+0.1.1
nlptools/__init__.py
ADDED
@@ -0,0 +1 @@
+# coding=utf-8
@@ -0,0 +1,14 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.