SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +5 -7
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.40.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,59 +0,0 @@
|
|
1
|
-
import argparse
|
2
|
-
import glob
|
3
|
-
import os
|
4
|
-
from tokenizers import ByteLevelBPETokenizer
|
5
|
-
from transformers import GPT2TokenizerFast
|
6
|
-
|
7
|
-
|
8
|
-
if __name__ == "__main__":
|
9
|
-
parser = argparse.ArgumentParser()
|
10
|
-
parser.add_argument("--data-files", type=str, required=True)
|
11
|
-
parser.add_argument("--vocab-size", type=int, required=True)
|
12
|
-
parser.add_argument("--output-dir", type=str, required=True)
|
13
|
-
parser.add_argument("--output-file-name", type=str, required=True)
|
14
|
-
args = parser.parse_args()
|
15
|
-
|
16
|
-
gpt2_tok = ByteLevelBPETokenizer(add_prefix_space=True)
|
17
|
-
|
18
|
-
files = glob.glob(args.data_files)
|
19
|
-
if len(files) > 10:
|
20
|
-
print(files[0:10])
|
21
|
-
else:
|
22
|
-
print(files)
|
23
|
-
|
24
|
-
gpt2_tok.train(
|
25
|
-
files=files,
|
26
|
-
vocab_size=args.vocab_size,
|
27
|
-
show_progress=True,
|
28
|
-
special_tokens=["<|endoftext|>", "<s>", "<pad>", "</s>"],
|
29
|
-
)
|
30
|
-
|
31
|
-
if not os.path.exists(args.output_dir):
|
32
|
-
os.makedirs(args.output_dir)
|
33
|
-
|
34
|
-
|
35
|
-
gpt2_tok.save(
|
36
|
-
os.path.join(args.output_dir,"tokenizer.json"), pretty=True
|
37
|
-
) # FIX Access is denied. (os error 5)
|
38
|
-
gpt2_tok.save_model(args.output_dir, args.output_file_name)
|
39
|
-
|
40
|
-
# tokenizer = GPT2TokenizerFast(
|
41
|
-
# vocab_file=os.path.join(args.output_dir, args.output_file_name) + "-vocab.json",
|
42
|
-
# merges_file=os.path.join(args.output_dir, args.output_file_name)
|
43
|
-
# + "-merges.txt",
|
44
|
-
# add_prefix_space=True,
|
45
|
-
# )
|
46
|
-
|
47
|
-
# tokenizer.add_special_tokens(
|
48
|
-
# {
|
49
|
-
# "eos_token": "<|endoftext|>",
|
50
|
-
# "bos_token": "<|endoftext|>",
|
51
|
-
# "unk_token": "<|endoftext|>",
|
52
|
-
# "pad_token": "<|endoftext|>",
|
53
|
-
# "mask_token": "<|endoftext|>",
|
54
|
-
# }
|
55
|
-
# )
|
56
|
-
|
57
|
-
# tokenizer.save_pretrained(
|
58
|
-
# args.output_dir, legacy_format=False, filename_prefix=args.output_file_name
|
59
|
-
# )
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|