SinaTools 0.1.41__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  5. sinatools/ner/trainers/BertTrainer.py +163 -163
  6. sinatools/ner/trainers/__init__.py +2 -2
  7. SinaTools-0.1.41.dist-info/RECORD +0 -123
  8. sinatools/arabert/arabert/__init__.py +0 -14
  9. sinatools/arabert/arabert/create_classification_data.py +0 -260
  10. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  11. sinatools/arabert/arabert/extract_features.py +0 -444
  12. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  13. sinatools/arabert/arabert/modeling.py +0 -1027
  14. sinatools/arabert/arabert/optimization.py +0 -202
  15. sinatools/arabert/arabert/run_classifier.py +0 -1078
  16. sinatools/arabert/arabert/run_pretraining.py +0 -593
  17. sinatools/arabert/arabert/run_squad.py +0 -1440
  18. sinatools/arabert/arabert/tokenization.py +0 -414
  19. sinatools/arabert/araelectra/__init__.py +0 -1
  20. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  21. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  22. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  23. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  24. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  25. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  26. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  27. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  28. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  29. sinatools/arabert/araelectra/finetune/task.py +0 -74
  30. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  31. sinatools/arabert/araelectra/flops_computation.py +0 -215
  32. sinatools/arabert/araelectra/model/__init__.py +0 -14
  33. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  34. sinatools/arabert/araelectra/model/optimization.py +0 -193
  35. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  36. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  37. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  38. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  39. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  40. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  41. sinatools/arabert/araelectra/util/__init__.py +0 -14
  42. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  43. sinatools/arabert/araelectra/util/utils.py +0 -109
  44. sinatools/arabert/aragpt2/__init__.py +0 -2
  45. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  46. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  47. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  48. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  49. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  50. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  51. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  52. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  53. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  54. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  55. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  56. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  57. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  58. {SinaTools-0.1.41.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  59. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  60. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  61. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  62. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  63. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,59 +0,0 @@
1
- import argparse
2
- import glob
3
- import os
4
- from tokenizers import ByteLevelBPETokenizer
5
- from transformers import GPT2TokenizerFast
6
-
7
-
8
- if __name__ == "__main__":
9
- parser = argparse.ArgumentParser()
10
- parser.add_argument("--data-files", type=str, required=True)
11
- parser.add_argument("--vocab-size", type=int, required=True)
12
- parser.add_argument("--output-dir", type=str, required=True)
13
- parser.add_argument("--output-file-name", type=str, required=True)
14
- args = parser.parse_args()
15
-
16
- gpt2_tok = ByteLevelBPETokenizer(add_prefix_space=True)
17
-
18
- files = glob.glob(args.data_files)
19
- if len(files) > 10:
20
- print(files[0:10])
21
- else:
22
- print(files)
23
-
24
- gpt2_tok.train(
25
- files=files,
26
- vocab_size=args.vocab_size,
27
- show_progress=True,
28
- special_tokens=["<|endoftext|>", "<s>", "<pad>", "</s>"],
29
- )
30
-
31
- if not os.path.exists(args.output_dir):
32
- os.makedirs(args.output_dir)
33
-
34
-
35
- gpt2_tok.save(
36
- os.path.join(args.output_dir,"tokenizer.json"), pretty=True
37
- ) # FIX Access is denied. (os error 5)
38
- gpt2_tok.save_model(args.output_dir, args.output_file_name)
39
-
40
- # tokenizer = GPT2TokenizerFast(
41
- # vocab_file=os.path.join(args.output_dir, args.output_file_name) + "-vocab.json",
42
- # merges_file=os.path.join(args.output_dir, args.output_file_name)
43
- # + "-merges.txt",
44
- # add_prefix_space=True,
45
- # )
46
-
47
- # tokenizer.add_special_tokens(
48
- # {
49
- # "eos_token": "<|endoftext|>",
50
- # "bos_token": "<|endoftext|>",
51
- # "unk_token": "<|endoftext|>",
52
- # "pad_token": "<|endoftext|>",
53
- # "mask_token": "<|endoftext|>",
54
- # }
55
- # )
56
-
57
- # tokenizer.save_pretrained(
58
- # args.output_dir, legacy_format=False, filename_prefix=args.output_file_name
59
- # )