rababa 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/python.yml +81 -0
  3. data/.github/workflows/release.yml +36 -0
  4. data/.github/workflows/ruby.yml +27 -0
  5. data/.gitignore +3 -0
  6. data/.rubocop.yml +1 -1
  7. data/CODE_OF_CONDUCT.md +13 -13
  8. data/README.adoc +80 -0
  9. data/Rakefile +1 -1
  10. data/docs/{research-arabic-diacritization-06-2021.md → research-arabic-diacritization-06-2021.adoc} +52 -37
  11. data/exe/rababa +1 -1
  12. data/lib/README.adoc +95 -0
  13. data/lib/rababa/diacritizer.rb +16 -8
  14. data/lib/rababa/encoders.rb +2 -2
  15. data/lib/rababa/harakats.rb +1 -1
  16. data/lib/rababa/reconcile.rb +1 -33
  17. data/lib/rababa/version.rb +1 -1
  18. data/models-data/README.adoc +6 -0
  19. data/python/README.adoc +211 -0
  20. data/python/config/cbhg.yml +1 -1
  21. data/python/config/test_cbhg.yml +51 -0
  22. data/python/dataset.py +23 -31
  23. data/python/diacritization_model_to_onnx.py +216 -15
  24. data/python/diacritizer.py +35 -31
  25. data/python/log_dir/CA_MSA.base.cbhg/models/README.adoc +2 -0
  26. data/python/log_dir/README.adoc +1 -0
  27. data/python/{requirement.txt → requirements.txt} +1 -1
  28. data/python/setup.py +32 -0
  29. data/python/trainer.py +10 -4
  30. data/python/util/reconcile_original_plus_diacritized.py +2 -0
  31. data/python/util/text_cleaners.py +59 -4
  32. data/rababa.gemspec +1 -1
  33. data/test-datasets/data-arabic-pointing/{Readme.md → README.adoc} +2 -1
  34. metadata +22 -18
  35. data/.github/workflows/main.yml +0 -18
  36. data/README.md +0 -73
  37. data/lib/README.md +0 -82
  38. data/models-data/README.md +0 -6
  39. data/python/README.md +0 -163
  40. data/python/log_dir/CA_MSA.base.cbhg/models/Readme.md +0 -2
  41. data/python/log_dir/README.md +0 -1
@@ -0,0 +1,2 @@
1
+ ==== Put model trained with CA_MSA here:
2
+ 2000000-snapshot.pt
@@ -0,0 +1 @@
1
+ === Model storage directory for training and inference
@@ -1,4 +1,4 @@
1
- torch==1.7.0
1
+ torch==1.9.0
2
2
  numpy==1.19.5
3
3
  matplotlib==3.3.3
4
4
  pandas==1.1.5
data/python/setup.py ADDED
@@ -0,0 +1,32 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ setup(
4
+ name='rababa',
5
+ version='0.1.0',
6
+ description='Rababa for Arabic diacriticization',
7
+ author='Ribose',
8
+ author_email='open.source@ribose.com',
9
+ url='https://www.interscript.org',
10
+ # packages=find_packages(include=['exampleproject', 'exampleproject.*']),
11
+ python_requires='>=3.6, <4',
12
+ install_requires=[
13
+ 'torch==1.9.0',
14
+ 'numpy==1.19.5',
15
+ 'matplotlib==3.3.3',
16
+ 'pandas==1.1.5',
17
+ 'ruamel.yaml==0.16.12',
18
+ 'tensorboard==2.4.0',
19
+ 'diacritization-evaluation==0.5',
20
+ 'tqdm==4.56.0',
21
+ 'onnx==1.9.0',
22
+ 'onnxruntime==1.8.1',
23
+ 'pyyaml==5.4.1',
24
+ ],
25
+ # extras_require={'plotting': ['matplotlib>=2.2.0', 'jupyter']},
26
+ setup_requires=['pytest-runner'],
27
+ tests_require=['pytest'],
28
+ # entry_points={
29
+ # 'console_scripts': ['my-command=exampleproject.example:main']
30
+ # },
31
+ # package_data={'exampleproject': ['data/schema.json']}
32
+ )
data/python/trainer.py CHANGED
@@ -12,7 +12,7 @@ from tqdm import trange
12
12
 
13
13
  from config_manager import ConfigManager
14
14
  from dataset import load_iterators
15
- from diacritizer import CBHGDiacritizer
15
+ from diacritizer import Diacritizer
16
16
  from util.learning_rates import LearningRateDecay
17
17
  from options import OptimizerType
18
18
  from util.utils import (
@@ -51,6 +51,7 @@ class GeneralTrainer(Trainer):
51
51
  self.model = self.config_manager.get_model()
52
52
 
53
53
  self.optimizer = self.get_optimizer()
54
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
54
55
  self.model = self.model.to(self.device)
55
56
 
56
57
  self.load_model(model_path=self.config.get("train_resume_model_path"))
@@ -78,7 +79,7 @@ class GeneralTrainer(Trainer):
78
79
 
79
80
  def load_diacritizer(self):
80
81
  if self.model_kind in ["cbhg", "baseline"]:
81
- self.diacritizer = CBHGDiacritizer(self.config_path, self.model_kind)
82
+ self.diacritizer = Diacritizer(self.config_path, self.model_kind)
82
83
  else:
83
84
  print('model not found')
84
85
  exit()
@@ -195,6 +196,7 @@ class GeneralTrainer(Trainer):
195
196
  return results, summary_texts
196
197
 
197
198
  def run(self):
199
+
198
200
  scaler = torch.cuda.amp.GradScaler()
199
201
  train_iterator, _, validation_iterator = load_iterators(self.config_manager)
200
202
  print("data loaded")
@@ -337,9 +339,12 @@ class GeneralTrainer(Trainer):
337
339
 
338
340
  predictions = outputs["diacritics"].contiguous()
339
341
  targets = batch_inputs["target"].contiguous()
342
+
340
343
  predictions = predictions.view(-1, predictions.shape[-1])
341
344
  targets = targets.view(-1)
342
- loss = self.criterion(predictions.to(self.device), targets.to(self.device))
345
+
346
+ loss = self.criterion(predictions.to(self.device),
347
+ targets.to(self.device))
343
348
  outputs.update({"loss": loss})
344
349
  return outputs
345
350
 
@@ -361,7 +366,8 @@ class GeneralTrainer(Trainer):
361
366
  last_model_path = model_path
362
367
 
363
368
  print(f"loading from {last_model_path}")
364
- saved_model = torch.load(last_model_path)
369
+ saved_model = torch.load(last_model_path) if torch.cuda.is_available() \
370
+ else torch.load(last_model_path, map_location=torch.device('cpu'))
365
371
  self.model.load_state_dict(saved_model["model_state_dict"])
366
372
  if load_optimizer:
367
373
  self.optimizer.load_state_dict(saved_model["optimizer_state_dict"])
@@ -26,6 +26,7 @@ def build_pivot_map(d_original, d_diacritized):
26
26
  d_diacritized: dictionary modelling diacritized as above
27
27
  return: list of ids tuple where strings match
28
28
  """
29
+
29
30
  l_map = []
30
31
  idx_dia, idx_ori = 0, 0
31
32
  while idx_dia < len(d_diacritized):
@@ -59,6 +60,7 @@ def reconcile_strings(str_original, str_diacritized):
59
60
  str_diacritized: diacritized string
60
61
  return: reconciled string
61
62
  """
63
+
62
64
  # we model the strings as dict
63
65
  d_original = dict((i,c) for i,c in
64
66
  enumerate(list([c for c in str_original if not c in HARAQAT])))
@@ -1,6 +1,6 @@
1
1
  import re
2
- from util.constants import VALID_ARABIC
3
-
2
+ from util.constants import VALID_ARABIC, BASIC_HARAQAT, ALL_POSSIBLE_HARAQAT
3
+ from diacritization_evaluation import util
4
4
 
5
5
  _whitespace_re = re.compile(r"\s+")
6
6
 
@@ -9,13 +9,68 @@ def collapse_whitespace(text):
9
9
  text = re.sub(_whitespace_re, " ", text)
10
10
  return text
11
11
 
12
-
13
12
  def basic_cleaners(text):
14
13
  text = collapse_whitespace(text)
15
14
  return text.strip()
16
15
 
17
-
18
16
  def valid_arabic_cleaners(text):
19
17
  text = filter(lambda char: char in VALID_ARABIC, text)
20
18
  text = collapse_whitespace(''.join(list(text)))
21
19
  return text.strip()
20
+
21
+ def extract_stack(stack, correct_reversed: bool = True):
22
+ """
23
+ Given stack, we extract its content to string, and check whether this string is
24
+ available in the ALL_POSSIBLE_HARAQAT list: if not, an empty string is returned. When correct_reversed
25
+ is set, we also check the reversed order of the string, if it was not already correct.
26
+ """
27
+ char_haraqat = []
28
+ while len(stack) != 0:
29
+ char_haraqat.append(stack.pop())
30
+ full_haraqah = "".join(char_haraqat)
31
+ reversed_full_haraqah = "".join(reversed(char_haraqat))
32
+ if full_haraqah in ALL_POSSIBLE_HARAQAT:
33
+ out = full_haraqah
34
+ elif reversed_full_haraqah in ALL_POSSIBLE_HARAQAT and correct_reversed:
35
+ out = reversed_full_haraqah
36
+ else:
37
+ #raise ValueError(stack)
38
+
39
+ #raise ValueError(
40
+ # f"""The chart has the following haraqat which are not found in
41
+ #all possible haraqat: {'|'.join([ALL_POSSIBLE_HARAQAT[diacritic]
42
+ # for diacritic in full_haraqah ])}"""
43
+ #)
44
+ out = ''
45
+ return out
46
+
47
+ def extract_haraqat(text: str, correct_reversed: bool = True):
48
+ """
49
+ Args:
50
+ text (str): text to be diacritized
51
+ Returns:
52
+ text: the original text as it comes
53
+ text_list: all text that are not haraqat
54
+ haraqat_list: all haraqat_list
55
+ """
56
+ if len(text.strip()) == 0:
57
+ return text, [" "] * len(text), [""] * len(text)
58
+ stack = []
59
+ haraqat_list = []
60
+ txt_list = []
61
+ for char in text:
62
+ # if char is not a diacritic, then extract the stack and empty it
63
+ if char not in BASIC_HARAQAT.keys():
64
+ stack_content = extract_stack(stack,
65
+ correct_reversed=correct_reversed)
66
+ #if stack_content != '':
67
+ haraqat_list.append(stack_content)
68
+ txt_list.append(char)
69
+ stack = []
70
+ else:
71
+ stack.append(char)
72
+ if len(haraqat_list) > 0:
73
+ del haraqat_list[0]
74
+ haraqat_list.append(extract_stack(stack))
75
+
76
+ return text, txt_list, haraqat_list
data/rababa.gemspec CHANGED
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
  spec.summary = "Arabic diacriticizer from Interscript."
12
12
  # spec.description = "TODO: Write a longer description or delete this line."
13
13
  spec.homepage = "https://www.interscript.org"
14
- spec.required_ruby_version = Gem::Requirement.new(">= 2.4.0")
14
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
15
15
 
16
16
  spec.metadata["homepage_uri"] = spec.homepage
17
17
  spec.metadata["source_code_uri"] = "https://github.com/interscript/rababa"
@@ -1,2 +1,3 @@
1
- # Data arabic pointing:
1
+ = Data arabic pointing
2
+
2
3
  https://github.com/secryst/data-arabic-pointing
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rababa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-07-26 00:00:00.000000000 Z
11
+ date: 2021-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: onnxruntime
@@ -66,7 +66,7 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- description:
69
+ description:
70
70
  email:
71
71
  - open.source@ribose.com
72
72
  executables:
@@ -74,21 +74,23 @@ executables:
74
74
  extensions: []
75
75
  extra_rdoc_files: []
76
76
  files:
77
- - ".github/workflows/main.yml"
77
+ - ".github/workflows/python.yml"
78
+ - ".github/workflows/release.yml"
79
+ - ".github/workflows/ruby.yml"
78
80
  - ".gitignore"
79
81
  - ".rspec"
80
82
  - ".rubocop.yml"
81
83
  - CODE_OF_CONDUCT.md
82
84
  - Gemfile
83
- - README.md
85
+ - README.adoc
84
86
  - Rakefile
85
87
  - bin/console
86
88
  - bin/setup
87
89
  - config/model.yml
88
90
  - data/example.txt
89
- - docs/research-arabic-diacritization-06-2021.md
91
+ - docs/research-arabic-diacritization-06-2021.adoc
90
92
  - exe/rababa
91
- - lib/README.md
93
+ - lib/README.adoc
92
94
  - lib/rababa.rb
93
95
  - lib/rababa/arabic_constants.rb
94
96
  - lib/rababa/diacritizer.rb
@@ -96,18 +98,19 @@ files:
96
98
  - lib/rababa/harakats.rb
97
99
  - lib/rababa/reconcile.rb
98
100
  - lib/rababa/version.rb
99
- - models-data/README.md
101
+ - models-data/README.adoc
100
102
  - models-data/batch_example_data.pkl
101
- - python/README.md
103
+ - python/README.adoc
102
104
  - python/config/baseline.yml
103
105
  - python/config/cbhg.yml
106
+ - python/config/test_cbhg.yml
104
107
  - python/config_manager.py
105
108
  - python/dataset.py
106
109
  - python/diacritization_model_to_onnx.py
107
110
  - python/diacritize.py
108
111
  - python/diacritizer.py
109
- - python/log_dir/CA_MSA.base.cbhg/models/Readme.md
110
- - python/log_dir/README.md
112
+ - python/log_dir/CA_MSA.base.cbhg/models/README.adoc
113
+ - python/log_dir/README.adoc
111
114
  - python/models/baseline.py
112
115
  - python/models/cbhg.py
113
116
  - python/models/seq2seq.py
@@ -116,7 +119,8 @@ files:
116
119
  - python/modules/layers.py
117
120
  - python/modules/tacotron_modules.py
118
121
  - python/options.py
119
- - python/requirement.txt
122
+ - python/requirements.txt
123
+ - python/setup.py
120
124
  - python/test.py
121
125
  - python/tester.py
122
126
  - python/train.py
@@ -130,7 +134,7 @@ files:
130
134
  - python/util/utils.py
131
135
  - rababa.gemspec
132
136
  - test-datasets/business-cases/examples_with_coutrynames.txt
133
- - test-datasets/data-arabic-pointing/Readme.md
137
+ - test-datasets/data-arabic-pointing/README.adoc
134
138
  - test-datasets/tashkeela/test.txt
135
139
  - test-datasets/tashkeela/train.txt
136
140
  - test-datasets/tashkeela/val.txt
@@ -140,7 +144,7 @@ metadata:
140
144
  homepage_uri: https://www.interscript.org
141
145
  source_code_uri: https://github.com/interscript/rababa
142
146
  changelog_uri: https://github.com/interscript/rababa
143
- post_install_message:
147
+ post_install_message:
144
148
  rdoc_options: []
145
149
  require_paths:
146
150
  - lib
@@ -148,15 +152,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
148
152
  requirements:
149
153
  - - ">="
150
154
  - !ruby/object:Gem::Version
151
- version: 2.4.0
155
+ version: 2.5.0
152
156
  required_rubygems_version: !ruby/object:Gem::Requirement
153
157
  requirements:
154
158
  - - ">="
155
159
  - !ruby/object:Gem::Version
156
160
  version: '0'
157
161
  requirements: []
158
- rubygems_version: 3.0.3
159
- signing_key:
162
+ rubygems_version: 3.1.6
163
+ signing_key:
160
164
  specification_version: 4
161
165
  summary: Arabic diacriticizer from Interscript.
162
166
  test_files: []
@@ -1,18 +0,0 @@
1
- name: Ruby
2
-
3
- on: [push,pull_request]
4
-
5
- jobs:
6
- build:
7
- runs-on: ubuntu-latest
8
- steps:
9
- - uses: actions/checkout@v2
10
- - name: Set up Ruby
11
- uses: ruby/setup-ruby@v1
12
- with:
13
- ruby-version: 2.6.6
14
- - name: Run the default task
15
- run: |
16
- gem install bundler -v 2.2.15
17
- bundle install
18
- bundle exec rake
data/README.md DELETED
@@ -1,73 +0,0 @@
1
- # رُبابَة RABABA the Arabic Diacritization Library
2
-
3
- Arabic diacritization is useful for several practical business cases like text
4
- to speech or Romanization of Arabic texts or scripts.
5
-
6
- ## Purpose
7
-
8
- This repository contains everything to train a diacritization model in Python
9
- and run it in Python and Ruby.
10
-
11
- ## Try out Rababa
12
-
13
- Rababa can be run in both Python and Ruby. Go to the directory corresponding to the language you prefer to use. Instructions are in the READMEs, under the "Try out Rababa" section:
14
- * [Python](https://github.com/interscript/rababa/tree/master/python)
15
- * [Ruby](https://github.com/interscript/rababa/tree/master/lib)
16
-
17
- ## Library
18
-
19
- This library was built for the
20
- [Interscript project](https://www.interscript.org)
21
- ([at GitHub](https://github.com/interscript/)).
22
-
23
- Diacritization strategy is following several steps with at heart a deep learning
24
- model:
25
-
26
- 1. text preprocessing
27
- 2. neural networks model prediction
28
- 3. text postprocessing
29
-
30
- This repository contains:
31
-
32
- - [lib](https://github.com/interscript/rababa/tree/master/lib) is
33
- the Ruby library using NNet model in ONNX format.
34
-
35
- - [docs](https://github.com/interscript/rababa/tree/master/docs)
36
- contains an application focused summary of latest (2021-06) relevant papers
37
- and solutions.
38
-
39
- - [python](https://github.com/interscript/rababa/tree/master/python)
40
- - A **neural network solution** for automatised diacritization based on the
41
- work of [almodhfer](https://github.com/almodhfer/Arabic_Diacritization),
42
- from which we adopted the baseline and the more advanced and efficient CBHG
43
- models only. This very recent solution allows for efficient predictions on
44
- CPU's with a reasonable sized model.
45
-
46
- * **PyTorch to ONNX** conversion of PyTorch models to ONNX format
47
-
48
- * **Strings Pre-/Post-processing**, also from
49
- [almodhfer](https://github.com/almodhfer/Arabic_Diacritization)
50
-
51
- - [tests and benchmarking utilities](https://github.com/interscript/rababa/tree/master/tests-benchmarks),
52
- allowing to compare with other implementations.
53
-
54
- * tests are taken from
55
- [diacritization benchmarking](https://github.com/AliOsm/arabic-text-diacritization)
56
-
57
- * we have added own, realistic datasets for the problem of diacritization
58
-
59
- - **models-data** directory to store models and embeddings in various formats
60
-
61
- ## About the Name
62
-
63
- A [Rababa](https://en.wikipedia.org/wiki/Rebab) is an antique string instrument.
64
-
65
- In the same way that a Rababa produces melody from simple strings and
66
- pieces of wood, our library and diacritization gives a whole palette of colour
67
- and meanings to arabic scripts.
68
-
69
- ## Under development
70
-
71
- We are working on the following improvements:
72
- * Preprocessing for breaking down large sentences
73
- * PoS tagging and search to improve the diacritization
data/lib/README.md DELETED
@@ -1,82 +0,0 @@
1
- # Arabic Diacritization in Ruby with Rababa
2
-
3
- ## Try out Rababa
4
-
5
- * Install the Gems listed below
6
- * Download a ruby model on [releases](https://github.com/secryst/rababa-models)
7
-
8
- ### Run examples
9
-
10
- Prerequisite:
11
-
12
- * Please download the `diacritization_model_max_len_200.onnx` model file
13
- from https://github.com/secryst/rababa-models/releases/tag/0.1
14
-
15
- One can diacritize either single strings:
16
-
17
- ```sh
18
- rababa -t 'قطر' -m diacritization_model_max_len_200.onnx
19
- # or when inside the gem directory during development
20
- bundle exec exe/rababa -t 'قطر' -m diacritization_model_max_len_200.onnx
21
- ```
22
-
23
- Or files such as `data/example.txt` or your own Arabic file (the max string length
24
- is specified in the model and has to match the `max_len` parameter in
25
- `config/model.yml`):
26
-
27
- ```sh
28
- rababa -f data/example.txt -m diacritization_model_max_len_200.onnx
29
- # or when inside the gem directory during development
30
- bundle exec exe/rababa -f data/example.txt -m diacritization_model_max_len_200.onnx
31
- ```
32
-
33
- One would have to preprocess generic arabic texts for running Rababa in general.
34
- This can be done on sentences beginnings running for instance
35
- [Hamza5](https://github.com/Hamza5/Pipeline-diacritizer):
36
-
37
- ```
38
- python __main__.py preprocess source destination
39
- ```
40
-
41
- ### ONNX Models
42
-
43
- They can either be built in the `/python` repository or downloaded from the
44
- [releases](https://github.com/secryst/rababa-models).
45
-
46
- Or ONNX model can be generated running the python
47
- [code](https://github.com/interscript/rababa/blob/master/python/diacritization_model_to_onnx.py)
48
- in this library.
49
-
50
- This requires going through some of the steps described in the link above.
51
-
52
- ### Parameters
53
-
54
- * text to diacritize: "**-t**TEXT", "--text=TEXT",
55
- * path to file to diacritize: "**-f**FILE", "--text_filename=FILE",
56
- * path to ONNX model **Mandatory**: "-mMODEL", "--model_file=MODEL",
57
- * path to config file **Default:config/model.yml**: "-cCONFIG", "--config=CONFIG"
58
-
59
- ### Config
60
-
61
- #### Players:
62
-
63
- * max_len: 200 -- 600
64
- * Parameter that has to match the ONNX model built using the
65
- [code](https://github.com/interscript/rababa/blob/master/python/diacritization_model_to_onnx.py)
66
- and following the python/Readme.md.
67
- * Longer sentences will need to be preprocessed, which can be done for
68
- instance using [Hamza5](https://github.com/Hamza5)
69
- [code](https://github.com/Hamza5/Pipeline-diacritizer/blob/master/pipeline_diacritizer/pipeline_diacritizer.py).
70
- * the smaller the faster the nnets code.
71
- * text_encoder corresponding to the [rules](https://github.com/interscript/rababa/blob/master/python/util/text_encoders.py):
72
- * BasicArabicEncoder
73
- * ArabicEncoderWithStartSymbol
74
- * text_cleaner corresponding to [logics](https://github.com/interscript/rababa/blob/master/python/util/text_cleaners.py):
75
- * basic_cleaners: remove redundancy in whitespaces and strip string
76
- * valid_arabic_cleaners: basic+filter of only arabic words
77
-
78
- ### Gems
79
-
80
- ```sh
81
- gem install rababa
82
- ```