rababa 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/python.yml +81 -0
- data/.github/workflows/release.yml +36 -0
- data/.github/workflows/ruby.yml +27 -0
- data/.gitignore +3 -0
- data/.rubocop.yml +1 -1
- data/CODE_OF_CONDUCT.md +13 -13
- data/README.adoc +80 -0
- data/Rakefile +1 -1
- data/docs/{research-arabic-diacritization-06-2021.md → research-arabic-diacritization-06-2021.adoc} +52 -37
- data/exe/rababa +1 -1
- data/lib/README.adoc +95 -0
- data/lib/rababa/diacritizer.rb +16 -8
- data/lib/rababa/encoders.rb +2 -2
- data/lib/rababa/harakats.rb +1 -1
- data/lib/rababa/reconcile.rb +1 -33
- data/lib/rababa/version.rb +1 -1
- data/models-data/README.adoc +6 -0
- data/python/README.adoc +211 -0
- data/python/config/cbhg.yml +1 -1
- data/python/config/test_cbhg.yml +51 -0
- data/python/dataset.py +23 -31
- data/python/diacritization_model_to_onnx.py +216 -15
- data/python/diacritizer.py +35 -31
- data/python/log_dir/CA_MSA.base.cbhg/models/README.adoc +2 -0
- data/python/log_dir/README.adoc +1 -0
- data/python/{requirement.txt → requirements.txt} +1 -1
- data/python/setup.py +32 -0
- data/python/trainer.py +10 -4
- data/python/util/reconcile_original_plus_diacritized.py +2 -0
- data/python/util/text_cleaners.py +59 -4
- data/rababa.gemspec +1 -1
- data/test-datasets/data-arabic-pointing/{Readme.md → README.adoc} +2 -1
- metadata +22 -18
- data/.github/workflows/main.yml +0 -18
- data/README.md +0 -73
- data/lib/README.md +0 -82
- data/models-data/README.md +0 -6
- data/python/README.md +0 -163
- data/python/log_dir/CA_MSA.base.cbhg/models/Readme.md +0 -2
- data/python/log_dir/README.md +0 -1
@@ -0,0 +1 @@
|
|
1
|
+
=== Model storage directory for training and inference
|
data/python/setup.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
from setuptools import setup, find_packages
|
2
|
+
|
3
|
+
# Runtime dependencies, each pinned to an exact version.
INSTALL_REQUIRES = [
    'torch==1.9.0',
    'numpy==1.19.5',
    'matplotlib==3.3.3',
    'pandas==1.1.5',
    'ruamel.yaml==0.16.12',
    'tensorboard==2.4.0',
    'diacritization-evaluation==0.5',
    'tqdm==4.56.0',
    'onnx==1.9.0',
    'onnxruntime==1.8.1',
    'pyyaml==5.4.1',
]

# NOTE(review): no `packages` / `py_modules` argument is passed, so
# `find_packages` is imported but unused -- confirm which modules are
# meant to be shipped by this distribution.
setup(
    name='rababa',
    version='0.1.0',
    description='Rababa for Arabic diacriticization',
    author='Ribose',
    author_email='open.source@ribose.com',
    url='https://www.interscript.org',
    python_requires='>=3.6, <4',
    install_requires=INSTALL_REQUIRES,
    # test tooling hooks
    setup_requires=['pytest-runner'],
    tests_require=['pytest'],
)
|
data/python/trainer.py
CHANGED
@@ -12,7 +12,7 @@ from tqdm import trange
|
|
12
12
|
|
13
13
|
from config_manager import ConfigManager
|
14
14
|
from dataset import load_iterators
|
15
|
-
from diacritizer import
|
15
|
+
from diacritizer import Diacritizer
|
16
16
|
from util.learning_rates import LearningRateDecay
|
17
17
|
from options import OptimizerType
|
18
18
|
from util.utils import (
|
@@ -51,6 +51,7 @@ class GeneralTrainer(Trainer):
|
|
51
51
|
self.model = self.config_manager.get_model()
|
52
52
|
|
53
53
|
self.optimizer = self.get_optimizer()
|
54
|
+
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
54
55
|
self.model = self.model.to(self.device)
|
55
56
|
|
56
57
|
self.load_model(model_path=self.config.get("train_resume_model_path"))
|
@@ -78,7 +79,7 @@ class GeneralTrainer(Trainer):
|
|
78
79
|
|
79
80
|
def load_diacritizer(self):
|
80
81
|
if self.model_kind in ["cbhg", "baseline"]:
|
81
|
-
self.diacritizer =
|
82
|
+
self.diacritizer = Diacritizer(self.config_path, self.model_kind)
|
82
83
|
else:
|
83
84
|
print('model not found')
|
84
85
|
exit()
|
@@ -195,6 +196,7 @@ class GeneralTrainer(Trainer):
|
|
195
196
|
return results, summary_texts
|
196
197
|
|
197
198
|
def run(self):
|
199
|
+
|
198
200
|
scaler = torch.cuda.amp.GradScaler()
|
199
201
|
train_iterator, _, validation_iterator = load_iterators(self.config_manager)
|
200
202
|
print("data loaded")
|
@@ -337,9 +339,12 @@ class GeneralTrainer(Trainer):
|
|
337
339
|
|
338
340
|
predictions = outputs["diacritics"].contiguous()
|
339
341
|
targets = batch_inputs["target"].contiguous()
|
342
|
+
|
340
343
|
predictions = predictions.view(-1, predictions.shape[-1])
|
341
344
|
targets = targets.view(-1)
|
342
|
-
|
345
|
+
|
346
|
+
loss = self.criterion(predictions.to(self.device),
|
347
|
+
targets.to(self.device))
|
343
348
|
outputs.update({"loss": loss})
|
344
349
|
return outputs
|
345
350
|
|
@@ -361,7 +366,8 @@ class GeneralTrainer(Trainer):
|
|
361
366
|
last_model_path = model_path
|
362
367
|
|
363
368
|
print(f"loading from {last_model_path}")
|
364
|
-
saved_model = torch.load(last_model_path)
|
369
|
+
saved_model = torch.load(last_model_path) if torch.cuda.is_available() \
|
370
|
+
else torch.load(last_model_path, map_location=torch.device('cpu'))
|
365
371
|
self.model.load_state_dict(saved_model["model_state_dict"])
|
366
372
|
if load_optimizer:
|
367
373
|
self.optimizer.load_state_dict(saved_model["optimizer_state_dict"])
|
@@ -26,6 +26,7 @@ def build_pivot_map(d_original, d_diacritized):
|
|
26
26
|
d_diacritized: dictionary modelling diacritized as above
|
27
27
|
return: list of ids tuple where strings match
|
28
28
|
"""
|
29
|
+
|
29
30
|
l_map = []
|
30
31
|
idx_dia, idx_ori = 0, 0
|
31
32
|
while idx_dia < len(d_diacritized):
|
@@ -59,6 +60,7 @@ def reconcile_strings(str_original, str_diacritized):
|
|
59
60
|
str_diacritized: diacritized string
|
60
61
|
return: reconciled string
|
61
62
|
"""
|
63
|
+
|
62
64
|
# we model the strings as dict
|
63
65
|
d_original = dict((i,c) for i,c in
|
64
66
|
enumerate(list([c for c in str_original if not c in HARAQAT])))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import re
|
2
|
-
from util.constants import VALID_ARABIC
|
3
|
-
|
2
|
+
from util.constants import VALID_ARABIC, BASIC_HARAQAT, ALL_POSSIBLE_HARAQAT
|
3
|
+
from diacritization_evaluation import util
|
4
4
|
|
5
5
|
_whitespace_re = re.compile(r"\s+")
|
6
6
|
|
@@ -9,13 +9,68 @@ def collapse_whitespace(text):
|
|
9
9
|
text = re.sub(_whitespace_re, " ", text)
|
10
10
|
return text
|
11
11
|
|
12
|
-
|
13
12
|
def basic_cleaners(text):
    """Collapse every whitespace run in `text` and strip both ends."""
    return collapse_whitespace(text).strip()
|
16
15
|
|
17
|
-
|
18
16
|
def valid_arabic_cleaners(text):
    """Keep only characters found in VALID_ARABIC, then collapse
    whitespace runs and strip both ends."""
    kept = ''.join(ch for ch in text if ch in VALID_ARABIC)
    return collapse_whitespace(kept).strip()
|
20
|
+
|
21
|
+
def extract_stack(stack, correct_reversed: bool = True):
    """
    Empty `stack` (it is popped until nothing is left) and join the popped
    characters into one string. That string is returned if it is found in
    ALL_POSSIBLE_HARAQAT. Otherwise, when correct_reversed is set, the same
    characters in reverse order are tried. If neither form is known, an empty
    string is returned (raising ValueError here is currently disabled).
    """
    popped = []
    while stack:
        popped.append(stack.pop())

    candidate = "".join(popped)
    if candidate in ALL_POSSIBLE_HARAQAT:
        return candidate

    flipped = "".join(popped[::-1])
    if correct_reversed and flipped in ALL_POSSIBLE_HARAQAT:
        return flipped

    # unknown combination: silently dropped
    return ''
|
46
|
+
|
47
|
+
def extract_haraqat(text: str, correct_reversed: bool = True):
    """
    Split a text into its non-haraqat characters and the haraqat that
    follow each of them.

    Args:
        text (str): text to split
        correct_reversed (bool): forwarded to extract_stack for every group
            of haraqat, including the group that ends the text
    Returns:
        text: the original text as it comes
        txt_list: all characters that are not haraqat
        haraqat_list: the haraqat collected after each character, as
            returned by extract_stack
    """
    if len(text.strip()) == 0:
        # empty or whitespace-only input: one blank and one empty haraqah
        # per input character
        return text, [" "] * len(text), [""] * len(text)
    stack = []
    haraqat_list = []
    txt_list = []
    for char in text:
        if char not in BASIC_HARAQAT:
            # a non-diacritic character closes the group of haraqat gathered
            # so far; that group belongs to the previous character
            haraqat_list.append(
                extract_stack(stack, correct_reversed=correct_reversed))
            txt_list.append(char)
            stack = []
        else:
            stack.append(char)
    if len(haraqat_list) > 0:
        # the first entry was emitted before any character had been seen
        del haraqat_list[0]
    # haraqat of the last character; pass correct_reversed through instead of
    # silently falling back to extract_stack's default
    haraqat_list.append(
        extract_stack(stack, correct_reversed=correct_reversed))

    return text, txt_list, haraqat_list
|
data/rababa.gemspec
CHANGED
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
|
|
11
11
|
spec.summary = "Arabic diacriticizer from Interscript."
|
12
12
|
# spec.description = "TODO: Write a longer description or delete this line."
|
13
13
|
spec.homepage = "https://www.interscript.org"
|
14
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 2.
|
14
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
|
15
15
|
|
16
16
|
spec.metadata["homepage_uri"] = spec.homepage
|
17
17
|
spec.metadata["source_code_uri"] = "https://github.com/interscript/rababa"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rababa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: onnxruntime
|
@@ -66,7 +66,7 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
-
description:
|
69
|
+
description:
|
70
70
|
email:
|
71
71
|
- open.source@ribose.com
|
72
72
|
executables:
|
@@ -74,21 +74,23 @@ executables:
|
|
74
74
|
extensions: []
|
75
75
|
extra_rdoc_files: []
|
76
76
|
files:
|
77
|
-
- ".github/workflows/
|
77
|
+
- ".github/workflows/python.yml"
|
78
|
+
- ".github/workflows/release.yml"
|
79
|
+
- ".github/workflows/ruby.yml"
|
78
80
|
- ".gitignore"
|
79
81
|
- ".rspec"
|
80
82
|
- ".rubocop.yml"
|
81
83
|
- CODE_OF_CONDUCT.md
|
82
84
|
- Gemfile
|
83
|
-
- README.
|
85
|
+
- README.adoc
|
84
86
|
- Rakefile
|
85
87
|
- bin/console
|
86
88
|
- bin/setup
|
87
89
|
- config/model.yml
|
88
90
|
- data/example.txt
|
89
|
-
- docs/research-arabic-diacritization-06-2021.
|
91
|
+
- docs/research-arabic-diacritization-06-2021.adoc
|
90
92
|
- exe/rababa
|
91
|
-
- lib/README.
|
93
|
+
- lib/README.adoc
|
92
94
|
- lib/rababa.rb
|
93
95
|
- lib/rababa/arabic_constants.rb
|
94
96
|
- lib/rababa/diacritizer.rb
|
@@ -96,18 +98,19 @@ files:
|
|
96
98
|
- lib/rababa/harakats.rb
|
97
99
|
- lib/rababa/reconcile.rb
|
98
100
|
- lib/rababa/version.rb
|
99
|
-
- models-data/README.
|
101
|
+
- models-data/README.adoc
|
100
102
|
- models-data/batch_example_data.pkl
|
101
|
-
- python/README.
|
103
|
+
- python/README.adoc
|
102
104
|
- python/config/baseline.yml
|
103
105
|
- python/config/cbhg.yml
|
106
|
+
- python/config/test_cbhg.yml
|
104
107
|
- python/config_manager.py
|
105
108
|
- python/dataset.py
|
106
109
|
- python/diacritization_model_to_onnx.py
|
107
110
|
- python/diacritize.py
|
108
111
|
- python/diacritizer.py
|
109
|
-
- python/log_dir/CA_MSA.base.cbhg/models/
|
110
|
-
- python/log_dir/README.
|
112
|
+
- python/log_dir/CA_MSA.base.cbhg/models/README.adoc
|
113
|
+
- python/log_dir/README.adoc
|
111
114
|
- python/models/baseline.py
|
112
115
|
- python/models/cbhg.py
|
113
116
|
- python/models/seq2seq.py
|
@@ -116,7 +119,8 @@ files:
|
|
116
119
|
- python/modules/layers.py
|
117
120
|
- python/modules/tacotron_modules.py
|
118
121
|
- python/options.py
|
119
|
-
- python/
|
122
|
+
- python/requirements.txt
|
123
|
+
- python/setup.py
|
120
124
|
- python/test.py
|
121
125
|
- python/tester.py
|
122
126
|
- python/train.py
|
@@ -130,7 +134,7 @@ files:
|
|
130
134
|
- python/util/utils.py
|
131
135
|
- rababa.gemspec
|
132
136
|
- test-datasets/business-cases/examples_with_coutrynames.txt
|
133
|
-
- test-datasets/data-arabic-pointing/
|
137
|
+
- test-datasets/data-arabic-pointing/README.adoc
|
134
138
|
- test-datasets/tashkeela/test.txt
|
135
139
|
- test-datasets/tashkeela/train.txt
|
136
140
|
- test-datasets/tashkeela/val.txt
|
@@ -140,7 +144,7 @@ metadata:
|
|
140
144
|
homepage_uri: https://www.interscript.org
|
141
145
|
source_code_uri: https://github.com/interscript/rababa
|
142
146
|
changelog_uri: https://github.com/interscript/rababa
|
143
|
-
post_install_message:
|
147
|
+
post_install_message:
|
144
148
|
rdoc_options: []
|
145
149
|
require_paths:
|
146
150
|
- lib
|
@@ -148,15 +152,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
148
152
|
requirements:
|
149
153
|
- - ">="
|
150
154
|
- !ruby/object:Gem::Version
|
151
|
-
version: 2.
|
155
|
+
version: 2.5.0
|
152
156
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
153
157
|
requirements:
|
154
158
|
- - ">="
|
155
159
|
- !ruby/object:Gem::Version
|
156
160
|
version: '0'
|
157
161
|
requirements: []
|
158
|
-
rubygems_version: 3.
|
159
|
-
signing_key:
|
162
|
+
rubygems_version: 3.1.6
|
163
|
+
signing_key:
|
160
164
|
specification_version: 4
|
161
165
|
summary: Arabic diacriticizer from Interscript.
|
162
166
|
test_files: []
|
data/.github/workflows/main.yml
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
name: Ruby
|
2
|
-
|
3
|
-
on: [push,pull_request]
|
4
|
-
|
5
|
-
jobs:
|
6
|
-
build:
|
7
|
-
runs-on: ubuntu-latest
|
8
|
-
steps:
|
9
|
-
- uses: actions/checkout@v2
|
10
|
-
- name: Set up Ruby
|
11
|
-
uses: ruby/setup-ruby@v1
|
12
|
-
with:
|
13
|
-
ruby-version: 2.6.6
|
14
|
-
- name: Run the default task
|
15
|
-
run: |
|
16
|
-
gem install bundler -v 2.2.15
|
17
|
-
bundle install
|
18
|
-
bundle exec rake
|
data/README.md
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
# رُبابَة RABABA the Arabic Diacritization Library
|
2
|
-
|
3
|
-
Arabic diacritization is useful for several practical business cases like text
|
4
|
-
to speech or Romanization of Arabic texts or scripts.
|
5
|
-
|
6
|
-
## Purpose
|
7
|
-
|
8
|
-
This repository contains everything to train a diacritization model in Python
|
9
|
-
and run it in Python and Ruby.
|
10
|
-
|
11
|
-
## Try out Rababa
|
12
|
-
|
13
|
-
Rababa can be run in both Python and Ruby. Go to the directory corresponding to the language you prefer to use. Instructions are in each README, under the "Try out Rababa" section:
|
14
|
-
* [Python](https://github.com/interscript/rababa/tree/master/python)
|
15
|
-
* [Ruby](https://github.com/interscript/rababa/tree/master/lib)
|
16
|
-
|
17
|
-
## Library
|
18
|
-
|
19
|
-
This library was built for the
|
20
|
-
[Interscript project](https://www.interscript.org)
|
21
|
-
([at GitHub](https://github.com/interscript/)).
|
22
|
-
|
23
|
-
Diacritization strategy is following several steps with at heart a deep learning
|
24
|
-
model:
|
25
|
-
|
26
|
-
1. text preprocessing
|
27
|
-
2. neural networks model prediction
|
28
|
-
3. text postprocessing
|
29
|
-
|
30
|
-
This repository contains:
|
31
|
-
|
32
|
-
- [lib](https://github.com/interscript/rababa/tree/master/lib) is
|
33
|
-
the Ruby library using NNet model in ONNX format.
|
34
|
-
|
35
|
-
- [docs](https://github.com/interscript/rababa/tree/master/docs)
|
36
|
-
contains an application focused summary of latest (2021-06) relevant papers
|
37
|
-
and solutions.
|
38
|
-
|
39
|
-
- [python](https://github.com/interscript/rababa/tree/master/python)
|
40
|
-
- A **neural network solution** for automatised diacritization based on the
|
41
|
-
work of [almodhfer](https://github.com/almodhfer/Arabic_Diacritization),
|
42
|
-
from which we took over the baseline and the more advanced and efficient CBHG
|
43
|
-
models only. This very recent solution allows for efficient predictions on
|
44
|
-
CPU's with a reasonable sized model.
|
45
|
-
|
46
|
-
* **PyTorch to ONNX** conversion of PyTorch to ONNX format
|
47
|
-
|
48
|
-
* **Strings Pre-/Post-processing**, also from
|
49
|
-
[almodhfer](https://github.com/almodhfer/Arabic_Diacritization)
|
50
|
-
|
51
|
-
- [tests and benchmarking utilities](https://github.com/interscript/rababa/tree/master/tests-benchmarks),
|
52
|
-
allowing to compare with other implementations.
|
53
|
-
|
54
|
-
* tests are taken from
|
55
|
-
[diacritization benchmarking](https://github.com/AliOsm/arabic-text-diacritization)
|
56
|
-
|
57
|
-
* we have added own, realistic datasets for the problem of diacritization
|
58
|
-
|
59
|
-
- **models-data** directory to store models and embeddings in various formats
|
60
|
-
|
61
|
-
## About the Name
|
62
|
-
|
63
|
-
A [Rababa](https://en.wikipedia.org/wiki/Rebab) is an antique string instrument.
|
64
|
-
|
65
|
-
In the same way that a Rababa produces melody from simple strings and
|
66
|
-
pieces of wood, our library and diacritization gives a whole palette of colour
|
67
|
-
and meanings to arabic scripts.
|
68
|
-
|
69
|
-
## Under development
|
70
|
-
|
71
|
-
We are working on the following improvements:
|
72
|
-
* Preprocessing for breaking down large sentences
|
73
|
-
* PoS tagging and search to improve the diacritization
|
data/lib/README.md
DELETED
@@ -1,82 +0,0 @@
|
|
1
|
-
# Arabic Diacritization in Ruby with Rababa
|
2
|
-
|
3
|
-
## Try out Rababa
|
4
|
-
|
5
|
-
* Install the Gems listed below
|
6
|
-
* Download a ruby model on [releases](https://github.com/secryst/rababa-models)
|
7
|
-
|
8
|
-
### Run examples
|
9
|
-
|
10
|
-
Prerequisite:
|
11
|
-
|
12
|
-
* Please download the `diacritization_model_max_len_200.onnx` model file
|
13
|
-
from https://github.com/secryst/rababa-models/releases/tag/0.1
|
14
|
-
|
15
|
-
One can diacritize either single strings:
|
16
|
-
|
17
|
-
```sh
|
18
|
-
rababa -t 'قطر' -m diacritization_model_max_len_200.onnx
|
19
|
-
# or when inside the gem directory during development
|
20
|
-
bundle exec exe/rababa -t 'قطر' -m diacritization_model_max_len_200.onnx
|
21
|
-
```
|
22
|
-
|
23
|
-
Or files such as `data/example.txt` or your own Arabic file (the max string length
|
24
|
-
is specified in the model and has to match the `max_len` parameter in
|
25
|
-
`config/model.yml`):
|
26
|
-
|
27
|
-
```sh
|
28
|
-
rababa -f data/example.txt -m diacritization_model_max_len_200.onnx
|
29
|
-
# or when inside the gem directory during development
|
30
|
-
bundle exec exe/rababa -f data/example.txt -m diacritization_model_max_len_200.onnx
|
31
|
-
```
|
32
|
-
|
33
|
-
One would have to preprocess generic arabic texts for running Rababa in general.
|
34
|
-
This can be done on sentences beginnings running for instance
|
35
|
-
[Hamza5](https://github.com/Hamza5/Pipeline-diacritizer):
|
36
|
-
|
37
|
-
```
|
38
|
-
python __main__.py preprocess source destination
|
39
|
-
```
|
40
|
-
|
41
|
-
### ONNX Models
|
42
|
-
|
43
|
-
They can either be built in the `/python` repository or downloaded from the
|
44
|
-
[releases](https://github.com/secryst/rababa-models).
|
45
|
-
|
46
|
-
Or ONNX model can be generated running the python
|
47
|
-
[code](https://github.com/interscript/rababa/blob/master/python/diacritization_model_to_onnx.py)
|
48
|
-
in this library.
|
49
|
-
|
50
|
-
It requires to go through some of the steps described in the link above.
|
51
|
-
|
52
|
-
### Parameters
|
53
|
-
|
54
|
-
* text to diacritize: "**-t**TEXT", "--text=TEXT",
|
55
|
-
* path to file to diacritize: "**-f**FILE", "--text_filename=FILE",
|
56
|
-
* path to ONNX model **Mandatory**: "-mMODEL", "--model_file=MODEL",
|
57
|
-
* path to config file **Default:config/model.yml**: "-cCONFIG", "--config=CONFIG"
|
58
|
-
|
59
|
-
### Config
|
60
|
-
|
61
|
-
#### Players:
|
62
|
-
|
63
|
-
* max_len: 200 -- 600
|
64
|
-
* Parameter that has to match the ONNX model built using the
|
65
|
-
[code](https://github.com/interscript/rababa/blob/master/python/diacritization_model_to_onnx.py)
|
66
|
-
and following the python/Readme.md.
|
67
|
-
* Longer sentences will need to be preprocessed, which can be done for
|
68
|
-
instance using [Hamza5](https://github.com/Hamza5)
|
69
|
-
[code](https://github.com/Hamza5/Pipeline-diacritizer/blob/master/pipeline_diacritizer/pipeline_diacritizer.py).
|
70
|
-
* the smaller the faster the nnets code.
|
71
|
-
* text_encoder corresponding to the [rules](https://github.com/interscript/rababa/blob/master/python/util/text_encoders.py):
|
72
|
-
* BasicArabicEncoder
|
73
|
-
* ArabicEncoderWithStartSymbol
|
74
|
-
* text_cleaner corresponding to [logics](https://github.com/interscript/rababa/blob/master/python/util/text_cleaners.py):
|
75
|
-
* basic_cleaners: remove redundancy in whitespaces and strip string
|
76
|
-
* valid_arabic_cleaners: basic+filter of only arabic words
|
77
|
-
|
78
|
-
### Gems
|
79
|
-
|
80
|
-
```sh
|
81
|
-
gem install rababa
|
82
|
-
```
|