rababa 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/python.yml +81 -0
  3. data/.github/workflows/release.yml +36 -0
  4. data/.github/workflows/ruby.yml +27 -0
  5. data/.gitignore +3 -0
  6. data/.rubocop.yml +1 -1
  7. data/CODE_OF_CONDUCT.md +13 -13
  8. data/README.adoc +80 -0
  9. data/Rakefile +1 -1
  10. data/docs/{research-arabic-diacritization-06-2021.md → research-arabic-diacritization-06-2021.adoc} +52 -37
  11. data/exe/rababa +1 -1
  12. data/lib/README.adoc +95 -0
  13. data/lib/rababa/diacritizer.rb +16 -8
  14. data/lib/rababa/encoders.rb +2 -2
  15. data/lib/rababa/harakats.rb +1 -1
  16. data/lib/rababa/reconcile.rb +1 -33
  17. data/lib/rababa/version.rb +1 -1
  18. data/models-data/README.adoc +6 -0
  19. data/python/README.adoc +211 -0
  20. data/python/config/cbhg.yml +1 -1
  21. data/python/config/test_cbhg.yml +51 -0
  22. data/python/dataset.py +23 -31
  23. data/python/diacritization_model_to_onnx.py +216 -15
  24. data/python/diacritizer.py +35 -31
  25. data/python/log_dir/CA_MSA.base.cbhg/models/README.adoc +2 -0
  26. data/python/log_dir/README.adoc +1 -0
  27. data/python/{requirement.txt → requirements.txt} +1 -1
  28. data/python/setup.py +32 -0
  29. data/python/trainer.py +10 -4
  30. data/python/util/reconcile_original_plus_diacritized.py +2 -0
  31. data/python/util/text_cleaners.py +59 -4
  32. data/rababa.gemspec +1 -1
  33. data/test-datasets/data-arabic-pointing/{Readme.md → README.adoc} +2 -1
  34. metadata +22 -18
  35. data/.github/workflows/main.yml +0 -18
  36. data/README.md +0 -73
  37. data/lib/README.md +0 -82
  38. data/models-data/README.md +0 -6
  39. data/python/README.md +0 -163
  40. data/python/log_dir/CA_MSA.base.cbhg/models/Readme.md +0 -2
  41. data/python/log_dir/README.md +0 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d171ba914bf49b5ff592722ac42382737a431883afc4220dcc0a1b785c3b5273
4
- data.tar.gz: fc0b1db20509b60d5bac3819705f2c8591ab1b596996190a51a56f8b1094e3a5
3
+ metadata.gz: 0fe110940a4f0173f919bcf2f8e9d33e1dcd21ac775e52619e11cc37860b17cb
4
+ data.tar.gz: 750defe96bdc852a066585c7f713daf7b17dc5f6509dfaf567bafb5797b9929b
5
5
  SHA512:
6
- metadata.gz: 46ea6eb725f2460116ef229175ac3d52287a42b0b20b9e47c6802e2c7bde06a98acae23118bd338d7ad68d600a4e5d8116e4dedc3c4657e5263c5ff854a8f182
7
- data.tar.gz: '019241258b1e1d346458aebd1a21c309220fe3ced90246e90852adbc012a5e21b5bd6f96a646a509d6685646aacbd7b71a26619371b2e4ff53141c07ab88db53'
6
+ metadata.gz: 380fa14e57e3fba948d609987e7e076c53d5e9c8492f7219b77d7b51a538122091b0c5176585b1bbf9be5bf63f0e17ff1fa0d2510063ea6b2c656fd50be02476
7
+ data.tar.gz: 441cc2614664238a6ba2230f4e2a13e239ee7cf8dfcc0ce8b21c8995f704ea40d985fea22a8dcea1e58d8f5ecf980101e64c45234a4a945046dfb42bbfc71b65
@@ -0,0 +1,81 @@
1
+ name: python
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+
8
+ jobs:
9
+ infer:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ python-version: ['3.6', '3.7', '3.8', '3.9']
15
+
16
+ steps:
17
+ - uses: actions/checkout@v2
18
+ - uses: actions/setup-python@v2
19
+ with:
20
+ python-version: ${{ matrix.python-version }}
21
+
22
+ - uses: actions/cache@v2
23
+ with:
24
+ path: ${{ env.pythonLocation }}
25
+ key: ${{ env.pythonLocation }}-${{ hashFiles('python/setup.py') }}-${{ hashFiles('python/requirements.txt') }}
26
+
27
+ - name: Install requirements
28
+ working-directory: ./python
29
+ run: |
30
+ pip install --upgrade --upgrade-strategy eager -r requirements.txt -e .
31
+
32
+ - name: Download PyTorch model
33
+ working-directory: ./python
34
+ run: |
35
+ curl -sSL https://github.com/secryst/rababa-models/releases/download/0.1/2000000-snapshot.pt \
36
+ -o log_dir/CA_MSA.base.cbhg/models/2000000-snapshot.pt
37
+
38
+ - name: Run diacriticization
39
+ working-directory: ./python
40
+ run: |
41
+ python diacritize.py --model_kind "cbhg" --config config/cbhg.yml --text 'قطر'
42
+
43
+ train:
44
+ runs-on: ubuntu-latest
45
+ strategy:
46
+ fail-fast: false
47
+ matrix:
48
+ python-version: ['3.6', '3.7', '3.8', '3.9']
49
+
50
+ steps:
51
+ - uses: actions/checkout@v2
52
+ - uses: actions/setup-python@v2
53
+ with:
54
+ python-version: ${{ matrix.python-version }}
55
+
56
+ - uses: actions/cache@v2
57
+ with:
58
+ path: ${{ env.pythonLocation }}
59
+ key: ${{ env.pythonLocation }}-${{ hashFiles('python/setup.py') }}-${{ hashFiles('python/requirements.txt') }}
60
+
61
+ - name: Install requirements
62
+ working-directory: ./python
63
+ run: |
64
+ pip install --upgrade --upgrade-strategy eager -r requirements.txt -e .
65
+
66
+ - name: Prepare dataset
67
+ working-directory: ./python
68
+ run: |
69
+ mkdir -p data/CA_MSA
70
+ touch data/CA_MSA/{eval,train,test}.csv
71
+ cd data
72
+ curl -sSL https://github.com/interscript/rababa-tashkeela/archive/refs/tags/v1.0.zip -o tashkeela.zip
73
+ unzip tashkeela.zip
74
+ for d in `ls rababa-tashkeela-1.0/tashkeela_val/*`; do cat $d >> CA_MSA/eval.csv; done
75
+ for d in `ls rababa-tashkeela-1.0/tashkeela_train/*`; do cat $d >> CA_MSA/train.csv; done
76
+ for d in `ls rababa-tashkeela-1.0/tashkeela_test/*`; do cat $d >> CA_MSA/test.csv; done
77
+
78
+ - name: Try training (WIP)
79
+ working-directory: ./python
80
+ run: |
81
+ python train.py --model "cbhg" --config config/test_cbhg.yml
@@ -0,0 +1,36 @@
1
+ name: release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+
8
+ jobs:
9
+ release:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+
14
+ - uses: actions/setup-ruby@v1
15
+ with:
16
+ ruby-version: '2.7'
17
+ architecture: 'x64'
18
+
19
+ - run: bundle install --jobs 4 --retry 3
20
+
21
+ - name: Test the Ruby package
22
+ run: bundle exec rake
23
+
24
+ - name: Publish to rubygems.org
25
+ env:
26
+ RUBYGEMS_API_KEY: ${{secrets.INTERSCRIPT_RUBYGEMS_API_KEY}}
27
+ run: |
28
+ gem install gem-release
29
+ touch ~/.gem/credentials
30
+ cat > ~/.gem/credentials << EOF
31
+ ---
32
+ :rubygems_api_key: ${RUBYGEMS_API_KEY}
33
+ EOF
34
+ chmod 0600 ~/.gem/credentials
35
+ git status
36
+ gem release
@@ -0,0 +1,27 @@
1
+ name: ruby
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ ruby-version: ['2.6', '2.7', '3.0']
15
+
16
+ steps:
17
+ - uses: actions/checkout@v2
18
+
19
+ - name: Set up Ruby
20
+ uses: ruby/setup-ruby@v1
21
+ with:
22
+ ruby-version: ${{ matrix.ruby-version }}
23
+ bundler-cache: true
24
+
25
+ - name: Run rake
26
+ run: |
27
+ bundle exec rake
data/.gitignore CHANGED
@@ -9,3 +9,6 @@
9
9
 
10
10
  # rspec failure tracking
11
11
  .rspec_status
12
+
13
+ *.onnx
14
+ Gemfile.lock
data/.rubocop.yml CHANGED
@@ -1,5 +1,5 @@
1
1
  AllCops:
2
- TargetRubyVersion: 2.4
2
+ TargetRubyVersion: 2.5
3
3
 
4
4
  Style/StringLiterals:
5
5
  Enabled: true
data/CODE_OF_CONDUCT.md CHANGED
@@ -1,12 +1,12 @@
1
- # Contributor Covenant Code of Conduct
1
+ = Contributor Covenant Code of Conduct
2
2
 
3
- ## Our Pledge
3
+ == Our Pledge
4
4
 
5
5
  We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
6
6
 
7
7
  We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.
8
8
 
9
- ## Our Standards
9
+ == Our Standards
10
10
 
11
11
  Examples of behavior that contributes to a positive environment for our community include:
12
12
 
@@ -27,56 +27,56 @@ Examples of unacceptable behavior include:
27
27
  * Other conduct which could reasonably be considered inappropriate in a
28
28
  professional setting
29
29
 
30
- ## Enforcement Responsibilities
30
+ == Enforcement Responsibilities
31
31
 
32
32
  Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful.
33
33
 
34
34
  Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate.
35
35
 
36
- ## Scope
36
+ == Scope
37
37
 
38
38
  This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event.
39
39
 
40
- ## Enforcement
40
+ == Enforcement
41
41
 
42
42
  Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at ronald.tse@ribose.com. All complaints will be reviewed and investigated promptly and fairly.
43
43
 
44
44
  All community leaders are obligated to respect the privacy and security of the reporter of any incident.
45
45
 
46
- ## Enforcement Guidelines
46
+ == Enforcement Guidelines
47
47
 
48
48
  Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct:
49
49
 
50
- ### 1. Correction
50
+ === 1. Correction
51
51
 
52
52
  **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community.
53
53
 
54
54
  **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested.
55
55
 
56
- ### 2. Warning
56
+ === 2. Warning
57
57
 
58
58
  **Community Impact**: A violation through a single incident or series of actions.
59
59
 
60
60
  **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban.
61
61
 
62
- ### 3. Temporary Ban
62
+ === 3. Temporary Ban
63
63
 
64
64
  **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior.
65
65
 
66
66
  **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban.
67
67
 
68
- ### 4. Permanent Ban
68
+ === 4. Permanent Ban
69
69
 
70
70
  **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals.
71
71
 
72
72
  **Consequence**: A permanent ban from any sort of public interaction within the community.
73
73
 
74
- ## Attribution
74
+ == Attribution
75
75
 
76
76
  This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0,
77
77
  available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
78
78
 
79
- Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity).
79
+ Community Impact Guidelines were inspired by https://github.com/mozilla/diversity[Mozilla's code of conduct enforcement ladder].
80
80
 
81
81
  [homepage]: https://www.contributor-covenant.org
82
82
 
data/README.adoc ADDED
@@ -0,0 +1,80 @@
1
+ = رُبابَة RABABA the Arabic Diacritization Library
2
+
3
+ Arabic diacritization is useful for several practical business cases like text
4
+ to speech or Romanization of Arabic texts or scripts.
5
+
6
+ == Purpose
7
+
8
+ This repository contains everything to train a diacritization model in Python
9
+ and run it in Python and Ruby.
10
+
11
+ == Try out Rababa
12
+
13
+ Rababa can be run both in Python and Ruby. Go to the directory corresponding to the
14
+ language you prefer to use.
15
+
16
+ Please see the following READMEs, under the "`Try out Rababa`" section:
17
+
18
+ * https://github.com/interscript/rababa/tree/main/python[Python]
19
+ * https://github.com/interscript/rababa/tree/main/lib[Ruby]
20
+
21
+ == Library
22
+
23
+ This library was built for the
24
+ https://www.interscript.org[Interscript project]
25
+ (https://github.com/interscript/[at GitHub]).
26
+
27
+ The diacritization strategy follows several steps, with, at its heart, a deep learning
28
+ model:
29
+
30
+ . text preprocessing
31
+ . neural networks model prediction
32
+ . text postprocessing
33
+
34
+ This repository contains:
35
+
36
+ * https://github.com/interscript/rababa/tree/main/lib[lib] is
37
+ the Ruby library using NNet model in ONNX format.
38
+
39
+ * https://github.com/interscript/rababa/tree/main/docs[docs]
40
+ contains an application-focused summary of the latest (2021-06) relevant papers
41
+ and solutions.
42
+
43
+ * https://github.com/interscript/rababa/tree/main/python[python]
44
+
45
+ ** A *neural network solution* for automatised diacritization based on the
46
+ work of https://github.com/almodhfer/Arabic_Diacritization[almodhfer],
47
+ from which we adopted the baseline and the more advanced and efficient CBHG
48
+ models only. This very recent solution allows for efficient predictions on
49
+ CPUs with a reasonably sized model.
50
+
51
+ ** **PyTorch to ONNX**: conversion of PyTorch models to ONNX format
52
+
53
+ ** **Strings Pre-/Post-processing**, also from
54
+ https://github.com/almodhfer/Arabic_Diacritization[almodhfer]
55
+
56
+ * https://github.com/interscript/rababa/tree/main/tests-benchmarks[tests and benchmarking utilities],
57
+ allowing comparison with other implementations.
58
+
59
+ ** tests are taken from
60
+ https://github.com/AliOsm/arabic-text-diacritization[diacritization benchmarking]
61
+
62
+ ** we have added our own realistic datasets for the problem of diacritization
63
+
64
+ * **models-data** directory to store models and embeddings in various formats
65
+
66
+
67
+ == About the name
68
+
69
+ A https://en.wikipedia.org/wiki/Rebab[Rababa] is an antique string instrument.
70
+
71
+ In the same way that a Rababa produces melody from simple strings and
72
+ pieces of wood, our library and diacritization give a whole palette of colour
73
+ and meanings to Arabic scripts.
74
+
75
+ == Under development
76
+
77
+ We are working on the following improvements:
78
+
79
+ * Preprocessing for breaking down large sentences
80
+ * PoS tagging and search to improve the diacritization
data/Rakefile CHANGED
@@ -9,4 +9,4 @@ require "rubocop/rake_task"
9
9
 
10
10
  RuboCop::RakeTask.new
11
11
 
12
- task default: %i[spec rubocop]
12
+ task default: %i[spec]# rubocop]
@@ -1,4 +1,4 @@
1
- # Literature and Codes
1
+ = Literature and Codes
2
2
 
3
3
  Last updated: 2021-06.
4
4
 
@@ -9,74 +9,89 @@ Older solutions used rules based approaches.
9
9
  Deep Learning was applied relatively recently to the problem of diacritization, gradually
10
10
  getting better results than rules based approaches.
11
11
 
12
+ == References
13
+
12
14
  **Mishkal, Arabic text vocalization software**
13
- Zerrouki, T.
14
- rules based library, 2014
15
- * [code](https://github.com/linuxscout/mishkal)
15
+
16
+ * Zerrouki, T.
17
+ * rules based library, 2014
18
+ * https://github.com/linuxscout/mishkal[code]
16
19
 
17
20
  **Automatic minimal diacritization of Arabic texts**
18
- Rehab Alnefaiea, Aqil M.Azmib
19
- 11.2017
21
+
22
+ * Rehab Alnefaiea, Aqil M.Azmib
23
+ * 11.2017
20
24
  * MADAMIRA software
21
- * [paper](https://www.sciencedirect.com/science/article/pii/S1877050917321634)
25
+ * https://www.sciencedirect.com/science/article/pii/S1877050917321634[paper]
22
26
 
23
27
  **An Approach for Arabic Diacritization**
24
- Ismail Hadjir, Mohamed Abbache, Fatma Zohra Belkredim
25
- 06.2019
28
+
29
+ * Ismail Hadjir, Mohamed Abbache, Fatma Zohra Belkredim
30
+ * 06.2019
26
31
  * keywords: Hidden Markov Models, Viterbi algorithm
27
- * [article](https://link.springer.com/chapter/10.1007/978-3-030-23281-8_29)
32
+ * https://link.springer.com/chapter/10.1007/978-3-030-23281-8_29[article]
28
33
 
29
34
  **Diacritization of Moroccan and Tunisian Arabic Dialects: A CRF Approach**
30
- Kareem Darwish∗, Ahmed Abdelali∗, Hamdy Mubarak∗, Younes Samih†, Mohammed Attia⋆
31
- 2018
35
+
36
+ * Kareem Darwish, Ahmed Abdelali, Hamdy Mubarak, Younes Samih, Mohammed Attia
37
+ * 2018
32
38
  * keywords: Conditional Random Fields, arabic dialects...
33
- * [paper](http://lrec-conf.org/workshops/lrec2018/W30/pdf/20_W30.pdf)
39
+ * http://lrec-conf.org/workshops/lrec2018/W30/pdf/20_W30.pdf[paper]
34
40
 
35
41
  **Arabic Text Diacritization Using Deep Neural Networks**
36
- Ali Fadel, Ibraheem Tuffaha, Bara' Al-Jawarneh, Mahmoud Al-Ayyoub
37
- **Shakkala** library, tensorflow, 04.2019
42
+
43
+ * Ali Fadel, Ibraheem Tuffaha, Bara' Al-Jawarneh, Mahmoud Al-Ayyoub
44
+ * **Shakkala** library, tensorflow
45
+ * 04.2019
38
46
  * keywords: Embedding, LSTM
39
- * [paper](https://arxiv.org/abs/1905.01965)
40
- * [code](https://github.com/Barqawiz/Shakkala), tensorflow
41
- * [benchmarks&scripts](https://github.com/AliOsm/arabic-text-diacritization)
47
+ * https://arxiv.org/abs/1905.01965[paper]
48
+ * https://github.com/Barqawiz/Shakkala[code], tensorflow
49
+ * https://github.com/AliOsm/arabic-text-diacritization[benchmarks&scripts]
42
50
 
43
51
  **Highly Effective Arabic Diacritization using Sequence to Sequence Modeling**
52
+
44
53
  * Hamdy Mubarak, Ahmed Abdelali, Hassan Sajjad, Younes Samih, Kareem Darwish
45
- 06.2019
54
+ * 06.2019
46
55
  * keywords: seq2seq(LSTM), NMT, interesting representation units, context window, voting
47
- * [paper](https://www.aclweb.org/anthology/N19-1248.pdf)
56
+ * https://www.aclweb.org/anthology/N19-1248.pdf[paper]
48
57
 
49
58
  **Multi-components System for Automatic Arabic Diacritization**
50
- Hamza Abbad, Shengwu Xiong
51
- 04.2020
59
+
60
+ * Hamza Abbad, Shengwu Xiong
61
+ * 04.2020
52
62
  * keywords: LSTM's, parallel layers for Shadda and Harakat (⇒ pipeline)
53
- * [paper](https://paperswithcode.com/paper/multi-components-system-for-automatic-arabic)
54
- * [code](https://github.com/Hamza5/Pipeline-diacritizer), tensorflow
63
+ * https://paperswithcode.com/paper/multi-components-system-for-automatic-arabic[paper]
64
+ * https://github.com/Hamza5/Pipeline-diacritizer[code], tensorflow
55
65
 
56
66
  **Deep Diacritization: Efficient Hierarchical Recurrence for Improved Arabic Diacritization**
57
- Badr AlKhamissi, Muhammad N. ElNokrashy, and Mohamed Gabr
58
- 12.2020
67
+
68
+ * Badr AlKhamissi, Muhammad N. ElNokrashy, and Mohamed Gabr
69
+ * 12.2020
59
70
  * keywords: Cross-level attention, Encoder-Decoder (LSTM), Teacher forcing,
60
- * [paper](https://www.aclweb.org/anthology/2020.wanlp-1.4.pdf)
61
- * [slides](https://drive.google.com/file/d/1GzXRIddVeJRCge74QaRC67M1I-pAoGV3/view)
62
- * [code](https://github.com/BKHMSI/deep-diacritization), pytorch
71
+ * https://www.aclweb.org/anthology/2020.wanlp-1.4.pdf[paper]
72
+ * https://drive.google.com/file/d/1GzXRIddVeJRCge74QaRC67M1I-pAoGV3/view[slides]
73
+ * https://github.com/BKHMSI/deep-diacritization[code], pytorch
63
74
 
64
75
  **Effective Deep Learning Models for Automatic Diacritization of Arabic Text**
65
- Mokthar Ali Hasan Madhfar; Ali Mustafa Qamar
66
- 12.2020
76
+
77
+ * Mokthar Ali Hasan Madhfar; Ali Mustafa Qamar
78
+ * 12.2020
67
79
  * keywords: embedding, encoder-decoder (LSTM), Highway Nets, Attention, CBHG Module
68
- * [paper](https://paperswithcode.com/paper/effective-deep-learning-models-for-automatic)
69
- * [code](https://github.com/almodhfer/Arabic_Diacritization), pytorch
80
+ * https://paperswithcode.com/paper/effective-deep-learning-models-for-automatic[paper]
81
+ * https://github.com/almodhfer/Arabic_Diacritization[code], pytorch
70
82
 
71
83
  **A Deep Belief Network Classification Approach for Automatic Diacritization of Arabic Text**
72
- Mohammad Aref Alshraideh, Mohammad Alshraideh and Omar Alkadi
73
- 4.2021
84
+
85
+ * Mohammad Aref Alshraideh, Mohammad Alshraideh and Omar Alkadi
86
+ * 4.2021
74
87
  * keywords: DBN built with Boltzmann restricted machines (restricted RBM's) superior to LSTMs, unicode encoding, Borderline-SMOTE
75
- * [paper](https://www.researchgate.net/publication/352226815_A_Deep_Belief_Network_Classification_Approach_for_Automatic_Diacritization_of_Arabic_Text)
88
+ * https://www.researchgate.net/publication/352226815_A_Deep_Belief_Network_Classification_Approach_for_Automatic_Diacritization_of_Arabic_Text[paper]
89
+
76
90
 
91
+ == Research ideas
77
92
 
78
- # Research ideas
79
93
  Here we just mention some 2021-ish ideas mentioned in the recent papers above:
94
+
80
95
  * Transformer-based Encoders
81
96
  * Byte-pair-encodings
82
97
  * Improve Injected Hints Method (train with semi diacritised data)