precise 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +64 -0
- data/LICENSE.md +163 -0
- data/README.md +41 -0
- data/Rakefile +25 -0
- data/TODO.md +42 -0
- data/exe/precise +10 -0
- data/lib/precise/core_extensions.rb +24 -0
- data/lib/precise/debugging.rb +1 -0
- data/lib/precise/error_classes.rb +19 -0
- data/lib/precise/transcription.rb +11 -0
- data/lib/precise/transcription_a2r.rb +123 -0
- data/lib/precise/transcription_r2a.rb +447 -0
- data/lib/precise/types_list.rb +37 -0
- data/lib/precise/version.rb +5 -0
- data/lib/precise.rb +67 -0
- data/precise.gemspec +35 -0
- metadata +105 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: de5da64054f1063eb0129a6aa14d03bc457ba1cdc68d9748618210e0c414f001
|
|
4
|
+
data.tar.gz: 738931cb458919672e14d9af523e18bfc9aecbde2b3efecefc4b85daa0e44446
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 7d6b2d48809192ad96abcc2401feb6598fdaed2a30f039b2d0f0da13fd98d9ac76b4b6233fbaf1e17e2fe4825c1530e97a692f0d9332380d67f0dbb22d295f9b
|
|
7
|
+
data.tar.gz: 896f8c9cf2f58415d623e26ec69491f5e697166057faa7b5a692c127a227dd73999ab96c0e40e2bb4f267bb7441cba78a62e669ddb71e4ec5f3f2603bff1fabf
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
precise (0.1.8)
|
|
5
|
+
progressbar
|
|
6
|
+
slop
|
|
7
|
+
tiny_color
|
|
8
|
+
|
|
9
|
+
GEM
|
|
10
|
+
remote: https://rubygems.org/
|
|
11
|
+
specs:
|
|
12
|
+
ast (2.4.2)
|
|
13
|
+
coderay (1.1.3)
|
|
14
|
+
json (2.6.3)
|
|
15
|
+
language_server-protocol (3.17.0.3)
|
|
16
|
+
method_source (1.0.0)
|
|
17
|
+
minitest (5.18.0)
|
|
18
|
+
parallel (1.22.1)
|
|
19
|
+
parser (3.2.1.1)
|
|
20
|
+
ast (~> 2.4.1)
|
|
21
|
+
progressbar (1.13.0)
|
|
22
|
+
pry (0.14.2)
|
|
23
|
+
coderay (~> 1.1)
|
|
24
|
+
method_source (~> 1.0)
|
|
25
|
+
rainbow (3.1.1)
|
|
26
|
+
rake (13.0.6)
|
|
27
|
+
regexp_parser (2.7.0)
|
|
28
|
+
rexml (3.2.5)
|
|
29
|
+
rubocop (1.44.1)
|
|
30
|
+
json (~> 2.3)
|
|
31
|
+
parallel (~> 1.10)
|
|
32
|
+
parser (>= 3.2.0.0)
|
|
33
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
34
|
+
regexp_parser (>= 1.8, < 3.0)
|
|
35
|
+
rexml (>= 3.2.5, < 4.0)
|
|
36
|
+
rubocop-ast (>= 1.24.1, < 2.0)
|
|
37
|
+
ruby-progressbar (~> 1.7)
|
|
38
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
|
39
|
+
rubocop-ast (1.27.0)
|
|
40
|
+
parser (>= 3.2.1.0)
|
|
41
|
+
rubocop-performance (1.15.2)
|
|
42
|
+
rubocop (>= 1.7.0, < 2.0)
|
|
43
|
+
rubocop-ast (>= 0.4.0)
|
|
44
|
+
ruby-progressbar (1.13.0)
|
|
45
|
+
slop (4.10.1)
|
|
46
|
+
standard (1.24.3)
|
|
47
|
+
language_server-protocol (~> 3.17.0.2)
|
|
48
|
+
rubocop (= 1.44.1)
|
|
49
|
+
rubocop-performance (= 1.15.2)
|
|
50
|
+
tiny_color (1.2.2)
|
|
51
|
+
unicode-display_width (2.4.2)
|
|
52
|
+
|
|
53
|
+
PLATFORMS
|
|
54
|
+
x86_64-linux
|
|
55
|
+
|
|
56
|
+
DEPENDENCIES
|
|
57
|
+
minitest
|
|
58
|
+
precise!
|
|
59
|
+
pry
|
|
60
|
+
rake
|
|
61
|
+
standard
|
|
62
|
+
|
|
63
|
+
BUNDLED WITH
|
|
64
|
+
2.4.1
|
data/LICENSE.md
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
GNU Lesser General Public License
|
|
2
|
+
=================================
|
|
3
|
+
|
|
4
|
+
_Version 3, 29 June 2007_
|
|
5
|
+
_Copyright © 2007 Free Software Foundation, Inc. <<http://fsf.org/>>_
|
|
6
|
+
|
|
7
|
+
Everyone is permitted to copy and distribute verbatim copies
|
|
8
|
+
of this license document, but changing it is not allowed.
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
This version of the GNU Lesser General Public License incorporates
|
|
12
|
+
the terms and conditions of version 3 of the GNU General Public
|
|
13
|
+
License, supplemented by the additional permissions listed below.
|
|
14
|
+
|
|
15
|
+
### 0. Additional Definitions
|
|
16
|
+
|
|
17
|
+
As used herein, “this License” refers to version 3 of the GNU Lesser
|
|
18
|
+
General Public License, and the “GNU GPL” refers to version 3 of the GNU
|
|
19
|
+
General Public License.
|
|
20
|
+
|
|
21
|
+
“The Library” refers to a covered work governed by this License,
|
|
22
|
+
other than an Application or a Combined Work as defined below.
|
|
23
|
+
|
|
24
|
+
An “Application” is any work that makes use of an interface provided
|
|
25
|
+
by the Library, but which is not otherwise based on the Library.
|
|
26
|
+
Defining a subclass of a class defined by the Library is deemed a mode
|
|
27
|
+
of using an interface provided by the Library.
|
|
28
|
+
|
|
29
|
+
A “Combined Work” is a work produced by combining or linking an
|
|
30
|
+
Application with the Library. The particular version of the Library
|
|
31
|
+
with which the Combined Work was made is also called the “Linked
|
|
32
|
+
Version”.
|
|
33
|
+
|
|
34
|
+
The “Minimal Corresponding Source” for a Combined Work means the
|
|
35
|
+
Corresponding Source for the Combined Work, excluding any source code
|
|
36
|
+
for portions of the Combined Work that, considered in isolation, are
|
|
37
|
+
based on the Application, and not on the Linked Version.
|
|
38
|
+
|
|
39
|
+
The “Corresponding Application Code” for a Combined Work means the
|
|
40
|
+
object code and/or source code for the Application, including any data
|
|
41
|
+
and utility programs needed for reproducing the Combined Work from the
|
|
42
|
+
Application, but excluding the System Libraries of the Combined Work.
|
|
43
|
+
|
|
44
|
+
### 1. Exception to Section 3 of the GNU GPL
|
|
45
|
+
|
|
46
|
+
You may convey a covered work under sections 3 and 4 of this License
|
|
47
|
+
without being bound by section 3 of the GNU GPL.
|
|
48
|
+
|
|
49
|
+
### 2. Conveying Modified Versions
|
|
50
|
+
|
|
51
|
+
If you modify a copy of the Library, and, in your modifications, a
|
|
52
|
+
facility refers to a function or data to be supplied by an Application
|
|
53
|
+
that uses the facility (other than as an argument passed when the
|
|
54
|
+
facility is invoked), then you may convey a copy of the modified
|
|
55
|
+
version:
|
|
56
|
+
|
|
57
|
+
* **a)** under this License, provided that you make a good faith effort to
|
|
58
|
+
ensure that, in the event an Application does not supply the
|
|
59
|
+
function or data, the facility still operates, and performs
|
|
60
|
+
whatever part of its purpose remains meaningful, or
|
|
61
|
+
|
|
62
|
+
* **b)** under the GNU GPL, with none of the additional permissions of
|
|
63
|
+
this License applicable to that copy.
|
|
64
|
+
|
|
65
|
+
### 3. Object Code Incorporating Material from Library Header Files
|
|
66
|
+
|
|
67
|
+
The object code form of an Application may incorporate material from
|
|
68
|
+
a header file that is part of the Library. You may convey such object
|
|
69
|
+
code under terms of your choice, provided that, if the incorporated
|
|
70
|
+
material is not limited to numerical parameters, data structure
|
|
71
|
+
layouts and accessors, or small macros, inline functions and templates
|
|
72
|
+
(ten or fewer lines in length), you do both of the following:
|
|
73
|
+
|
|
74
|
+
* **a)** Give prominent notice with each copy of the object code that the
|
|
75
|
+
Library is used in it and that the Library and its use are
|
|
76
|
+
covered by this License.
|
|
77
|
+
* **b)** Accompany the object code with a copy of the GNU GPL and this license
|
|
78
|
+
document.
|
|
79
|
+
|
|
80
|
+
### 4. Combined Works
|
|
81
|
+
|
|
82
|
+
You may convey a Combined Work under terms of your choice that,
|
|
83
|
+
taken together, effectively do not restrict modification of the
|
|
84
|
+
portions of the Library contained in the Combined Work and reverse
|
|
85
|
+
engineering for debugging such modifications, if you also do each of
|
|
86
|
+
the following:
|
|
87
|
+
|
|
88
|
+
* **a)** Give prominent notice with each copy of the Combined Work that
|
|
89
|
+
the Library is used in it and that the Library and its use are
|
|
90
|
+
covered by this License.
|
|
91
|
+
|
|
92
|
+
* **b)** Accompany the Combined Work with a copy of the GNU GPL and this license
|
|
93
|
+
document.
|
|
94
|
+
|
|
95
|
+
* **c)** For a Combined Work that displays copyright notices during
|
|
96
|
+
execution, include the copyright notice for the Library among
|
|
97
|
+
these notices, as well as a reference directing the user to the
|
|
98
|
+
copies of the GNU GPL and this license document.
|
|
99
|
+
|
|
100
|
+
* **d)** Do one of the following:
|
|
101
|
+
- **0)** Convey the Minimal Corresponding Source under the terms of this
|
|
102
|
+
License, and the Corresponding Application Code in a form
|
|
103
|
+
suitable for, and under terms that permit, the user to
|
|
104
|
+
recombine or relink the Application with a modified version of
|
|
105
|
+
the Linked Version to produce a modified Combined Work, in the
|
|
106
|
+
manner specified by section 6 of the GNU GPL for conveying
|
|
107
|
+
Corresponding Source.
|
|
108
|
+
- **1)** Use a suitable shared library mechanism for linking with the
|
|
109
|
+
Library. A suitable mechanism is one that **(a)** uses at run time
|
|
110
|
+
a copy of the Library already present on the user's computer
|
|
111
|
+
system, and **(b)** will operate properly with a modified version
|
|
112
|
+
of the Library that is interface-compatible with the Linked
|
|
113
|
+
Version.
|
|
114
|
+
|
|
115
|
+
* **e)** Provide Installation Information, but only if you would otherwise
|
|
116
|
+
be required to provide such information under section 6 of the
|
|
117
|
+
GNU GPL, and only to the extent that such information is
|
|
118
|
+
necessary to install and execute a modified version of the
|
|
119
|
+
Combined Work produced by recombining or relinking the
|
|
120
|
+
Application with a modified version of the Linked Version. (If
|
|
121
|
+
you use option **4d0**, the Installation Information must accompany
|
|
122
|
+
the Minimal Corresponding Source and Corresponding Application
|
|
123
|
+
Code. If you use option **4d1**, you must provide the Installation
|
|
124
|
+
Information in the manner specified by section 6 of the GNU GPL
|
|
125
|
+
for conveying Corresponding Source.)
|
|
126
|
+
|
|
127
|
+
### 5. Combined Libraries
|
|
128
|
+
|
|
129
|
+
You may place library facilities that are a work based on the
|
|
130
|
+
Library side by side in a single library together with other library
|
|
131
|
+
facilities that are not Applications and are not covered by this
|
|
132
|
+
License, and convey such a combined library under terms of your
|
|
133
|
+
choice, if you do both of the following:
|
|
134
|
+
|
|
135
|
+
* **a)** Accompany the combined library with a copy of the same work based
|
|
136
|
+
on the Library, uncombined with any other library facilities,
|
|
137
|
+
conveyed under the terms of this License.
|
|
138
|
+
* **b)** Give prominent notice with the combined library that part of it
|
|
139
|
+
is a work based on the Library, and explaining where to find the
|
|
140
|
+
accompanying uncombined form of the same work.
|
|
141
|
+
|
|
142
|
+
### 6. Revised Versions of the GNU Lesser General Public License
|
|
143
|
+
|
|
144
|
+
The Free Software Foundation may publish revised and/or new versions
|
|
145
|
+
of the GNU Lesser General Public License from time to time. Such new
|
|
146
|
+
versions will be similar in spirit to the present version, but may
|
|
147
|
+
differ in detail to address new problems or concerns.
|
|
148
|
+
|
|
149
|
+
Each version is given a distinguishing version number. If the
|
|
150
|
+
Library as you received it specifies that a certain numbered version
|
|
151
|
+
of the GNU Lesser General Public License “or any later version”
|
|
152
|
+
applies to it, you have the option of following the terms and
|
|
153
|
+
conditions either of that published version or of any later version
|
|
154
|
+
published by the Free Software Foundation. If the Library as you
|
|
155
|
+
received it does not specify a version number of the GNU Lesser
|
|
156
|
+
General Public License, you may choose any version of the GNU Lesser
|
|
157
|
+
General Public License ever published by the Free Software Foundation.
|
|
158
|
+
|
|
159
|
+
If the Library as you received it specifies that a proxy can decide
|
|
160
|
+
whether future versions of the GNU Lesser General Public License shall
|
|
161
|
+
apply, that proxy's public statement of acceptance of any version is
|
|
162
|
+
permanent authorization for you to choose that version for the
|
|
163
|
+
Library.
|
data/README.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
## Command line usage
|
|
2
|
+
|
|
3
|
+
Install the gem and see the help message by executing:
|
|
4
|
+
|
|
5
|
+
$ gem install precise
|
|
6
|
+
$ precise -h
|
|
7
|
+
|
|
8
|
+
Arabicise a string of Romanisation:
|
|
9
|
+
|
|
10
|
+
$ precise -T 'bi-smi llāhi al-raḥmani al-raḥīm' # -T removes Tashkeel
|
|
11
|
+
|
|
12
|
+
Romanise a string of Arabic (experimental):
|
|
13
|
+
|
|
14
|
+
$ precise 'بسم الله الرحمن الرحيم' # (not able to infer Tashkeel!)
|
|
15
|
+
|
|
16
|
+
## Usage inside of another application
|
|
17
|
+
|
|
18
|
+
Install the gem and add to the application's Gemfile by executing:
|
|
19
|
+
|
|
20
|
+
$ bundle add precise
|
|
21
|
+
$ bundle install
|
|
22
|
+
|
|
23
|
+
You can then access the API like so:
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
require 'precise'
|
|
27
|
+
Precise::Transcription.reverse 'bi-smi llāhi al-raḥmani al-raḥīm'
|
|
28
|
+
Precise::Transcription.transcribe 'بسم الله الرحمن الرحيم'
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Development
|
|
32
|
+
|
|
33
|
+
After checking out the repository, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
34
|
+
|
|
35
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
|
36
|
+
|
|
37
|
+
Issues and PRs are welcome!
|
|
38
|
+
|
|
39
|
+
## Funding
|
|
40
|
+
|
|
41
|
+
This Gem was developed within the long-term research project [Bibliotheca Arabica](http://www.bibliotheca-arabica.de) hosted at the Saxon Academy of the Sciences and Humanities in Leipzig, Germany. _Bibliotheca Arabica_ is part of the [German Academies’ Programme](https://www.akademienunion.de/en/research/the-academies-programme) and funded by the Federal Republic of Germany and the Free State of Saxony.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'bundler'
|
|
4
|
+
Bundler.require
|
|
5
|
+
|
|
6
|
+
require 'bundler/gem_tasks'
|
|
7
|
+
require 'rake/testtask'
|
|
8
|
+
|
|
9
|
+
Rake::TestTask.new(:test) do |t|
|
|
10
|
+
t.libs << 'test'
|
|
11
|
+
t.libs << 'lib'
|
|
12
|
+
t.test_files = FileList['test/**/test_*.rb']
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
desc 'increase minor version number by one'
|
|
16
|
+
task :bump do
|
|
17
|
+
current = Precise::VERSION
|
|
18
|
+
new = current.split('.')
|
|
19
|
+
new[-1] = (new[-1].to_i+1).to_s
|
|
20
|
+
new = new.join('.')
|
|
21
|
+
version_file = 'lib/precise/version.rb'
|
|
22
|
+
File.write(version_file, File.read(version_file).gsub(current, new))
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
task :default do; system 'rake -T'; end
|
data/TODO.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
Refactor to follow the following pattern:
|
|
2
|
+
|
|
3
|
+
```ruby
|
|
4
|
+
|
|
5
|
+
# short, romanised root <=> word list for learners: https://wahiduddin.net/words/arabic_glossary.htm
|
|
6
|
+
# commercial root <=> word dict: http://www.arabicroot.com/Home/Introduction
|
|
7
|
+
# possibly a good idea to OCR wehr 5 and make a dict from that?
|
|
8
|
+
|
|
9
|
+
def arabic_roots(opts); ['ʿwl','msʾ'].reject{|r| !r.include? opts[:with_letter]}.compact; end #
|
|
10
|
+
|
|
11
|
+
# 2005: http://jeffcoombs.com/isri/Taghva2005b.pdf
|
|
12
|
+
# 2006: NN-based: https://ieeexplore.ieee.org/document/4115547
|
|
13
|
+
# 2007: https://ieeexplore.ieee.org/document/4230974/
|
|
14
|
+
# 2014: https://journals.sagepub.com/doi/abs/10.1177/0165551514526348?journalCode=jisb
|
|
15
|
+
# 2016: https://www.sciencedirect.com/science/article/pii/S1319157815001342
|
|
16
|
+
# 2015: https://www.sciencedirect.com/science/article/pii/S1319157815000166
|
|
17
|
+
# metastudy (also 2015): https://www.sciencedirect.com/science/article/pii/S1319157815000166
|
|
18
|
+
# 2017: https://www.accentsjournals.org/PaperDirectory/Journal/IJACR/2018/3/3.pdf
|
|
19
|
+
# anything newer???
|
|
20
|
+
# some of the above testable at: http://arabic.emi.ac.ma:8080/SafarWeb/faces/safar/morphology/stemmer.xhtml
|
|
21
|
+
|
|
22
|
+
def extract_root(word); {'ʿāʾila':'ʿwl','masāʾikà':'msʾ'}[word.to_sym]; end
|
|
23
|
+
|
|
24
|
+
# with the above two in place:
|
|
25
|
+
|
|
26
|
+
arabic = %w[ʿāʾila masāʾikà].map{|s|
|
|
27
|
+
words = s.split ' '
|
|
28
|
+
words.map{|w|
|
|
29
|
+
w.gsub! /āʾi/, arabic_roots(with_letter: 'ʾ').include?(extract_root(w)) ? 'āSTANDALONE_HAMZAi' : 'āYA_AS_HAMZA_CARRIERi'
|
|
30
|
+
[
|
|
31
|
+
{'YA_AS_HAMZA_CARRIER':'ﺉ', 'STANDALONE_HAMZA':'ﺀ'},
|
|
32
|
+
{'ʿ':'ﻉ', 'ā':'ﺍ', 'i':'ِ◌', 'l':'ﻝ', 'a':'َ◌', 'm':'ﻡ', 's':'ﺱ', 'k':'ﻙ', 'à':'َ◌'}
|
|
33
|
+
].each{|list| list.each{|k,v| w.gsub! k.to_s, v}}
|
|
34
|
+
w.gsub! /◌$/, 'ﺓ'
|
|
35
|
+
}
|
|
36
|
+
words.join(' ').gsub('◌','')
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
# use actual tests from current code instead; also generate more from existing known-good data!
|
|
40
|
+
|
|
41
|
+
tests = (arabic == ["ﻉﺎﺌِﻟَﺓ", "ﻢَﺳﺍﺀِﻙَﺓ"])
|
|
42
|
+
```
|
data/exe/precise
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
module CoreExtensions
|
|
2
|
+
refine String do
|
|
3
|
+
def precise_titlecase
|
|
4
|
+
s = chars
|
|
5
|
+
s.map.with_index{|c,i|
|
|
6
|
+
!%w[a i u].include?(s[0]) && ((i==0 && self[0..1] != 'al') || (i==1 && %w[ʾ ʿ].include?(s[0]))) ?
|
|
7
|
+
c.upcase :
|
|
8
|
+
c
|
|
9
|
+
}.join
|
|
10
|
+
end
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
refine Array do
|
|
14
|
+
def each_utf8_encode
|
|
15
|
+
map{|e| e.to_s.encode('utf-8')}
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
refine Hash do
|
|
20
|
+
def keys_and_values_to_s
|
|
21
|
+
map{|k,v| [k.to_s, v.class == Array ? v.map{|e| e.to_s} : v.to_s]}.to_h
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Minimal debug-print facility used across the gem.
#
# $dbg is the global verbosity level; dbg(str) prints str only while it is > 0.
# The level is initialised independently of the method guard so that it is
# always an Integer, even when a dbg method already exists.
$dbg ||= 0

# A top-level `def` creates a *private* method, which respond_to? only reports
# when its second argument (include_all) is true. Without it the guard could
# never detect an existing dbg.
unless respond_to?(:dbg, true)
  # Print +str+ when the global verbosity level is positive.
  def dbg(str)
    puts str if $dbg > 0
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module Precise
  # Root of the gem's exception hierarchy. The specific errors below inherit
  # from it, so `rescue Precise::Error` catches all of them (they remain
  # StandardError descendants, so existing `rescue StandardError` still works).
  class Error < StandardError; end

  # Error whose default message is "unable to transcribe input string".
  #
  # msg            - human-readable message
  # exception_type - Symbol tag exposed via #exception_type
  #                  (defaults to :untranscribable)
  class TranscriptionError < Error
    attr_reader :exception_type

    def initialize(msg="unable to transcribe input string", exception_type=:untranscribable)
      @exception_type = exception_type
      super(msg)
    end
  end

  # Error whose default message states that the input string is not (entirely)
  # a romanisation of Arabic.
  #
  # msg            - human-readable message
  # exception_type - Symbol tag exposed via #exception_type
  #                  (defaults to :untranscribable)
  class NotATranscriptionError < Error
    attr_reader :exception_type

    def initialize(msg="input string is not (entirely) a romanisation of Arabic", exception_type=:untranscribable)
      @exception_type = exception_type
      super(msg)
    end
  end
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
module Precise
  # Holds the options and the output buffer for one transcription run.
  class Transcription
    # opts - Hash of options merged over the defaults
    #        {punctuation: true, verbosity: 0}.
    #        A :verbose key is always removed from the stored options; when
    #        its value is exactly true, :verbosity is raised by 2.
    #
    # Side effect: the resulting :verbosity is added to the global $dbg level.
    def initialize(opts = {})
      @opts = {punctuation: true, verbosity: 0}.merge(opts)
      verbose_flag = @opts.delete(:verbose)
      @opts[:verbosity] += 2 if verbose_flag == true
      $dbg += @opts[:verbosity]
      @out_chunks = []
    end
  end
end
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
module Precise
|
|
2
|
+
|
|
3
|
+
using CoreExtensions # the more generic ones
|
|
4
|
+
|
|
5
|
+
class Transcription
|
|
6
|
+
def transcription
|
|
7
|
+
@out_chunks
|
|
8
|
+
.map{|c| c
|
|
9
|
+
.gsub(/^m$/, 'mīlādī')
|
|
10
|
+
.gsub(/^h$/, 'hijrī')
|
|
11
|
+
.gsub(/^wāltī$/, 'wa-l-lātī')
|
|
12
|
+
.gsub(/^wālḏī$/, 'wa-l-lāḏī')
|
|
13
|
+
.gsub(/^hy$/, 'hiya')
|
|
14
|
+
.gsub(/^ʿlá$/, 'ʿalá')
|
|
15
|
+
.gsub(/^mn$/, 'min')
|
|
16
|
+
.gsub(/^yd$/, 'yad')
|
|
17
|
+
.gsub(/^fy$/, 'fī')
|
|
18
|
+
.gsub(/^lhā$/, 'lahā')}
|
|
19
|
+
.join(' ')
|
|
20
|
+
.gsub('؟','?')
|
|
21
|
+
.gsub('،',',')
|
|
22
|
+
.gsub(/\s+([[:punct:]]+)/,'\1')
|
|
23
|
+
.gsub(/(?!(\s+|^))\(\s+/, ' (')
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
A2R = A2RTable = {
|
|
27
|
+
"ال": "al-",
|
|
28
|
+
"ء": "ʾ",
|
|
29
|
+
"آ": "ʾā",
|
|
30
|
+
"أ": "ʾa",
|
|
31
|
+
"أُ": "ʾu",
|
|
32
|
+
"إ": "ʾi",
|
|
33
|
+
"ا": "ā",
|
|
34
|
+
"ب": "b",
|
|
35
|
+
"ة": "a",
|
|
36
|
+
"ت": "t",
|
|
37
|
+
"ث": "ṯ",
|
|
38
|
+
"ج": "ǧ",
|
|
39
|
+
"ح": "ḥ",
|
|
40
|
+
"خ": "ḫ",
|
|
41
|
+
"د": "d",
|
|
42
|
+
"ذ": "ḏ",
|
|
43
|
+
"ر": "r",
|
|
44
|
+
"ز": "z",
|
|
45
|
+
"س": "s",
|
|
46
|
+
"ش": "š",
|
|
47
|
+
"ص": "ṣ",
|
|
48
|
+
"ض": "ḍ",
|
|
49
|
+
"ط": "ṭ",
|
|
50
|
+
"ظ": "ẓ",
|
|
51
|
+
"ع": "ʿ",
|
|
52
|
+
"غ": "ġ",
|
|
53
|
+
"ف": "f",
|
|
54
|
+
"ق": "q",
|
|
55
|
+
"ك": "k",
|
|
56
|
+
"ل": "l",
|
|
57
|
+
"م": "m",
|
|
58
|
+
"ن": "n",
|
|
59
|
+
"ه": "h",
|
|
60
|
+
"و": ["ū", "w"],
|
|
61
|
+
"ى": "á",
|
|
62
|
+
"ي": ["ī", "y"],
|
|
63
|
+
"َ": "a",
|
|
64
|
+
"ُ": "u",
|
|
65
|
+
"ِ": "i",
|
|
66
|
+
"پ": "p",
|
|
67
|
+
"چ": "č",
|
|
68
|
+
"ژ": "ž",
|
|
69
|
+
"گ": "g",
|
|
70
|
+
"٠": "0",
|
|
71
|
+
"١": "1",
|
|
72
|
+
"٢": "2",
|
|
73
|
+
"٣": "3",
|
|
74
|
+
"٤": "4",
|
|
75
|
+
"٥": "5",
|
|
76
|
+
"٦": "6",
|
|
77
|
+
"٧": "7",
|
|
78
|
+
"٨": "8",
|
|
79
|
+
"٩": "9",
|
|
80
|
+
}.map{|k,v| [k.to_s, v]}.to_h
|
|
81
|
+
SHADDA=' ّ'.strip
|
|
82
|
+
|
|
83
|
+
# Romanises +arabic+ and appends the result, chunk by chunk, to @out_chunks.
#
# The input is split on runs of whitespace, digits and punctuation. Separator
# chunks are appended stripped and otherwise untouched; every other chunk is
# mapped character by character through the A2R table into one output string.
#
# Returns the Array of input chunks (the romanised text lives in @out_chunks).
def transcribe(arabic)
  non_word_rgx = /([\s\d[:punct:]]+)/
  in_chunks = arabic.split non_word_rgx
  in_chunks.each do |chunk|
    next if chunk.strip.empty?
    # digits / punctuation: pass through without transcription
    (@out_chunks << chunk.strip; next) if chunk.match? non_word_rgx

    chars = chunk.chars
    skip = 0
    @out_chunks << ''
    chars.each.with_index do |ch, j|
      (skip -= 1; next) if skip > 0
      # word-initial article: emit it as one unit and skip its second letter
      (@out_chunks[-1] << A2R['ال']; skip += 1; next) if j == 0 && chunk.match?(/^ال/)

      mapped = A2R[ch]
      out_char = nil
      # Array-valued entries (waw and ya) hold [long vowel, consonant]:
      # the consonant is used at either edge of the word, the vowel inside it.
      if mapped.is_a?(Array)
        if j == 0 || j + 1 == chunk.length
          (@out_chunks[-1] << mapped[-1]; next)
        else
          out_char = mapped[0]
        end
      else
        out_char = mapped
      end

      # Shadda doubles the preceding letter. That letter's table entry may be
      # an Array (waw/ya) or missing altogether; appending either one directly
      # raised a TypeError. Use the consonant form of an Array entry, skip an
      # unmapped letter, and ignore a shadda with nothing before it (j - 1
      # would otherwise wrap around to the last character of the word).
      if ch == SHADDA && j > 0
        doubled = A2R[chars[j - 1]]
        doubled = doubled[-1] if doubled.is_a?(Array)
        @out_chunks[-1] << doubled if doubled
      end

      @out_chunks[-1] << out_char if out_char
    end
  end
end
|
|
114
|
+
|
|
115
|
+
def self.transcribe(arabic, opts={})
|
|
116
|
+
warn "Romanisation is incomplete.".yellow
|
|
117
|
+
warn "Consider adding short vowels by hand as needed.".yellow
|
|
118
|
+
obj = new(opts)
|
|
119
|
+
obj.transcribe(arabic)
|
|
120
|
+
return obj.transcription
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
@@ -0,0 +1,447 @@
|
|
|
1
|
+
module Precise
|
|
2
|
+
|
|
3
|
+
using CoreExtensions # the more generic ones
|
|
4
|
+
|
|
5
|
+
module CoreExtensions # the ones specific to this module
|
|
6
|
+
refine String do
|
|
7
|
+
# default output is "with everything"
|
|
8
|
+
# so once something is set to false, it'll be removed
|
|
9
|
+
# Returns a stripped copy of the receiver with the requested clean-ups applied.
#
# opts - Hash merged over the defaults (everything true = keep everything):
#        :punctuation   - when false, all punctuation characters are deleted
#        :tashkeel      - when false, every character listed in
#                         Precise::Transcription::Tashkeel and ::Nonprintables
#                         is deleted
#        :alif_variants - when false, every character listed in
#                         Precise::Transcription::AlifVariants is replaced
#                         by a plain alif
def apply_options(opts)
  defaults = {punctuation: true, tashkeel: true, alif_variants: true}
  opts = defaults.merge opts
  s = self.dup

  s = s.gsub(/[[:punct:]]+/, '') unless opts[:punctuation]

  unless opts[:tashkeel]
    tashkeel = Precise::Transcription::Tashkeel
    nonprintables = Precise::Transcription::Nonprintables
    extraneous_chars = (tashkeel + nonprintables).join
    s = s.gsub(/[#{extraneous_chars}]/, '')
  end

  unless opts[:alif_variants]
    # The list must be joined before interpolation: interpolating the Array
    # itself inserts its inspect form, which puts quotes, commas and spaces
    # into the character class and turns those into alifs as well.
    alif_variants = Precise::Transcription::AlifVariants.join
    s = s.gsub(/[#{alif_variants}]/, 'ا')
  end

  s.strip
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
class Transcription
|
|
36
|
+
using Precise::CoreExtensions
|
|
37
|
+
|
|
38
|
+
# Ruby would have been fine with these in the file verbatim (on their own),
|
|
39
|
+
# alas, my editor's syntax highlighting can't cope, so doing it 1990s-style
|
|
40
|
+
Fatha, Kasra, Damma, Shadda = ["\u064e", "\u0650", "\u064f", "\u0651"].each_utf8_encode
|
|
41
|
+
# nonprintables
|
|
42
|
+
R2LM, L2RM, ZWNJ = ["\u200f", "\u200e", "\u200c"].each_utf8_encode
|
|
43
|
+
# typographic modifiers, ligatures, oft-used words
|
|
44
|
+
Tatweel, Allah = ["ـ", "الله"]
|
|
45
|
+
# the various forms of alif, ya and waw
|
|
46
|
+
AlifVariants = ['أ', 'إ', 'آ', 'ا', 'ٱ']
|
|
47
|
+
AlifHamzaAbove, AlifHamzaBelow, AlifMadda, Alif, AlifWasla = AlifVariants
|
|
48
|
+
YaHamzaAbove, Ya = ['ئ', 'ي']
|
|
49
|
+
WawHamzaAbove, Waw = ['ؤ', 'و']
|
|
50
|
+
# other character lists
|
|
51
|
+
# One-character UTF-8 strings for every code point in U+064B..U+065B,
# built with Array#pack instead of assembling "\uXXXX" literals through eval.
Tashkeel = (0x064B..0x065B).map { |codepoint| [codepoint].pack('U') }
|
|
52
|
+
Nonprintables = [R2LM, L2RM]
|
|
53
|
+
|
|
54
|
+
R2ATables = {
|
|
55
|
+
# Adapted from the Transcription in the Brill PDF's "Note to the Indices":
|
|
56
|
+
# - a dash, depending on its position, denotes the start or end of the word
|
|
57
|
+
# - an array denotes the requirement for a choice to be made from context
|
|
58
|
+
# - any characters that are being replaced by DMG characters have been omitted
|
|
59
|
+
common: {
|
|
60
|
+
ʾ: :ء,
|
|
61
|
+
b: :ب,
|
|
62
|
+
p: :پ,
|
|
63
|
+
t: :ت,
|
|
64
|
+
ḥ: :ح,
|
|
65
|
+
d: :د,
|
|
66
|
+
r: :ر,
|
|
67
|
+
z: :ز,
|
|
68
|
+
s: :س,
|
|
69
|
+
ṣ: :ص,
|
|
70
|
+
ḍ: :ض,
|
|
71
|
+
ṭ: :ط,
|
|
72
|
+
ẓ: :ظ,
|
|
73
|
+
ʿ: :ع,
|
|
74
|
+
f: :ف,
|
|
75
|
+
q: :ق,
|
|
76
|
+
k: :ك,
|
|
77
|
+
g: :گ,
|
|
78
|
+
l: :ل,
|
|
79
|
+
m: :م,
|
|
80
|
+
n: :ن,
|
|
81
|
+
h: :ه,
|
|
82
|
+
w: :و,
|
|
83
|
+
y: :ي,
|
|
84
|
+
ā: :ا,
|
|
85
|
+
ū: :و,
|
|
86
|
+
ī: :ي,
|
|
87
|
+
},
|
|
88
|
+
vowels: {
|
|
89
|
+
a: Fatha,
|
|
90
|
+
à: Fatha, # at word-end only
|
|
91
|
+
u: Damma,
|
|
92
|
+
i: Kasra,
|
|
93
|
+
},
|
|
94
|
+
combos: {
|
|
95
|
+
aw: :َو,
|
|
96
|
+
ay: :َي
|
|
97
|
+
},
|
|
98
|
+
brockelmann: {
|
|
99
|
+
'-a': :ة, # "-" = at word-end
|
|
100
|
+
'-at': :ة, # "-" = at word-end
|
|
101
|
+
'al-': :ال, # "-" = at word-start
|
|
102
|
+
},
|
|
103
|
+
dmg: {
|
|
104
|
+
ṯ: :ث,
|
|
105
|
+
ǧ: :ج,
|
|
106
|
+
č: :چ,
|
|
107
|
+
ḫ: :خ,
|
|
108
|
+
ḏ: :ذ,
|
|
109
|
+
ž: :ژ,
|
|
110
|
+
š: :ش,
|
|
111
|
+
ġ: :غ
|
|
112
|
+
},
|
|
113
|
+
uppercase: {
|
|
114
|
+
A: :أَ,
|
|
115
|
+
I: :إِ,
|
|
116
|
+
U: :أُ,
|
|
117
|
+
Y: :ي
|
|
118
|
+
},
|
|
119
|
+
farsi: {
|
|
120
|
+
v: :و, # always? what, e.g. about "Divbandi"?
|
|
121
|
+
e: [:ه, Fatha] # word-end, mid-word
|
|
122
|
+
},
|
|
123
|
+
turkic: {
|
|
124
|
+
ö: :و,
|
|
125
|
+
ü: Damma, # ???
|
|
126
|
+
ı: Kasra, # ???
|
|
127
|
+
E: :ا
|
|
128
|
+
},
|
|
129
|
+
indic: {
|
|
130
|
+
ō: :و # things like "Bh" => "بْ" would go here, too
|
|
131
|
+
},
|
|
132
|
+
romanic: {
|
|
133
|
+
c: :ث, # or should this rather be a س?
|
|
134
|
+
o: :و,
|
|
135
|
+
Ė: :إي,
|
|
136
|
+
x: :كس
|
|
137
|
+
},
|
|
138
|
+
semitic: {
|
|
139
|
+
ē: :ﺍ # is that always so?
|
|
140
|
+
},
|
|
141
|
+
finnic: {
|
|
142
|
+
ä: Fatha # in e.g. Mänglī
|
|
143
|
+
},
|
|
144
|
+
precise: {
|
|
145
|
+
á: :ى,
|
|
146
|
+
Ā: :آ, # don't add 'ʾĀ' here - it is considered an error in the input!
|
|
147
|
+
'ʾā': :آ # same but lowercase - alif madda in the middle of the word
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
PostR2AWordReplacements = {
|
|
152
|
+
/^(.*)لّاه/ => '\1 الله', # names ending in "allah"
|
|
153
|
+
/(ب\.|إبن|إِبن)/ => 'بن', # "son of"
|
|
154
|
+
/أَبي/ => 'أبي', # "father of" (gen.)
|
|
155
|
+
/أَبو/ => 'أبو', # "father of" (nom.)
|
|
156
|
+
/بَكر/ => 'بكر', # the name "bakr"
|
|
157
|
+
/عَلي/ => 'علي', # the name "ali"
|
|
158
|
+
/عَبد/ => 'عبد', # the name-part "abd"
|
|
159
|
+
/افندي/ => 'افندی' # ottoman/turkish effendi
|
|
160
|
+
# /([یي]زاده$)/ => ZWNJ+'ی'+ZWNJ+'زاده', # names ending in "-azade" # removed at DK's request
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
PostR2AContextReplacements = {
|
|
164
|
+
/((^|\.\s+)بن(\s+))/ => 'ابن\3', # exception: son-of in beginning of sentence
|
|
165
|
+
/(تِ|تُ|تَ)(\s+)/ => 'ة ', # this'll lose the case ending, but that's for the better
|
|
166
|
+
/داوود/ => 'داود' # not sure if this might actually hold true for all ...wū...?
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
PunctSepRgx = /[ \.\-\(\)\?\&=,;:]/
|
|
170
|
+
|
|
171
|
+
R2A = R2ATables.values.inject(:merge) # just one level is enough now
|
|
172
|
+
.keys_and_values_to_s # more convenient to work with
|
|
173
|
+
|
|
174
|
+
SunLetters = %w[t ṯ d ḏ r z s š ṣ ḍ ṭ ẓ l n]
|
|
175
|
+
RomanizedShortVowels = %w[a i u]
|
|
176
|
+
RomanizedLongVowels = %w[ā ū ī]
|
|
177
|
+
# "a" here because of ta'marbouta, "á" because of alif maqsoura, "ā" because of word-final alif mamdouda
|
|
178
|
+
RomanizedConsonantals = SunLetters + %w[m l k q f ġ ʿ ḫ ḥ h ǧ b ʾ a á]
|
|
179
|
+
ArabicScriptVowels = %w[ا ي و]
|
|
180
|
+
ArabicScriptConsonants = %w[ا ب ت ث ج ح خ س ش ص ض ط ظ ع غ ف ق ك ل م ن ه ي ئ ة ى أ إ ؤ ئ آ]
|
|
181
|
+
|
|
182
|
+
LatinChars = R2A.map{|l,a| l unless l.size != 1}.compact
|
|
183
|
+
TranslitChars_lowercase = 'ʾʿḏḥṣḍṭẓāūīṯǧčḫžšġōĖēáäüöü'
|
|
184
|
+
TranslitChars = (TranslitChars_lowercase + TranslitChars_lowercase.upcase).chars.uniq.join
|
|
185
|
+
|
|
186
|
+
def this_word(str, idx)
|
|
187
|
+
str[0...idx][/\S*\z/] + (str[idx..-1][/\A[#{TranslitChars}\w]+/] || '')
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
# Returns the word that contains position +idx+ of +str+ together with the
# word following it.
#
# - head: from the start of +str+ up to +idx+, the trailing run of
#   non-whitespace characters (the part of the current word before idx)
# - tail: from +idx+ onwards, the rest of the current word, the whitespace
#   after it and the next word; empty when no second word follows
#
# A string without any whitespace is returned unchanged.
def this_word_and_the_next(str, idx)
  return str unless str.match?(/\s+/)

  head = str[0...idx][/\S*\z/]
  # Uses the TranslitChars constant, like this_word does. The instance
  # variable interpolated here before is not assigned in the visible source;
  # if nil, it reduces the class to ASCII-only \w, so words containing
  # transliteration characters never match.
  tail = str[idx..-1][/\A[#{TranslitChars}\w]+\s+[#{TranslitChars}\w]+/i]
  head + (tail || '')
end
|
|
201
|
+
|
|
202
|
+
def hamza_before_following(ch, pch, first_letter_of_word = false)
|
|
203
|
+
if first_letter_of_word
|
|
204
|
+
case ch.to_sym
|
|
205
|
+
when :a, :u then AlifHamzaAbove
|
|
206
|
+
when :i then AlifHamzaBelow
|
|
207
|
+
when :ā then AlifMadda
|
|
208
|
+
when :ī then "#{YaHamzaAbove}#{R2A[ch]}"
|
|
209
|
+
when :ū then "#{WawHamzaAbove}#{R2A[ch]}"
|
|
210
|
+
end
|
|
211
|
+
else
|
|
212
|
+
if %w[y ī].include? pch
|
|
213
|
+
# also take into account what PRECEDED the hamza - that might take precedence!
|
|
214
|
+
case ch.to_sym
|
|
215
|
+
when :a then YaHamzaAbove
|
|
216
|
+
when :i then YaHamzaAbove
|
|
217
|
+
when :u then WawHamzaAbove
|
|
218
|
+
when :ī then "#{YaHamzaAbove}#{R2A[ch]}"
|
|
219
|
+
when :ū then "#{WawHamzaAbove}#{R2A[ch]}"
|
|
220
|
+
end
|
|
221
|
+
else
|
|
222
|
+
case ch.to_sym
|
|
223
|
+
when :a then AlifHamzaAbove
|
|
224
|
+
when :i then YaHamzaAbove
|
|
225
|
+
when :u then
|
|
226
|
+
pch == 'ū' ? R2A['ʾ'] : WawHamzaAbove
|
|
227
|
+
when :ī then "#{YaHamzaAbove}#{R2A[ch]}"
|
|
228
|
+
when :ū then "#{WawHamzaAbove}#{R2A[ch]}"
|
|
229
|
+
end
|
|
230
|
+
end
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
# Picks the Arabic spelling for a hamza that directly follows the short vowel +ch+.
#
# ch                   - the romanized vowel before the hamza, in either case
# first_letter_of_word - true when +ch+ is the first letter of its word
#
# Returns the Arabic string for vowel + hamza, or nil when no rule applies to +ch+.
#
# The vowel is compared case-insensitively: reverse accepts the vowel via ch.downcase
# but hands over the character as written, so an uppercase vowel would otherwise match
# no branch and vowel and hamza would be dropped from the output.
#
# NOTE(review): the word-initial :u / :i branches concatenate R2A['ā'] with a String;
# reverse treats some R2A values as Arrays - confirm that R2A['ā'] is a String.
def hamza_after_preceding(ch, first_letter_of_word = false)
  vowel = ch.downcase.to_sym
  if first_letter_of_word
    case vowel
    when :a then AlifHamzaAbove
    when :u then R2A['ā'] + Damma + WawHamzaAbove
    when :i then R2A['ā'] + YaHamzaAbove
    end
  else
    case vowel
    when :a then AlifHamzaAbove
    when :i then YaHamzaAbove
    when :u then WawHamzaAbove
    when :ī then YaHamzaAbove
    end
  end
end
|
|
250
|
+
|
|
251
|
+
# Decides which alif a word starting with "i" is written with.
#
# The word is compared, character by character, against a few templates in which
#   C = any consonantal, s = any short vowel, l = any long vowel,
# and every other template character (here: "i") must match literally.
# A word of exactly the template's length that fits it gets AlifHamzaBelow - unless it
# starts with "ist" (استـ), which never does. Everything else gets the plain Alif.
def alif_for_word_initial_kasra(word)
  templates = %w[iCClC iCCiCClC iClCC]
  letter_classes = {
    'C' => RomanizedConsonantals,
    's' => RomanizedShortVowels,
    'l' => RomanizedLongVowels
  }

  takes_hamza = templates.any? do |template|
    next false unless word.size == template.size

    fits = word.chars.each_with_index.all? do |c, i|
      allowed = letter_classes[template[i]]
      allowed ? allowed.include?(c) : c == template[i]
    end
    fits && !word.downcase.match?(/^ist/)
  end

  alif = takes_hamza ? AlifHamzaBelow : Alif
  puts "\t\tfor #{word}: word-initial #{alif}".light_blue if $dbg > 1
  alif
end
|
|
285
|
+
|
|
286
|
+
# Normalises romanized input before transcription.
#
# * removes the non-printable characters U+200C and U+200F
# * lowercases every letter that directly follows ʿ or ʾ
#
# Returns a new String. +str+ itself is left untouched, so frozen strings (for example
# literals under `frozen_string_literal`) are accepted and the caller's data is never
# modified behind its back.
def sanitize(str)
  # FIXME: the erroneous_chars replacement table should have already taken care of this?!
  cleaned = str.delete("\u200c\u200f")

  follows_ayn_or_hamza = false
  cleaned.each_char.map do |c|
    c = c.downcase if follows_ayn_or_hamza
    follows_ayn_or_hamza = c.match?(/[ʿʾ]/)
    c
  end.join
end
|
|
293
|
+
|
|
294
|
+
# input: valid Precise string
#   example: (al-)ʿAbbādī Muḥammad Ibn Aḥmad Ibn Muḥammad al-Harawī
# output: Arabic string
#   example: العَبّادي مُحَمَّد بن أَحمَد بن مُحَمَّد الهَرَوي
#
# Raises Precise::NotATranscriptionError when +romanized+ is nil.
#
# Works in three stages:
# 1. character pass: walks the sanitized input one character at a time; the first rule
#    that applies appends to the output and moves on (rule order matters),
# 2. word pass: two rounds of PostR2AWordReplacements on every output word,
# 3. context pass: PostR2AContextReplacements on the re-joined string.
# The result is finally passed through apply_options(@opts).
def reverse(romanized)
  raise Precise::NotATranscriptionError if romanized.nil?

  # sure, it's called "Precise", but it should still be
  # as tolerant as possible in what it accepts as input...
  romanized = sanitize(romanized)

  puts "- (#{romanized.size}) [#{romanized}]".light_green if $dbg > 1

  # next, turn the input into a character array; the output is collected as an array, too
  romanized = romanized.chars
  arabic = []
  # the word-context helpers need the input as one string; the character array is never
  # modified below, so it is joined once here instead of once per character
  text = romanized.join
  # to be able to merge 2 romanized characters into 1 arabic character
  skip = false
  # print string like so: ʿ·A·b·b·ā·d·ī· ·M·u·ḥ·a·m·m·a·d· ·I·b·n· ·A·ḥ·m·a·d· ·I·b·n· ...
  puts "- (#{romanized.size}) [#{romanized.join('·')}]".light_green if $dbg > 1

  # loop over the romanized character array, filling the arabic one up as we go
  romanized.each_with_index do |ch, i|
    # a little bit of context
    pch = i == 0 ? nil : romanized[i-1]
    fch = romanized[i+1]
    ffch = romanized[i+2]

    # multi-letter skip-aheads
    if skip
      dbg "\t\tskipping #{ch}"
      # in the middle of "al-" (word-start) the dash has to be skipped as well
      skip = false if !(pch=='a' && fch=='-')
      next
    end

    # symbols to remove from input
    if [ZWNJ].include?(ch)
      dbg "\tskipping unprintable symbol"
      next
    end

    # deal with alif madda before "normal" hamza rules follow
    if "#{ch}#{fch}".match?(/ʾā/) || "#{pch}#{ch}".match?(/^Ā/)
      dbg "\talif madda #{R2A['ʾā']}"
      arabic << R2A['ʾā']
      skip = true
      next
    end

    # hamza followed by a short or long vowel
    if ch == 'ʾ' && %w[a i u ā ī ū].include?(fch.to_s.downcase)
      is_first_letter_of_word = (pch.nil? || pch.match(/\s+/))
      dbg "\t#{ch} with following #{fch}"
      arabic << hamza_before_following(fch, pch, is_first_letter_of_word)
      # the vowel is kept for the next round when the word ends in -a / -at
      skip = true unless this_word(text, i).match?(/(a$|at($|\s))/)
      next
    end

    # hamza preceded by a short vowel
    # (beware of a possible alif madda (would be dealt with above, on the next round))
    if fch.to_s == 'ʾ' && !ffch.to_s.match?(/[āĀ]/) && %w[a i u].include?(ch.downcase)
      is_first_letter_of_word = (pch.nil? || pch.match(/\s+/))
      dbg "\t#{fch} carried on or following preceding #{ch}"
      arabic << hamza_after_preceding(ch, is_first_letter_of_word)
      skip = true
      next
    end

    # find the article "al", marked by having a dash appended to it
    if "#{ch}#{fch}#{ffch}" == 'al-'
      dbg "\tarticle al- #{R2A['al-']}"
      arabic << R2A['al-']
      skip = true
      next
    end

    # unconditionally add spaces, dots and dashes to the output
    if ch=='.' && (fch.nil? || fch.match(/\s+/))
      dbg "\tinitial only (#{pch}#{ch})"
      arabic << ch
      next
    end
    if ch.match(PunctSepRgx) # white space or punctuation
      dbg "\tnon-letter (#{ch})"
      arabic << ch
      next
    end

    # a word-initial "a" or "u" must always be preceded by "ʾ"; only "i" can possibly *not* have one

    # deal with word-initial special cases
    if pch.to_s.strip.empty? # either beginning of string or of word
      if %w[a u].include?(ch)
        dbg "\tprepending #{ch} with hamza"
        arabic << R2A[ch.upcase]
        next
      end
      if ch == 'i'
        dbg "\thamza-less alif?"
        context = this_word(text, i)
        arabic << alif_for_word_initial_kasra(context.split(/^w?al-/).last)
        next
      end
    end

    # perform tashdeed
    if R2A[ch] && ch==fch
      out = R2A[ch]+Shadda
      dbg "\ttashdeed of #{ch} #{out}"
      arabic << out
      skip = true
      next
    end

    # should there be a ta'marbouta or not at the end of the word?
    # NOTE(review): i indexes the whole input while context1.length is the length of the
    # current word only - the two line up for the first word; confirm this is intended.
    context1 = this_word(text, i)
    context2 = this_word_and_the_next(text, i)
    if context1 == context2 # single word
      if (i == context1.length-2 && "#{ch}#{fch}".match?(/at$/)) \
         || (i == context1.length-1 && "#{ch}#{fch}".match?(/a$/))
        arabic << R2A['-at']+' '
        skip = true
        next
      end
    else # multiple words
      if i == context1.length-2 && "#{ch}#{fch}#{ffch}".match?(/at\s/)
        arabic << R2A['-a']+' '
        skip = true
        next
      elsif i == context1.length-1 && "#{ch}#{fch}".match?(/a\s/)
        arabic << R2A['-a']+' '
        next
      end
    end

    # letter ayn followed by uppercase vowel
    ar = nil
    if ch == 'ʿ'
      if %w[A I U].include?(fch)
        skip = true
        ar = R2A[ch]
      end
      case fch # ayn+following vowel at beginning of word
      when 'A' then ar += Fatha
      when 'I' then ar += Kasra
      when 'U' then ar += Damma
      end
    end
    if ar && ar.size==2
      dbg "\tayn+vowel #{ch}#{fch} #{ar}"
      arabic << ar
      next
    end

    # long "a" at word-end: alif maqsoorah, otherwise normal alif
    # "e" at word-end: letter hah, otherwise just a fatha
    if R2A[ch].class == Array
      choice = (fch.nil? || fch==' ') ? R2A[ch].first : R2A[ch].last
      dbg "\tcontextual #{ch} #{choice}"
      arabic << choice
      next
    end

    # exact match (pure transliteration, no transcription effort required)
    if R2A[ch]
      dbg "\tfrom table #{ch}→#{R2A[ch]}"
      arabic << R2A[ch]
      next
    end

    # no luck yet; might be a regular uppercase letter
    if R2A[ch.downcase]
      dbg "\tuppercased #{ch} #{R2A[ch.downcase]}"
      arabic << R2A[ch.downcase]
      next
    end

    # still no luck; last shot is punctuation
    if ch.match?(/[[:punct:]]/)
      dbg "\tinterpunctuation #{ch}"
      arabic << ch
      next
    end

    # mark unknown characters as such; the philosophy here being that input to
    # Precise should be pre-processed enough for this to never have to happen…
    warn "Warning: character '#{ch}' is unknown to Precise and will be substituted by placeholder only".yellow
    arabic << '�'
  end

  # character-array to word-array (nil entries left behind by rules without a result are dropped)
  arabic = arabic.compact.join.split
  # العأَبّادي محمّد إِبن أَحمد إِبن محمّد للهروي (but with () around "al")
  puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1

  # dragnet replacement of special words, such as changing "ibn" into "bin"
  2.times do |i|
    puts "#{' '*6}(postprocessing round #{i+1})".light_green if $dbg > 1
    PostR2AWordReplacements.each do |rgx, subst|
      arabic.map! do |w|
        puts "#{' '*8}word match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (w.match(rgx) && $dbg > 1)
        w.gsub(/-/, '') # dashes not needed anymore now
         .gsub(rgx, subst)
      end
    end
  end

  # some rules apply only in the context of words, not letters
  puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1
  arabic = arabic.join(' ')
  PostR2AContextReplacements.each do |rgx, subst|
    puts "#{' '*8}context match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (arabic.match(rgx) && $dbg > 1)
    arabic.gsub!(rgx, subst)
  end

  arabic.apply_options(@opts)
end
|
|
442
|
+
|
|
443
|
+
# Convenience entry point: builds a transcriber configured with +opts+ and returns
# the result of its #reverse for +romanized+.
def self.reverse(romanized, opts={})
  transcriber = new(opts)
  transcriber.reverse(romanized)
end
|
|
446
|
+
end
|
|
447
|
+
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
require 'fileutils'
|
|
2
|
+
|
|
3
|
+
module Precise
  using CoreExtensions

  # List of known word forms ("types"), used to estimate how plausible a transcription
  # result is. The list lives in res/types.lst next to the library and is downloaded on
  # first use.
  class TypesList
    # Process-wide cache of the loaded list, so that the file is read only once no
    # matter how many instances are created.
    @@types = nil

    # Makes sure the types file exists (downloading it if necessary) and loads it,
    # one entry per line, unless it has been loaded before.
    def initialize
      resdir = File.join __dir__, '..', '..', 'res'
      FileUtils.mkdir_p resdir
      typesfile = File.absolute_path(File.join(resdir, 'types.lst'))
      download(typesfile) unless File.exist? typesfile
      # a Set keeps the per-word membership test cheap for a long list
      require 'set'
      @@types ||= Set.new(File.readlines(typesfile, chomp: true))
      @types = @@types
    end

    # Fetches the types list from its GitHub repository and stores it at +path+.
    # The remote stream is closed again once it has been copied.
    def download(path)
      puts 'downloading types database (only needed once)...'
      require 'net/http'
      require 'open-uri'
      require 'progressbar'
      url = 'https://raw.githubusercontent.com/sixtyfive/arabic-types/main/types.lst'
      URI.open(url) { |data| IO.copy_stream data, path }
    end

    # Returns, as a Float between 0.0 and 100.0, the percentage of whitespace-separated
    # words of +string+ that appear in the types list. A string without any words
    # yields 0.0.
    def percentage_of_tokens_present(string)
      words = string.split
      return 0.0 if words.empty?

      n_present = words.count { |w| @types.include? w }
      100.0 / words.length * n_present
    end

    # Shortcut for TypesList.new.percentage_of_tokens_present(string).
    def self.percentage_of_tokens_present(string)
      new.percentage_of_tokens_present(string)
    end
  end
end
|
data/lib/precise.rb
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
require 'pp'
|
|
2
|
+
require 'slop'
|
|
3
|
+
require 'yaml'
|
|
4
|
+
require 'tiny_color'
|
|
5
|
+
|
|
6
|
+
deps = %w[version debugging error_classes core_extensions transcription transcription_r2a transcription_a2r types_list]
|
|
7
|
+
deps.each{|d| require_relative File.join(__dir__,'..','lib','precise',d)}
|
|
8
|
+
|
|
9
|
+
module Precise
  # Command-line front end: parses ARGV, picks the transcription direction from the
  # script of the input and prints the result.
  class CLI
    # Runs the whole program: option parsing, transcription, output.
    def initialize
      spec = Slop::Options.new
      spec.banner = "Usage: precise [options] <string(s)>\n"
      spec.separator " where options can be:\n"
      variants = Precise::Transcription::AlifVariants
      [
        ["-s", "--show-rules", "print the list of rules which are applied for transcription"],
        ["-c", "--confidence", "also print the percentage of output words appearing in a large corpus of Arabic"],
        ["-A", "--no-alif-variants", "all of #{variants.join("، ")} will be merged into ا"],
        ["-T", "--no-tashkeel", "diacritics (and non printables, such as tatweel) will be removed from output"],
        ["-P", "--no-punctuation", "all punctuation characters will be discarded from output"],
        ["-v", "--verbose", "instruct the backend classes to output debugging and plausibility information"],
        ["-h", "--help", "display this message"]
      ].each { |short, long, help| spec.bool short, long, help }
      spec.separator "\n Transcription direction is determined by presence of characters from the 'Arabic' Unicode block.\n" \
                     " At present, Arabic-to-Roman transcription is only rudimentary."
      parser = Slop::Parser.new(spec)

      begin
        @opts = parser.parse(ARGV)
        usage if @opts[:help] || ARGV.empty?
        rules if @opts.to_h[:show_rules]
      rescue
        # anything going wrong above: fall back to an empty parse and show the usage text
        @opts = parser.parse([])
        usage
      end

      # translate the negative command-line flags into transcription options
      settings = {verbose: @opts[:verbose]}
      {
        no_alif_variants: :alif_variants,
        no_tashkeel: :tashkeel,
        no_punctuation: :punctuation
      }.each do |flag, setting|
        settings[setting] = false if @opts.to_h[flag]
      end

      input = @opts.arguments.join(' ')
      result =
        if input.match?(/\p{Arabic}/)
          Precise::Transcription.transcribe(input.dup, settings)
        else
          arabic = Precise::Transcription.reverse(input.dup, settings)
          arabic += " (#{Precise::TypesList::percentage_of_tokens_present(arabic)}%)" if @opts[:confidence]
          arabic
        end
      puts result.pretty_inspect.gsub(/(^"|"$)/, "").strip
    end

    # Prints the option summary to stderr and terminates the program.
    def usage
      warn @opts
      exit
    end

    # Prints the transcription rule tables as YAML and terminates the program.
    def rules
      tables = Precise::Transcription::R2ATables.map do |name, table|
        { name.to_s => table.map { |key, value| { key.to_s => value } } }
      end
      puts tables.to_yaml.gsub(/---\n/,'')
      exit
    end

    # Number of options that carry a truthy value.
    def nopts
      @opts.to_h.values.count { |value| value }
    end

    # Entry point used by the executable.
    def self.start
      new
    end
  end
end
|
data/precise.gemspec
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lib/precise/version'
|
|
4
|
+
|
|
5
|
+
# Gem specification for "precise".
Gem::Specification.new do |spec|
  spec.name = 'precise'
  spec.version = Precise::VERSION
  spec.authors = ['J. R. Schmid']
  spec.email = ['jrs+git@weitnahbei.de']

  spec.summary = 'Arabic to DMG-like (but more precise) and back'
  # arabicising a romanisation yields Arabic script, not Latin script
  spec.description = 'Romanise Arabic script, arabicise romanisations of Arabic script back into Arabic script'
  spec.homepage = 'https://rubygems.org/gems/precise'
  spec.required_ruby_version = '>= 2.7.0'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/sixtyfive/precise.git'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git;
  # this gemspec's own path as seen by Ruby (__FILE__), test directories and CI
  # configuration are filtered out.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = 'exe'
  # every file below exe/ becomes an executable of the gem
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # dependencies

  spec.add_dependency 'slop'
  spec.add_dependency 'tiny_color'
  spec.add_dependency 'progressbar'
end
|
metadata
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: precise
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.8
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- J. R. Schmid
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2023-03-14 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: slop
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: tiny_color
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: progressbar
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
description: 'Romanise Arabic script, arabicise romanisations of Arabic script back
|
|
56
|
+
into Latin script '
|
|
57
|
+
email:
|
|
58
|
+
- jrs+git@weitnahbei.de
|
|
59
|
+
executables:
|
|
60
|
+
- precise
|
|
61
|
+
extensions: []
|
|
62
|
+
extra_rdoc_files: []
|
|
63
|
+
files:
|
|
64
|
+
- Gemfile
|
|
65
|
+
- Gemfile.lock
|
|
66
|
+
- LICENSE.md
|
|
67
|
+
- README.md
|
|
68
|
+
- Rakefile
|
|
69
|
+
- TODO.md
|
|
70
|
+
- exe/precise
|
|
71
|
+
- lib/precise.rb
|
|
72
|
+
- lib/precise/core_extensions.rb
|
|
73
|
+
- lib/precise/debugging.rb
|
|
74
|
+
- lib/precise/error_classes.rb
|
|
75
|
+
- lib/precise/transcription.rb
|
|
76
|
+
- lib/precise/transcription_a2r.rb
|
|
77
|
+
- lib/precise/transcription_r2a.rb
|
|
78
|
+
- lib/precise/types_list.rb
|
|
79
|
+
- lib/precise/version.rb
|
|
80
|
+
- precise.gemspec
|
|
81
|
+
homepage: https://rubygems.org/gems/precise
|
|
82
|
+
licenses: []
|
|
83
|
+
metadata:
|
|
84
|
+
homepage_uri: https://rubygems.org/gems/precise
|
|
85
|
+
source_code_uri: https://github.com/sixtyfive/precise.git
|
|
86
|
+
post_install_message:
|
|
87
|
+
rdoc_options: []
|
|
88
|
+
require_paths:
|
|
89
|
+
- lib
|
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
91
|
+
requirements:
|
|
92
|
+
- - ">="
|
|
93
|
+
- !ruby/object:Gem::Version
|
|
94
|
+
version: 2.7.0
|
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
96
|
+
requirements:
|
|
97
|
+
- - ">="
|
|
98
|
+
- !ruby/object:Gem::Version
|
|
99
|
+
version: '0'
|
|
100
|
+
requirements: []
|
|
101
|
+
rubygems_version: 3.3.25
|
|
102
|
+
signing_key:
|
|
103
|
+
specification_version: 4
|
|
104
|
+
summary: Arabic to DMG-like (but more precise) and back
|
|
105
|
+
test_files: []
|