precise 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +64 -0
- data/LICENSE.md +163 -0
- data/README.md +41 -0
- data/Rakefile +25 -0
- data/TODO.md +42 -0
- data/exe/precise +10 -0
- data/lib/precise/core_extensions.rb +24 -0
- data/lib/precise/debugging.rb +1 -0
- data/lib/precise/error_classes.rb +19 -0
- data/lib/precise/transcription.rb +11 -0
- data/lib/precise/transcription_a2r.rb +123 -0
- data/lib/precise/transcription_r2a.rb +447 -0
- data/lib/precise/types_list.rb +37 -0
- data/lib/precise/version.rb +5 -0
- data/lib/precise.rb +67 -0
- data/precise.gemspec +35 -0
- metadata +105 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: de5da64054f1063eb0129a6aa14d03bc457ba1cdc68d9748618210e0c414f001
|
4
|
+
data.tar.gz: 738931cb458919672e14d9af523e18bfc9aecbde2b3efecefc4b85daa0e44446
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7d6b2d48809192ad96abcc2401feb6598fdaed2a30f039b2d0f0da13fd98d9ac76b4b6233fbaf1e17e2fe4825c1530e97a692f0d9332380d67f0dbb22d295f9b
|
7
|
+
data.tar.gz: 896f8c9cf2f58415d623e26ec69491f5e697166057faa7b5a692c127a227dd73999ab96c0e40e2bb4f267bb7441cba78a62e669ddb71e4ec5f3f2603bff1fabf
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
precise (0.1.8)
|
5
|
+
progressbar
|
6
|
+
slop
|
7
|
+
tiny_color
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
ast (2.4.2)
|
13
|
+
coderay (1.1.3)
|
14
|
+
json (2.6.3)
|
15
|
+
language_server-protocol (3.17.0.3)
|
16
|
+
method_source (1.0.0)
|
17
|
+
minitest (5.18.0)
|
18
|
+
parallel (1.22.1)
|
19
|
+
parser (3.2.1.1)
|
20
|
+
ast (~> 2.4.1)
|
21
|
+
progressbar (1.13.0)
|
22
|
+
pry (0.14.2)
|
23
|
+
coderay (~> 1.1)
|
24
|
+
method_source (~> 1.0)
|
25
|
+
rainbow (3.1.1)
|
26
|
+
rake (13.0.6)
|
27
|
+
regexp_parser (2.7.0)
|
28
|
+
rexml (3.2.5)
|
29
|
+
rubocop (1.44.1)
|
30
|
+
json (~> 2.3)
|
31
|
+
parallel (~> 1.10)
|
32
|
+
parser (>= 3.2.0.0)
|
33
|
+
rainbow (>= 2.2.2, < 4.0)
|
34
|
+
regexp_parser (>= 1.8, < 3.0)
|
35
|
+
rexml (>= 3.2.5, < 4.0)
|
36
|
+
rubocop-ast (>= 1.24.1, < 2.0)
|
37
|
+
ruby-progressbar (~> 1.7)
|
38
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
39
|
+
rubocop-ast (1.27.0)
|
40
|
+
parser (>= 3.2.1.0)
|
41
|
+
rubocop-performance (1.15.2)
|
42
|
+
rubocop (>= 1.7.0, < 2.0)
|
43
|
+
rubocop-ast (>= 0.4.0)
|
44
|
+
ruby-progressbar (1.13.0)
|
45
|
+
slop (4.10.1)
|
46
|
+
standard (1.24.3)
|
47
|
+
language_server-protocol (~> 3.17.0.2)
|
48
|
+
rubocop (= 1.44.1)
|
49
|
+
rubocop-performance (= 1.15.2)
|
50
|
+
tiny_color (1.2.2)
|
51
|
+
unicode-display_width (2.4.2)
|
52
|
+
|
53
|
+
PLATFORMS
|
54
|
+
x86_64-linux
|
55
|
+
|
56
|
+
DEPENDENCIES
|
57
|
+
minitest
|
58
|
+
precise!
|
59
|
+
pry
|
60
|
+
rake
|
61
|
+
standard
|
62
|
+
|
63
|
+
BUNDLED WITH
|
64
|
+
2.4.1
|
data/LICENSE.md
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
GNU Lesser General Public License
|
2
|
+
=================================
|
3
|
+
|
4
|
+
_Version 3, 29 June 2007_
|
5
|
+
_Copyright © 2007 Free Software Foundation, Inc. <<http://fsf.org/>>_
|
6
|
+
|
7
|
+
Everyone is permitted to copy and distribute verbatim copies
|
8
|
+
of this license document, but changing it is not allowed.
|
9
|
+
|
10
|
+
|
11
|
+
This version of the GNU Lesser General Public License incorporates
|
12
|
+
the terms and conditions of version 3 of the GNU General Public
|
13
|
+
License, supplemented by the additional permissions listed below.
|
14
|
+
|
15
|
+
### 0. Additional Definitions
|
16
|
+
|
17
|
+
As used herein, “this License” refers to version 3 of the GNU Lesser
|
18
|
+
General Public License, and the “GNU GPL” refers to version 3 of the GNU
|
19
|
+
General Public License.
|
20
|
+
|
21
|
+
“The Library” refers to a covered work governed by this License,
|
22
|
+
other than an Application or a Combined Work as defined below.
|
23
|
+
|
24
|
+
An “Application” is any work that makes use of an interface provided
|
25
|
+
by the Library, but which is not otherwise based on the Library.
|
26
|
+
Defining a subclass of a class defined by the Library is deemed a mode
|
27
|
+
of using an interface provided by the Library.
|
28
|
+
|
29
|
+
A “Combined Work” is a work produced by combining or linking an
|
30
|
+
Application with the Library. The particular version of the Library
|
31
|
+
with which the Combined Work was made is also called the “Linked
|
32
|
+
Version”.
|
33
|
+
|
34
|
+
The “Minimal Corresponding Source” for a Combined Work means the
|
35
|
+
Corresponding Source for the Combined Work, excluding any source code
|
36
|
+
for portions of the Combined Work that, considered in isolation, are
|
37
|
+
based on the Application, and not on the Linked Version.
|
38
|
+
|
39
|
+
The “Corresponding Application Code” for a Combined Work means the
|
40
|
+
object code and/or source code for the Application, including any data
|
41
|
+
and utility programs needed for reproducing the Combined Work from the
|
42
|
+
Application, but excluding the System Libraries of the Combined Work.
|
43
|
+
|
44
|
+
### 1. Exception to Section 3 of the GNU GPL
|
45
|
+
|
46
|
+
You may convey a covered work under sections 3 and 4 of this License
|
47
|
+
without being bound by section 3 of the GNU GPL.
|
48
|
+
|
49
|
+
### 2. Conveying Modified Versions
|
50
|
+
|
51
|
+
If you modify a copy of the Library, and, in your modifications, a
|
52
|
+
facility refers to a function or data to be supplied by an Application
|
53
|
+
that uses the facility (other than as an argument passed when the
|
54
|
+
facility is invoked), then you may convey a copy of the modified
|
55
|
+
version:
|
56
|
+
|
57
|
+
* **a)** under this License, provided that you make a good faith effort to
|
58
|
+
ensure that, in the event an Application does not supply the
|
59
|
+
function or data, the facility still operates, and performs
|
60
|
+
whatever part of its purpose remains meaningful, or
|
61
|
+
|
62
|
+
* **b)** under the GNU GPL, with none of the additional permissions of
|
63
|
+
this License applicable to that copy.
|
64
|
+
|
65
|
+
### 3. Object Code Incorporating Material from Library Header Files
|
66
|
+
|
67
|
+
The object code form of an Application may incorporate material from
|
68
|
+
a header file that is part of the Library. You may convey such object
|
69
|
+
code under terms of your choice, provided that, if the incorporated
|
70
|
+
material is not limited to numerical parameters, data structure
|
71
|
+
layouts and accessors, or small macros, inline functions and templates
|
72
|
+
(ten or fewer lines in length), you do both of the following:
|
73
|
+
|
74
|
+
* **a)** Give prominent notice with each copy of the object code that the
|
75
|
+
Library is used in it and that the Library and its use are
|
76
|
+
covered by this License.
|
77
|
+
* **b)** Accompany the object code with a copy of the GNU GPL and this license
|
78
|
+
document.
|
79
|
+
|
80
|
+
### 4. Combined Works
|
81
|
+
|
82
|
+
You may convey a Combined Work under terms of your choice that,
|
83
|
+
taken together, effectively do not restrict modification of the
|
84
|
+
portions of the Library contained in the Combined Work and reverse
|
85
|
+
engineering for debugging such modifications, if you also do each of
|
86
|
+
the following:
|
87
|
+
|
88
|
+
* **a)** Give prominent notice with each copy of the Combined Work that
|
89
|
+
the Library is used in it and that the Library and its use are
|
90
|
+
covered by this License.
|
91
|
+
|
92
|
+
* **b)** Accompany the Combined Work with a copy of the GNU GPL and this license
|
93
|
+
document.
|
94
|
+
|
95
|
+
* **c)** For a Combined Work that displays copyright notices during
|
96
|
+
execution, include the copyright notice for the Library among
|
97
|
+
these notices, as well as a reference directing the user to the
|
98
|
+
copies of the GNU GPL and this license document.
|
99
|
+
|
100
|
+
* **d)** Do one of the following:
|
101
|
+
- **0)** Convey the Minimal Corresponding Source under the terms of this
|
102
|
+
License, and the Corresponding Application Code in a form
|
103
|
+
suitable for, and under terms that permit, the user to
|
104
|
+
recombine or relink the Application with a modified version of
|
105
|
+
the Linked Version to produce a modified Combined Work, in the
|
106
|
+
manner specified by section 6 of the GNU GPL for conveying
|
107
|
+
Corresponding Source.
|
108
|
+
- **1)** Use a suitable shared library mechanism for linking with the
|
109
|
+
Library. A suitable mechanism is one that **(a)** uses at run time
|
110
|
+
a copy of the Library already present on the user's computer
|
111
|
+
system, and **(b)** will operate properly with a modified version
|
112
|
+
of the Library that is interface-compatible with the Linked
|
113
|
+
Version.
|
114
|
+
|
115
|
+
* **e)** Provide Installation Information, but only if you would otherwise
|
116
|
+
be required to provide such information under section 6 of the
|
117
|
+
GNU GPL, and only to the extent that such information is
|
118
|
+
necessary to install and execute a modified version of the
|
119
|
+
Combined Work produced by recombining or relinking the
|
120
|
+
Application with a modified version of the Linked Version. (If
|
121
|
+
you use option **4d0**, the Installation Information must accompany
|
122
|
+
the Minimal Corresponding Source and Corresponding Application
|
123
|
+
Code. If you use option **4d1**, you must provide the Installation
|
124
|
+
Information in the manner specified by section 6 of the GNU GPL
|
125
|
+
for conveying Corresponding Source.)
|
126
|
+
|
127
|
+
### 5. Combined Libraries
|
128
|
+
|
129
|
+
You may place library facilities that are a work based on the
|
130
|
+
Library side by side in a single library together with other library
|
131
|
+
facilities that are not Applications and are not covered by this
|
132
|
+
License, and convey such a combined library under terms of your
|
133
|
+
choice, if you do both of the following:
|
134
|
+
|
135
|
+
* **a)** Accompany the combined library with a copy of the same work based
|
136
|
+
on the Library, uncombined with any other library facilities,
|
137
|
+
conveyed under the terms of this License.
|
138
|
+
* **b)** Give prominent notice with the combined library that part of it
|
139
|
+
is a work based on the Library, and explaining where to find the
|
140
|
+
accompanying uncombined form of the same work.
|
141
|
+
|
142
|
+
### 6. Revised Versions of the GNU Lesser General Public License
|
143
|
+
|
144
|
+
The Free Software Foundation may publish revised and/or new versions
|
145
|
+
of the GNU Lesser General Public License from time to time. Such new
|
146
|
+
versions will be similar in spirit to the present version, but may
|
147
|
+
differ in detail to address new problems or concerns.
|
148
|
+
|
149
|
+
Each version is given a distinguishing version number. If the
|
150
|
+
Library as you received it specifies that a certain numbered version
|
151
|
+
of the GNU Lesser General Public License “or any later version”
|
152
|
+
applies to it, you have the option of following the terms and
|
153
|
+
conditions either of that published version or of any later version
|
154
|
+
published by the Free Software Foundation. If the Library as you
|
155
|
+
received it does not specify a version number of the GNU Lesser
|
156
|
+
General Public License, you may choose any version of the GNU Lesser
|
157
|
+
General Public License ever published by the Free Software Foundation.
|
158
|
+
|
159
|
+
If the Library as you received it specifies that a proxy can decide
|
160
|
+
whether future versions of the GNU Lesser General Public License shall
|
161
|
+
apply, that proxy's public statement of acceptance of any version is
|
162
|
+
permanent authorization for you to choose that version for the
|
163
|
+
Library.
|
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
## Command line usage
|
2
|
+
|
3
|
+
Install the gem and see the help message by executing:
|
4
|
+
|
5
|
+
$ gem install precise
|
6
|
+
$ precise -h
|
7
|
+
|
8
|
+
Arabicise a string of Romanisation:
|
9
|
+
|
10
|
+
$ precise -T 'bi-smi llāhi al-raḥmani al-raḥīm' # -T removes Tashkeel
|
11
|
+
|
12
|
+
Romanise a string of Arabic (experimental):
|
13
|
+
|
14
|
+
$ precise 'بسم الله الرحمن الرحيم' # (not able to infer Tashkeel!)
|
15
|
+
|
16
|
+
## Usage inside of another application
|
17
|
+
|
18
|
+
Install the gem and add to the application's Gemfile by executing:
|
19
|
+
|
20
|
+
$ bundle add precise
|
21
|
+
$ bundle install
|
22
|
+
|
23
|
+
You can then access the API like so:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
require 'precise'
|
27
|
+
Precise::Transcription.reverse 'bi-smi llāhi al-raḥmani al-raḥīm'
|
28
|
+
Precise::Transcription.transcribe 'بسم الله الرحمن الرحيم'
|
29
|
+
```
|
30
|
+
|
31
|
+
## Development
|
32
|
+
|
33
|
+
After checking out the repository, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
34
|
+
|
35
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
36
|
+
|
37
|
+
Issues and PRs are welcome!
|
38
|
+
|
39
|
+
## Funding
|
40
|
+
|
41
|
+
This gem was developed within the long-term research project [Bibliotheca Arabica](http://www.bibliotheca-arabica.de) hosted at the Saxon Academy of Sciences and Humanities in Leipzig, Germany. _Bibliotheca Arabica_ is part of the [German Academies’ Programme](https://www.akademienunion.de/en/research/the-academies-programme) and funded by the Federal Republic of Germany and the Free State of Saxony.
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true

require 'bundler'
Bundler.require

require 'bundler/gem_tasks'
require 'rake/testtask'

# `rake test`: run every test/**/test_*.rb with test/ and lib/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'test'
  t.libs << 'lib'
  t.test_files = FileList['test/**/test_*.rb']
end

# The task increments the LAST dot-separated segment of Precise::VERSION
# (e.g. 0.1.8 -> 0.1.9), i.e. the patch level, and rewrites the version file
# by replacing the old version string with the new one.
desc 'increase patch version number by one'
task :bump do
  current = Precise::VERSION
  segments = current.split('.')
  segments[-1] = (segments[-1].to_i + 1).to_s
  bumped = segments.join('.')
  version_file = 'lib/precise/version.rb'
  File.write(version_file, File.read(version_file).gsub(current, bumped))
end

# Running plain `rake` just lists the available tasks.
task :default do; system 'rake -T'; end
|
data/TODO.md
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
Refactor to follow this pattern:
|
2
|
+
|
3
|
+
```ruby
|
4
|
+
|
5
|
+
# short, romanised root <=> word list for learners: https://wahiduddin.net/words/arabic_glossary.htm
|
6
|
+
# commercial root <=> word dict: http://www.arabicroot.com/Home/Introduction
|
7
|
+
# possibly a good idea to OCR wehr 5 and make a dict from that?
|
8
|
+
|
9
|
+
def arabic_roots(opts); ['ʿwl','msʾ'].reject{|r| !r.include? opts[:with_letter]}.compact; end #
|
10
|
+
|
11
|
+
# 2005: http://jeffcoombs.com/isri/Taghva2005b.pdf
|
12
|
+
# 2006: NN-based: https://ieeexplore.ieee.org/document/4115547
|
13
|
+
# 2007: https://ieeexplore.ieee.org/document/4230974/
|
14
|
+
# 2014: https://journals.sagepub.com/doi/abs/10.1177/0165551514526348?journalCode=jisb
|
15
|
+
# 2016: https://www.sciencedirect.com/science/article/pii/S1319157815001342
|
16
|
+
# 2015: https://www.sciencedirect.com/science/article/pii/S1319157815000166
|
17
|
+
# metastudy (also 2015): https://www.sciencedirect.com/science/article/pii/S1319157815000166
|
18
|
+
# 2017: https://www.accentsjournals.org/PaperDirectory/Journal/IJACR/2018/3/3.pdf
|
19
|
+
# anything newer???
|
20
|
+
# some of the above testable at: http://arabic.emi.ac.ma:8080/SafarWeb/faces/safar/morphology/stemmer.xhtml
|
21
|
+
|
22
|
+
def extract_root(word); {'ʿāʾila':'ʿwl','masāʾikà':'msʾ'}[word.to_sym]; end
|
23
|
+
|
24
|
+
# with the above two in place:
|
25
|
+
|
26
|
+
arabic = %w[ʿāʾila masāʾikà].map{|s|
|
27
|
+
words = s.split ' '
|
28
|
+
words.map{|w|
|
29
|
+
w.gsub! /āʾi/, arabic_roots(with_letter: 'ʾ').include?(extract_root(w)) ? 'āSTANDALONE_HAMZAi' : 'āYA_AS_HAMZA_CARRIERi'
|
30
|
+
[
|
31
|
+
{'YA_AS_HAMZA_CARRIER':'ﺉ', 'STANDALONE_HAMZA':'ﺀ'},
|
32
|
+
{'ʿ':'ﻉ', 'ā':'ﺍ', 'i':'ِ◌', 'l':'ﻝ', 'a':'َ◌', 'm':'ﻡ', 's':'ﺱ', 'k':'ﻙ', 'à':'َ◌'}
|
33
|
+
].each{|list| list.each{|k,v| w.gsub! k.to_s, v}}
|
34
|
+
w.gsub! /◌$/, 'ﺓ'
|
35
|
+
}
|
36
|
+
words.join(' ').gsub('◌','')
|
37
|
+
}
|
38
|
+
|
39
|
+
# use actual tests from current code instead; also generate more from existing known-good data!
|
40
|
+
|
41
|
+
tests = (arabic == ["ﻉﺎﺌِﻟَﺓ", "ﻢَﺳﺍﺀِﻙَﺓ"])
|
42
|
+
```
|
data/exe/precise
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# Generic refinements shared by the transcription code.
module CoreExtensions
  refine String do
    # Returns a copy of the string with its leading letter capitalised.
    # Strings starting with a, i or u are returned unchanged. Otherwise the
    # first character is upcased (unless the string starts with "al"), and
    # when the first character is ʾ or ʿ the second character is upcased.
    def precise_titlecase
      letters = chars
      lead = letters[0]
      unless %w[a i u].include?(lead)
        letters[0] = letters[0].upcase if letters.size > 0 && self[0..1] != 'al'
        letters[1] = letters[1].upcase if letters.size > 1 && %w[ʾ ʿ].include?(lead)
      end
      letters.join
    end
  end

  refine Array do
    # Returns a new array holding each element's #to_s, encoded as UTF-8.
    def each_utf8_encode
      collect { |item| item.to_s.encode('utf-8') }
    end
  end

  refine Hash do
    # Returns a new hash whose keys are stringified; Array values become
    # arrays of strings, every other value becomes its #to_s.
    def keys_and_values_to_s
      each_with_object({}) do |(key, value), out|
        out[key.to_s] = value.class == Array ? value.map { |item| item.to_s } : value.to_s
      end
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
# Minimal debug helper: defines the global verbosity level $dbg (initially 0)
# and a top-level `dbg` method that prints its argument when $dbg > 0.
# A top-level `def` creates a PRIVATE method, so the guard must ask
# respond_to? to include private methods; otherwise it is always true and
# evaluating this file again would reset $dbg to 0.
unless respond_to?(:dbg, true)
  $dbg = 0
  def dbg(str)
    puts str if $dbg > 0
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Precise
  # Common ancestor of every error defined in this file, so that callers can
  # `rescue Precise::Error` to catch them all. It is still a StandardError,
  # so existing `rescue StandardError` / bare `rescue` code keeps working.
  class Error < StandardError; end

  # Raised when an input string cannot be transcribed.
  # +exception_type+ is a symbol tag stored alongside the message.
  class TranscriptionError < Error
    attr_reader :exception_type

    def initialize(msg = "unable to transcribe input string", exception_type = :untranscribable)
      @exception_type = exception_type
      super(msg)
    end
  end

  # Raised when the input is not (entirely) a romanisation of Arabic.
  # +exception_type+ is a symbol tag stored alongside the message.
  class NotATranscriptionError < Error
    attr_reader :exception_type

    def initialize(msg = "input string is not (entirely) a romanisation of Arabic", exception_type = :untranscribable)
      @exception_type = exception_type
      super(msg)
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Precise
  class Transcription
    # Builds a transcription object.
    #
    # opts - Hash merged over the defaults {punctuation: true, verbosity: 0}.
    #        A `verbose: true` entry is removed from the options and adds 2
    #        to :verbosity.
    #
    # Side effect: the resulting verbosity is added to the global $dbg level.
    def initialize(opts = {})
      default_options = {punctuation: true, verbosity: 0}
      @opts = default_options.merge(opts)
      @opts[:verbosity] += 2 if @opts.delete(:verbose) == true
      # $dbg is nil when nothing has initialised it yet; nil.to_i is 0, so
      # this no longer raises NoMethodError in that case.
      $dbg = $dbg.to_i + @opts[:verbosity]
      @out_chunks = []
    end
  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
module Precise
|
2
|
+
|
3
|
+
using CoreExtensions # the more generic ones
|
4
|
+
|
5
|
+
class Transcription
|
6
|
+
# Assembles the romanised text from @out_chunks.
# Chunks that consist solely of one of the known unvocalised forms are
# replaced by their vocalised spelling, the chunks are joined with spaces,
# Arabic question mark and comma become their ASCII counterparts, whitespace
# before punctuation is removed, and the final substitution rewrites an
# opening parenthesis followed by whitespace as ' ('.
def transcription
  whole_chunk_expansions = [
    [/^m$/, 'mīlādī'],
    [/^h$/, 'hijrī'],
    [/^wāltī$/, 'wa-l-lātī'],
    [/^wālḏī$/, 'wa-l-lāḏī'],
    [/^hy$/, 'hiya'],
    [/^ʿlá$/, 'ʿalá'],
    [/^mn$/, 'min'],
    [/^yd$/, 'yad'],
    [/^fy$/, 'fī'],
    [/^lhā$/, 'lahā']
  ]
  expanded = @out_chunks.map do |chunk|
    # apply the expansions in table order, exactly one pass each
    whole_chunk_expansions.inject(chunk) { |text, (pattern, full)| text.gsub(pattern, full) }
  end
  text = expanded.join(' ')
  text = text.gsub('؟', '?')
  text = text.gsub('،', ',')
  text = text.gsub(/\s+([[:punct:]]+)/, '\1')
  text.gsub(/(?!(\s+|^))\(\s+/, ' (')
end
|
25
|
+
|
26
|
+
A2R = A2RTable = {
|
27
|
+
"ال": "al-",
|
28
|
+
"ء": "ʾ",
|
29
|
+
"آ": "ʾā",
|
30
|
+
"أ": "ʾa",
|
31
|
+
"أُ": "ʾu",
|
32
|
+
"إ": "ʾi",
|
33
|
+
"ا": "ā",
|
34
|
+
"ب": "b",
|
35
|
+
"ة": "a",
|
36
|
+
"ت": "t",
|
37
|
+
"ث": "ṯ",
|
38
|
+
"ج": "ǧ",
|
39
|
+
"ح": "ḥ",
|
40
|
+
"خ": "ḫ",
|
41
|
+
"د": "d",
|
42
|
+
"ذ": "ḏ",
|
43
|
+
"ر": "r",
|
44
|
+
"ز": "z",
|
45
|
+
"س": "s",
|
46
|
+
"ش": "š",
|
47
|
+
"ص": "ṣ",
|
48
|
+
"ض": "ḍ",
|
49
|
+
"ط": "ṭ",
|
50
|
+
"ظ": "ẓ",
|
51
|
+
"ع": "ʿ",
|
52
|
+
"غ": "ġ",
|
53
|
+
"ف": "f",
|
54
|
+
"ق": "q",
|
55
|
+
"ك": "k",
|
56
|
+
"ل": "l",
|
57
|
+
"م": "m",
|
58
|
+
"ن": "n",
|
59
|
+
"ه": "h",
|
60
|
+
"و": ["ū", "w"],
|
61
|
+
"ى": "á",
|
62
|
+
"ي": ["ī", "y"],
|
63
|
+
"َ": "a",
|
64
|
+
"ُ": "u",
|
65
|
+
"ِ": "i",
|
66
|
+
"پ": "p",
|
67
|
+
"چ": "č",
|
68
|
+
"ژ": "ž",
|
69
|
+
"گ": "g",
|
70
|
+
"٠": "0",
|
71
|
+
"١": "1",
|
72
|
+
"٢": "2",
|
73
|
+
"٣": "3",
|
74
|
+
"٤": "4",
|
75
|
+
"٥": "5",
|
76
|
+
"٦": "6",
|
77
|
+
"٧": "7",
|
78
|
+
"٨": "8",
|
79
|
+
"٩": "9",
|
80
|
+
}.map{|k,v| [k.to_s, v]}.to_h
|
81
|
+
SHADDA=' ّ'.strip
|
82
|
+
|
83
|
+
# Romanises +arabic+ and appends the result, chunk by chunk, to @out_chunks.
#
# The input is split on runs of whitespace, ASCII digits and punctuation.
# Blank separators are dropped, other separators are stored stripped, and
# every remaining chunk is converted character by character through A2R:
# - a leading "ال" becomes A2R['ال'] and consumes two characters;
# - characters with two readings in A2R (Array values: [long vowel,
#   consonant]) use the consonant reading at the start or end of the chunk
#   or when a shadda follows, and the long-vowel reading otherwise;
# - a shadda repeats the romanisation of the preceding character;
# - characters without an A2R entry are skipped.
#
# Returns the array of input chunks.
def transcribe(arabic)
  non_word_rgx = /([\s\d[:punct:]]+)/
  arabic.split(non_word_rgx).each do |chunk|
    next if chunk.strip.empty?
    (@out_chunks << chunk.strip; next) if chunk.match?(non_word_rgx)

    chars = chunk.chars
    skip = 0
    @out_chunks << +'' # unfrozen buffer for this word
    chars.each_with_index do |ch, j|
      (skip -= 1; next) if skip > 0
      if j == 0 && chunk.match?(/^ال/)
        @out_chunks[-1] << A2R['ال']
        skip += 1
        next
      end

      if ch == SHADDA
        # Previously this appended A2R[previous] unconditionally, which
        # raised TypeError when that entry was an Array (و / ي) or missing,
        # and picked up the LAST character when the shadda came first.
        doubled = j > 0 ? A2R[chars[j - 1]] : nil
        doubled = doubled[-1] if doubled.is_a?(Array)
        @out_chunks[-1] << doubled if doubled
        next
      end

      romanized = A2R[ch]
      if romanized.is_a?(Array)
        consonantal = j == 0 || j + 1 == chunk.length || chars[j + 1] == SHADDA
        romanized = consonantal ? romanized[-1] : romanized[0]
      end
      @out_chunks[-1] << romanized if romanized
    end
  end
end
|
114
|
+
|
115
|
+
# Convenience entry point: warns (in yellow) that the romanisation is
# incomplete, then runs a fresh instance built from +opts+ over +arabic+
# and returns that instance's #transcription.
def self.transcribe(arabic, opts={})
  [
    "Romanisation is incomplete.",
    "Consider adding short vowels by hand as needed."
  ].each { |notice| warn notice.yellow }
  transcriber = new(opts)
  transcriber.transcribe(arabic)
  transcriber.transcription
end
|
122
|
+
end
|
123
|
+
end
|
@@ -0,0 +1,447 @@
|
|
1
|
+
module Precise
|
2
|
+
|
3
|
+
using CoreExtensions # the more generic ones
|
4
|
+
|
5
|
+
module CoreExtensions # the ones specific to this module
  refine String do
    # Returns a stripped copy of the string with optional features removed.
    # The default output is "with everything"; each option that is set to a
    # falsy value removes the corresponding feature:
    #
    # opts[:punctuation]   - when falsy, delete all punctuation
    # opts[:tashkeel]      - when falsy, delete every character listed in
    #                        Precise::Transcription::Tashkeel and
    #                        Precise::Transcription::Nonprintables
    # opts[:alif_variants] - when falsy, replace every character listed in
    #                        Precise::Transcription::AlifVariants by a plain alif
    def apply_options(opts)
      defaults = {punctuation: true, tashkeel: true, alif_variants: true}
      opts = defaults.merge opts
      s = self.dup

      if !opts[:punctuation]
        s = s.gsub(/[[:punct:]]+/,'')
      end

      if !opts[:tashkeel]
        tashkeel = Precise::Transcription::Tashkeel
        nonprintables = Precise::Transcription::Nonprintables
        extraneous_chars = (tashkeel + nonprintables).join
        s = s.gsub(/[#{extraneous_chars}]/,'')
      end

      if !opts[:alif_variants]
        # AlifVariants is an Array: it has to be joined before interpolation.
        # Interpolating the Array itself inserts its #inspect form, which
        # puts quotes, commas and spaces into the character class as well.
        alif_variants = Precise::Transcription::AlifVariants.join
        s = s.gsub(/[#{alif_variants}]/,'ا')
      end

      return s.strip
    end
  end
end
|
34
|
+
|
35
|
+
class Transcription
|
36
|
+
using Precise::CoreExtensions
|
37
|
+
|
38
|
+
# Ruby would have been fine with these in the file verbatim (on their own),
|
39
|
+
# alas, my editor's syntax highlighting can't cope, so doing it 1990s-style
|
40
|
+
Fatha, Kasra, Damma, Shadda = ["\u064e", "\u0650", "\u064f", "\u0651"].each_utf8_encode
|
41
|
+
# nonprintables
|
42
|
+
R2LM, L2RM, ZWNJ = ["\u200f", "\u200e", "\u200c"].each_utf8_encode
|
43
|
+
# typographic modifiers, ligatures, oft-used words
|
44
|
+
Tatweel, Allah = ["ـ", "الله"]
|
45
|
+
# the various forms of alif, ya and waw
|
46
|
+
AlifVariants = ['أ', 'إ', 'آ', 'ا', 'ٱ']
|
47
|
+
AlifHamzaAbove, AlifHamzaBelow, AlifMadda, Alif, AlifWasla = AlifVariants
|
48
|
+
YaHamzaAbove, Ya = ['ئ', 'ي']
|
49
|
+
WawHamzaAbove, Waw = ['ؤ', 'و']
|
50
|
+
# other character lists
|
51
|
+
# The code points U+064B..U+065B, each as a one-character UTF-8 string
# (built with Array#pack instead of eval'ing a generated "\uXXXX" literal).
Tashkeel = (0x064B..0x065B).map { |codepoint| [codepoint].pack('U') }
|
52
|
+
Nonprintables = [R2LM, L2RM]
|
53
|
+
|
54
|
+
R2ATables = {
|
55
|
+
# Adapted from the Transcription in the Brill PDF's "Note to the Indices":
|
56
|
+
# - a dash, depending on its position, denotes the start or end of the word
|
57
|
+
# - an array denotes the requirement for a choice to be made from context
|
58
|
+
# - any characters that are being replaced by DMG characters have been omitted
|
59
|
+
common: {
|
60
|
+
ʾ: :ء,
|
61
|
+
b: :ب,
|
62
|
+
p: :پ,
|
63
|
+
t: :ت,
|
64
|
+
ḥ: :ح,
|
65
|
+
d: :د,
|
66
|
+
r: :ر,
|
67
|
+
z: :ز,
|
68
|
+
s: :س,
|
69
|
+
ṣ: :ص,
|
70
|
+
ḍ: :ض,
|
71
|
+
ṭ: :ط,
|
72
|
+
ẓ: :ظ,
|
73
|
+
ʿ: :ع,
|
74
|
+
f: :ف,
|
75
|
+
q: :ق,
|
76
|
+
k: :ك,
|
77
|
+
g: :گ,
|
78
|
+
l: :ل,
|
79
|
+
m: :م,
|
80
|
+
n: :ن,
|
81
|
+
h: :ه,
|
82
|
+
w: :و,
|
83
|
+
y: :ي,
|
84
|
+
ā: :ا,
|
85
|
+
ū: :و,
|
86
|
+
ī: :ي,
|
87
|
+
},
|
88
|
+
vowels: {
|
89
|
+
a: Fatha,
|
90
|
+
à: Fatha, # at word-end only
|
91
|
+
u: Damma,
|
92
|
+
i: Kasra,
|
93
|
+
},
|
94
|
+
combos: {
|
95
|
+
aw: :َو,
|
96
|
+
ay: :َي
|
97
|
+
},
|
98
|
+
brockelmann: {
|
99
|
+
'-a': :ة, # "-" = at word-end
|
100
|
+
'-at': :ة, # "-" = at word-end
|
101
|
+
'al-': :ال, # "-" = at word-start
|
102
|
+
},
|
103
|
+
dmg: {
|
104
|
+
ṯ: :ث,
|
105
|
+
ǧ: :ج,
|
106
|
+
č: :چ,
|
107
|
+
ḫ: :خ,
|
108
|
+
ḏ: :ذ,
|
109
|
+
ž: :ژ,
|
110
|
+
š: :ش,
|
111
|
+
ġ: :غ
|
112
|
+
},
|
113
|
+
uppercase: {
|
114
|
+
A: :أَ,
|
115
|
+
I: :إِ,
|
116
|
+
U: :أُ,
|
117
|
+
Y: :ي
|
118
|
+
},
|
119
|
+
farsi: {
|
120
|
+
v: :و, # always? what, e.g. about "Divbandi"?
|
121
|
+
e: [:ه, Fatha] # word-end, mid-word
|
122
|
+
},
|
123
|
+
turkic: {
|
124
|
+
ö: :و,
|
125
|
+
ü: Damma, # ???
|
126
|
+
ı: Kasra, # ???
|
127
|
+
E: :ا
|
128
|
+
},
|
129
|
+
indic: {
|
130
|
+
ō: :و # things like "Bh" => "بْ" would go here, too
|
131
|
+
},
|
132
|
+
romanic: {
|
133
|
+
c: :ث, # or should this rather be a س?
|
134
|
+
o: :و,
|
135
|
+
Ė: :إي,
|
136
|
+
x: :كس
|
137
|
+
},
|
138
|
+
semitic: {
|
139
|
+
ē: :ﺍ # is that always so?
|
140
|
+
},
|
141
|
+
finnic: {
|
142
|
+
ä: Fatha # in e.g. Mänglī
|
143
|
+
},
|
144
|
+
precise: {
|
145
|
+
á: :ى,
|
146
|
+
Ā: :آ, # don't add 'ʾĀ' here - it is considered an error in the input!
|
147
|
+
'ʾā': :آ # same but lowercase - alif madda in the middle of the word
|
148
|
+
}
|
149
|
+
}
|
150
|
+
|
151
|
+
PostR2AWordReplacements = {
|
152
|
+
/^(.*)لّاه/ => '\1 الله', # names ending in "allah"
|
153
|
+
/(ب\.|إبن|إِبن)/ => 'بن', # "son of"
|
154
|
+
/أَبي/ => 'أبي', # "father of" (gen.)
|
155
|
+
/أَبو/ => 'أبو', # "father of" (nom.)
|
156
|
+
/بَكر/ => 'بكر', # the name "bakr"
|
157
|
+
/عَلي/ => 'علي', # the name "ali"
|
158
|
+
/عَبد/ => 'عبد', # the name-part "abd"
|
159
|
+
/افندي/ => 'افندی' # ottoman/turkish effendi
|
160
|
+
# /([یي]زاده$)/ => ZWNJ+'ی'+ZWNJ+'زاده', # names ending in "-azade" # removed at DK's request
|
161
|
+
}
|
162
|
+
|
163
|
+
PostR2AContextReplacements = {
|
164
|
+
/((^|\.\s+)بن(\s+))/ => 'ابن\3', # exception: son-of in beginning of sentence
|
165
|
+
/(تِ|تُ|تَ)(\s+)/ => 'ة ', # this'll lose the case ending, but that's for the better
|
166
|
+
/داوود/ => 'داود' # not sure if this might actually hold true for all ...wū...?
|
167
|
+
}
|
168
|
+
|
169
|
+
PunctSepRgx = /[ \.\-\(\)\?\&=,;:]/
|
170
|
+
|
171
|
+
R2A = R2ATables.values.inject(:merge) # just one level is enough now
|
172
|
+
.keys_and_values_to_s # more convenient to work with
|
173
|
+
|
174
|
+
SunLetters = %w[t ṯ d ḏ r z s š ṣ ḍ ṭ ẓ l n]
|
175
|
+
RomanizedShortVowels = %w[a i u]
|
176
|
+
RomanizedLongVowels = %w[ā ū ī]
|
177
|
+
# "a" here because of ta'marbouta, "á" because of alif maqsoura, "ā" because of word-final alif mamdouda
|
178
|
+
RomanizedConsonantals = SunLetters + %w[m l k q f ġ ʿ ḫ ḥ h ǧ b ʾ a á]
|
179
|
+
ArabicScriptVowels = %w[ا ي و]
|
180
|
+
ArabicScriptConsonants = %w[ا ب ت ث ج ح خ س ش ص ض ط ظ ع غ ف ق ك ل م ن ه ي ئ ة ى أ إ ؤ ئ آ]
|
181
|
+
|
182
|
+
LatinChars = R2A.map{|l,a| l unless l.size != 1}.compact
|
183
|
+
TranslitChars_lowercase = 'ʾʿḏḥṣḍṭẓāūīṯǧčḫžšġōĖēáäüöü'
|
184
|
+
TranslitChars = (TranslitChars_lowercase + TranslitChars_lowercase.upcase).chars.uniq.join
|
185
|
+
|
186
|
+
# Returns the word surrounding position +idx+ of +str+: the run of
# non-whitespace characters ending right before +idx+, followed by the run
# of word/transliteration characters starting at +idx+ (if any).
def this_word(str, idx)
  before = str[0...idx][/\S*\z/]
  from_idx = str[idx..-1][/\A[#{TranslitChars}\w]+/]
  before + (from_idx || '')
end
|
189
|
+
|
190
|
+
# Returns the word around position +idx+ of +str+ together with the word
# that follows it.
#
# First part: the run of non-whitespace characters ending right before +idx+.
# Second part: starting at +idx+, the rest of the current word, the
# whitespace after it and the next word; empty when there is no next word.
# A string without any whitespace is returned unchanged.
def this_word_and_the_next(str, idx)
  return str unless str.match?(/\s+/)

  # Uses the TranslitChars constant, like #this_word does. The previous code
  # interpolated the never-assigned @translit_chars (nil, i.e. an empty
  # string), leaving only \w, which does not cover the transliteration
  # characters, so words containing them failed to match.
  two_words = /\A[#{TranslitChars}\w]+\s+[#{TranslitChars}\w]+/i
  str[0...idx][/\S*\z/] + (str[idx..-1][two_words] || '')
end
|
201
|
+
|
202
|
+
# Picks the Arabic spelling of a hamza from the romanised character that
# FOLLOWS it (+ch+) and the one that PRECEDES it (+pch+).
#
# ch                   - the following romanised character
# pch                  - the preceding romanised character (may be nil)
# first_letter_of_word - true when the hamza opens the word
#
# Returns a String, or nil when +ch+ is not handled in the given context.
def hamza_before_following(ch, pch, first_letter_of_word = false)
  vowel = ch.to_sym

  # long ī / ū are spelled the same way in every context
  return "#{YaHamzaAbove}#{R2A[ch]}" if vowel == :ī
  return "#{WawHamzaAbove}#{R2A[ch]}" if vowel == :ū

  if first_letter_of_word
    return AlifHamzaAbove if vowel == :a || vowel == :u
    return AlifHamzaBelow if vowel == :i
    return AlifMadda if vowel == :ā
    return nil
  end

  # what PRECEDED the hamza can take precedence over what follows it
  after_ya = %w[y ī].include?(pch)
  case vowel
  when :a then after_ya ? YaHamzaAbove : AlifHamzaAbove
  when :i then YaHamzaAbove
  when :u then pch == 'ū' ? R2A['ʾ'] : WawHamzaAbove
  end
end
|
233
|
+
|
234
|
+
# Picks the Arabic spelling of a hamza for the romanised character +ch+.
#
# ch                   - the romanised character the choice is based on
# first_letter_of_word - selects the word-initial spellings when true
#
# Returns a String, or nil when +ch+ is not handled in the given context.
def hamza_after_preceding(ch, first_letter_of_word = false)
  vowel = ch.to_sym
  # "a" gives the same result in both contexts
  return AlifHamzaAbove if vowel == :a

  if first_letter_of_word
    return R2A['ā'] + Damma + WawHamzaAbove if vowel == :u
    return R2A['ā'] + YaHamzaAbove if vowel == :i
    nil
  else
    case vowel
    when :i, :ī then YaHamzaAbove
    when :u then WawHamzaAbove
    end
  end
end
|
250
|
+
|
251
|
+
# Decides which alif to use for +word+: AlifHamzaBelow when the word fits
# one of the templates below and does not start with "ist" (compared
# case-insensitively), plain Alif otherwise.
#
# Template legend (a word must have exactly the template's length):
#   C = any entry of RomanizedConsonantals
#   s = any entry of RomanizedShortVowels
#   l = any entry of RomanizedLongVowels
#   any other character stands for itself (e.g. "i")
#
# When $dbg > 1 the decision is printed.
def alif_for_word_initial_kasra(word)
  templates = %w[iCClC iCCiCClC iClCC]
  letters = word.chars

  fits_a_template = templates.any? do |template|
    next false unless word.size == template.size

    shape_ok = letters.each_with_index.all? do |letter, pos|
      case template[pos]
      when 'C' then RomanizedConsonantals.include?(letter)
      when 's' then RomanizedShortVowels.include?(letter)
      when 'l' then RomanizedLongVowels.include?(letter)
      else letter == template[pos]
      end
    end
    # words beginning with "ist" never count as a match
    shape_ok && !word.downcase.match?(/^ist/)
  end

  alif = fits_a_template ? AlifHamzaBelow : Alif
  puts "\t\tfor #{word}: word-initial #{alif}".light_blue if $dbg > 1
  alif
end
|
285
|
+
|
286
|
+
# Normalizes a romanized input string before transcription.
#
# Two things happen, in this order:
# 1. the zero-width non-joiner (U+200C) and the right-to-left mark
#    (U+200F) are removed;
# 2. every character directly following ʿ or ʾ is lowercased.
#
# str - the String to clean up. It is NOT modified, so frozen strings
#       (e.g. literals under `frozen_string_literal: true`) are accepted.
#
# Returns a new String.
def sanitize(str)
  # remove nonprintables such as the ZWNJ
  # FIXME: the erroneous_chars replacement table should have already taken care of this?!
  cleaned = str.gsub(/[\u200c\u200f]/, '')

  # make letters following either ʿ or ʾ lowercase
  # (/m so that "." covers every character, exactly like a per-char walk)
  cleaned.gsub(/(?<=[ʿʾ])./m) { |letter| letter.downcase }
end
|
293
|
+
|
294
|
+
# Converts a romanized ("Precise") string into Arabic script.
#
# The input is cleaned with #sanitize and then walked character by
# character. Each character is looked at together with its predecessor
# (pch) and its two successors (fch, ffch); the first rule that applies
# appends to the output and moves on to the next character. Afterwards the
# output is split into words, run twice through PostR2AWordReplacements
# (per word) and once through PostR2AContextReplacements (whole string).
#
# input: valid Precise string
#   example: (al-)ʿAbbādī Muḥammad Ibn Aḥmad Ibn Muḥammad al-Harawī
# output: Arabic string
#   example: العَبّادي مُحَمَّد بن أَحمَد بن مُحَمَّد الهَرَوي
#
# Raises Precise::NotATranscriptionError when +romanized+ is nil.
# Returns whatever apply_options(@opts) returns for the assembled string
# (apply_options is not defined in this part of the file; presumably it
# comes from CoreExtensions -- TODO confirm).
def reverse(romanized)
  raise Precise::NotATranscriptionError if romanized.nil?

  # sure, it's called "Precise", but it should still be
  # as tolerant as possible in what it accepts as input...
  romanized = sanitize(romanized)
  arabic = '' # we start with an empty string and go character by character

  puts "- (#{romanized.size}) [#{romanized}]".light_green if $dbg > 1

  # next, turn strings into character arrays
  romanized = romanized.chars
  arabic = arabic.chars
  # to be able to merge 2 romanized characters into 1 arabic character:
  # when a rule sets skip, the following character is not processed
  skip = false
  # print string like so: ʿ·A·b·b·ā·d·ī· ·M·u·ḥ·a·m·m·a·d· ·I·b·n· ·A·ḥ·m·a·d· ·I·b·n· ...
  puts "- (#{romanized.size}) [#{romanized.join('·')}]".light_green if $dbg > 1

  # loop over the romanized character array, filling the arabic one up as we go
  romanized.each_with_index do |ch,i|
    # a little bit of context: previous, next and next-but-one character
    # (nil at the edges of the input)
    pch = i == 0 ? nil : romanized[i-1]
    fch = romanized[i+1]
    ffch = romanized[i+2]

    # multi-letter skip-aheads
    if skip
      dbg "\t\tskipping #{ch}"
      # skip stays set only while we are on the "l" of "al-" (preceded by
      # "a", followed by "-"), so that the dash is skipped as well
      if !(pch=='a' && fch=='-') # we're in the middle of "al-" (word-start)
        skip=false; end; next; end

    # symbols to remove from input
    (dbg "\tskipping unprintable symbol"; next) if [ZWNJ].include?(ch)

    # deal with alif madda before "normal" hamza rules follow
    # NOTE(review): skip=true is also set when only the /^Ā/ alternative
    # matched (Ā as very first character); the character after Ā is then
    # skipped too -- confirm this is intended.
    if ("#{ch}#{fch}".match?(/ʾā/) || "#{pch}#{ch}".match?(/^Ā/))
      (dbg "\talif madda #{R2A['ʾā']}"; arabic << R2A['ʾā']; skip=true; next); end

    # hamza followed by a short or long vowel
    if ch == 'ʾ' && %w[a i u ā ī ū].include?(fch.to_s.downcase)
      is_first_letter_of_word = (pch.nil? || pch.match(/\s+/))
      # the vowel is consumed together with the hamza, unless the current
      # word matches the a/at ending pattern below
      (dbg "\t#{ch} with following #{fch}";
       arabic << hamza_before_following(fch, pch, is_first_letter_of_word);
       skip=true unless this_word(romanized.join, i).match?(/(a$|at($|\s))/)
       next); end
    # hamza preceded by a short vowel
    # (beware of a possible alif madda (would be dealt with above, on the next round))
    if fch.to_s == 'ʾ' && !ffch.to_s.match?(/[āĀ]/) && %w[a i u].include?(ch.downcase)
      is_first_letter_of_word = (pch.nil? || pch.match(/\s+/))
      # NOTE(review): ch may be uppercase here (the check above downcases),
      # but hamza_after_preceding compares it case-sensitively and returns
      # nil for anything it does not know; nil entries are dropped by the
      # compact call after the loop -- confirm.
      (dbg "\t#{fch} carried on or following preceding #{ch}"
       arabic << hamza_after_preceding(ch, is_first_letter_of_word); skip=true; next); end

    # find the article "al", marked by having a dash appended to it
    (dbg "\tarticle al- #{R2A['al-']}"; arabic << R2A['al-']; skip=true; next) if ("#{ch}#{fch}#{ffch}" == 'al-')

    # unconditionally add spaces, dots and dashes to the output
    (dbg "\tinitial only (#{pch}#{ch})"; arabic << ch; next) if ch=='.' && (fch.nil? || fch.match(/\s+/))
    (dbg "\tnon-letter (#{ch})"; arabic << ch; next) if ch.match(PunctSepRgx) # white space or punctuation

    # a word-initial "a" or "u" must always be preceded by "ʾ"; only "i" can possibly *not* have one

    # deal with word-initial special cases
    if pch.to_s.strip.empty? # either beginning of string or of word
      if %w[a u].include?(ch)
        # the table entry of the uppercase letter is used for these
        (dbg "\tprepending #{ch} with hamza"; arabic << R2A[ch.upcase]; next); end
      if ch == 'i'
        # the word, minus a leading "al-"/"wal-", decides which alif is used
        (dbg "\thamza-less alif?"
         context = this_word(romanized.join, i)
         arabic << alif_for_word_initial_kasra(context.split(/^w?al-/).last)
         next); end; end

    # perform tashdeed: a doubled letter becomes one letter plus Shadda
    (out=R2A[ch]+Shadda; dbg "\ttashdeed of #{ch} #{out}"; arabic << out; skip = true; next) if R2A[ch] && ch==fch

    # should there be a ta'marbouta or not at the end of the word?
    # NOTE(review): i indexes the whole input, while context1.length is
    # presumably the length of a single word (this_word is defined outside
    # this part of the file) -- the comparisons below would then only line
    # up for the first word; confirm.
    context1 = this_word(romanized.join,i)
    context2 = this_word_and_the_next(romanized.join,i)
    if context1 == context2 # single word
      if (i == context1.length-2 && "#{ch}#{fch}".match?(/at$/)) \
      || (i == context1.length-1 && "#{ch}#{fch}".match?(/a$/))
        arabic << R2A['-at']+' '; skip=true; next
      end
    else # multiple words
      if (i == context1.length-2 && "#{ch}#{fch}#{ffch}".match?(/at\s/))
        arabic << R2A['-a']+' '; skip = true; next
      elsif (i == context1.length-1 && "#{ch}#{fch}".match?(/a\s/))
        arabic << R2A['-a']+' '; next
      end
    end

    # letter ayn followed by uppercase vowel
    # NOTE(review): sanitize lowercases every character that follows ʿ, so
    # fch can apparently never be "A", "I" or "U" at this point -- confirm
    # whether this branch is still reachable.
    if ch == 'ʿ'
      (skip=true; ar=R2A[ch]) if %w[A I U].include?(fch)
      case fch # ayn+following vowel at beginning of word
      when 'A' then ar+=Fatha
      when 'I' then ar+=Kasra
      when 'U' then ar+=Damma; end; end
    (dbg "\tayn+vowel #{ch}#{fch} #{ar}"; arabic << ar; next) if ar && ar.size==2

    # long "a" at word-end: alif maqsoorah, otherwise normal alif
    # "e" at word-end: letter hah, otherwise just a fatha
    # (table entries that are Arrays hold [word-final form, other form])
    if R2A[ch].class == Array
      choice = (fch.nil? || fch==' ') ? R2A[ch].first : R2A[ch].last
      (dbg "\tcontextual #{ch} #{choice}"; arabic << choice; next); end

    # exact match (pure transliteration, no transcription effort required)
    (dbg "\tfrom table #{ch}→#{R2A[ch]}"; arabic << R2A[ch]; next) if R2A[ch]

    # no luck yet; might be a regular uppercase letter
    (dbg "\tuppercased #{ch} #{R2A[ch.downcase]}"; arabic << R2A[ch.downcase]; next) if R2A[ch.downcase]

    # still no luck; last shot is punctuation
    (dbg "\tinterpunctuation #{ch}"; arabic << ch; next) if ch.match?(/[[:punct:]]/)

    # mark unknown characters as such; the philosophy here being that input to
    # Precise should be pre-processed enough for this to never have to happen…
    warn "Warning: character '#{ch}' is unknown to Precise and will be substituted by placeholder only".yellow
    arabic << '�'
  end

  # character-array to word-array (nil entries are dropped first)
  arabic = arabic.compact.join.split
  # العأَبّادي محمّد إِبن أَحمد إِبن محمّد للهروي (but with () around "al")
  puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1

  # dragnet replacement of special words, such as changing "ibn" into "bin";
  # the whole replacement table is applied twice
  2.times.each_with_index do |i|
    puts "#{' '*6}(postprocessing round #{i+1})".light_green if $dbg > 1
    PostR2AWordReplacements.each{|rgx,subst|
      arabic.map!{|w|
        puts "#{' '*8}word match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (w.match(rgx) && $dbg > 1)
        w.gsub(/-/, '') # dashes not needed anymore now
         .gsub(rgx, subst)} }
  end

  # some rules apply only in the context of words, not letters
  puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1
  arabic = arabic.join(' ')
  PostR2AContextReplacements.each{|rgx,subst|
    puts "#{' '*8}context match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (arabic.match(rgx) && $dbg > 1)
    arabic.gsub!(rgx, subst) }

  return arabic.apply_options(@opts)
end
|
442
|
+
|
443
|
+
# Class-level shortcut: builds a transcriber with +opts+ and lets it
# reverse +romanized+ in one call.
#
# romanized - the romanized String handed on to #reverse.
# opts      - options handed on to the constructor (default: {}).
#
# Returns whatever the instance-level #reverse returns.
def self.reverse(romanized, opts={})
  transcriber = new(opts)
  transcriber.reverse(romanized)
end
|
446
|
+
end
|
447
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
module Precise
|
4
|
+
using CoreExtensions
|
5
|
+
|
6
|
+
# List of known Arabic word forms ("types"), used to estimate how
# plausible a piece of transcribed text is.
#
# The list lives in res/types.lst (two directories above this file) and is
# downloaded on first use. It is read from disk once per process and then
# shared by all instances.
class TypesList
  # process-wide cache of the loaded list; filled by the first #initialize
  @@types = nil

  # Makes sure the types file exists locally (downloading it if needed)
  # and loads it, unless a previous instance already did so.
  def initialize
    @types = (@@types ||= load_types)
  end

  # Fetches the types file and stores it at +path+.
  #
  # The data is written to "<path>.part" first and only renamed to +path+
  # once the transfer has finished, so that an interrupted download cannot
  # leave a truncated list behind that later runs would take for complete.
  #
  # Returns the number of bytes copied.
  def download(path)
    puts 'downloading types database (only needed once)...'
    require 'net/http'
    require 'open-uri'
    require 'progressbar'
    url = 'https://raw.githubusercontent.com/sixtyfive/arabic-types/main/types.lst'
    partial = "#{path}.part"
    # block form: the remote handle is closed as soon as copying is done
    bytes = URI.open(url) { |remote| IO.copy_stream(remote, partial) }
    File.rename(partial, path)
    bytes
  ensure
    # only still present if something went wrong before the rename
    File.delete(partial) if partial && File.exist?(partial)
  end

  # Share of the whitespace-separated words of +string+ that appear in the
  # types list.
  #
  # Returns a Float between 0.0 and 100.0; 0.0 when +string+ contains no
  # words at all (instead of dividing by zero).
  def percentage_of_tokens_present(string)
    words = string.split
    return 0.0 if words.empty?

    n_present = words.count { |w| @types.include?(w) }
    100.0 / words.length * n_present
  end

  # Convenience wrapper around the instance method of the same name.
  def self.percentage_of_tokens_present(string)
    new.percentage_of_tokens_present(string)
  end

  private

  # Reads the types file (one entry per line), downloading it first when it
  # is not there yet. Returns a Set so that membership checks stay fast
  # regardless of the size of the list.
  def load_types
    require 'set'
    resdir = File.join __dir__,'..','..','res'
    FileUtils.mkdir_p resdir
    typesfile = File.absolute_path(File.join resdir,'types.lst')
    download(typesfile) unless File.exist? typesfile
    File.readlines(typesfile, chomp: true).to_set
  end
end
|
37
|
+
end
|
data/lib/precise.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'slop'
|
3
|
+
require 'yaml'
|
4
|
+
require 'tiny_color'
|
5
|
+
|
6
|
+
deps = %w[version debugging error_classes core_extensions transcription transcription_r2a transcription_a2r types_list]
|
7
|
+
deps.each{|d| require_relative File.join(__dir__,'..','lib','precise',d)}
|
8
|
+
|
9
|
+
module Precise
|
10
|
+
# Command line front end of Precise.
#
# Everything happens in the constructor: the options are parsed from ARGV,
# the remaining arguments are joined into one input string, the string is
# transcribed in the direction its characters call for, and the result is
# printed.
class CLI
  def initialize
    definition = Slop::Options.new
    definition.banner = "Usage: precise [options] <string(s)>\n"
    definition.separator " where options can be:\n"
    alif_variants = Precise::Transcription::AlifVariants
    [
      ["-s", "--show-rules", "print the list of rules which are applied for transcription"],
      ["-c", "--confidence", "also print the percentage of output words appearing in a large corpus of Arabic"],
      ["-A", "--no-alif-variants", "all of #{alif_variants.join("، ")} will be merged into ا"],
      ["-T", "--no-tashkeel", "diacritics (and non printables, such as tatweel) will be removed from output"],
      ["-P", "--no-punctuation", "all punctuation characters will be discarded from output"],
      ["-v", "--verbose", "instruct the backend classes to output debugging and plausibility information"],
      ["-h", "--help", "display this message"]
    ].each { |short, long, help| definition.bool short, long, help }
    definition.separator "\n Transcription direction is determined by presence of characters from the 'Arabic' Unicode block.\n" +
                         " At present, Arabic-to-Roman transcription is only rudimentary."
    parser = Slop::Parser.new(definition)

    # anything that goes wrong here ends in the usage message; the empty
    # parse gives #usage a result object to print
    begin
      @opts = parser.parse(ARGV)
      usage if @opts[:help] || ARGV.empty?
      rules if @opts.to_h[:show_rules]
    rescue
      @opts = parser.parse([])
      usage
    end

    # each --no-<feature> switch that was given turns <feature> off
    options = {verbose: @opts[:verbose]}
    switches = @opts.to_h
    %i[alif_variants tashkeel punctuation].each do |feature|
      options[feature] = false if switches[:"no_#{feature}"]
    end

    input = @opts.arguments.join(' ')
    output =
      if input.match?(/\p{Arabic}/)
        Precise::Transcription.transcribe(input.dup, options)
      else
        arabic = Precise::Transcription.reverse(input.dup, options)
        # the confidence figure is only added in this direction
        arabic += " (#{Precise::TypesList.percentage_of_tokens_present(arabic)}%)" if @opts[:confidence]
        arabic
      end
    puts output.pretty_inspect.gsub(/(^"|"$)/, "").strip
  end

  # Prints the parsed options object to standard error and ends the program.
  def usage
    warn @opts
    exit
  end

  # Prints the transcription tables as YAML (with all keys turned into
  # strings and the document marker removed) and ends the program.
  def rules
    tables = Precise::Transcription::R2ATables.map do |name, table|
      { name.to_s => table.map { |from, to| { from.to_s => to } } }
    end
    puts tables.to_yaml.gsub(/---\n/,'')
    exit
  end

  # Number of options whose parsed value is neither nil nor false.
  def nopts
    @opts.to_h.values.count { |value| value }
  end

  # Entry point: running the CLI means instantiating it.
  def self.start
    new
  end
end
|
67
|
+
end
|
data/precise.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/precise/version'
|
4
|
+
|
5
|
+
# Gem specification for precise: identity, packaged files, executable
# and runtime dependencies.
Gem::Specification.new do |spec|
  spec.name = 'precise'
  spec.version = Precise::VERSION
  spec.authors = ['J. R. Schmid']
  spec.email = ['jrs+git@weitnahbei.de']

  spec.summary = 'Arabic to DMG-like (but more precise) and back'
  # the reverse direction produces Arabic script, not Latin script
  spec.description = 'Romanise Arabic script, arabicise romanisations of Arabic script back into Arabic script'
  spec.homepage = 'https://rubygems.org/gems/precise'
  spec.required_ruby_version = '>= 2.7.0'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/sixtyfive/precise.git'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # This gemspec itself and everything under bin/, test/, spec/, features/
  # as well as git/CI configuration is left out.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = 'exe'
  # every file under exe/ becomes an executable of the gem
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # dependencies

  spec.add_dependency 'slop'
  spec.add_dependency 'tiny_color'
  spec.add_dependency 'progressbar'
end
|
metadata
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: precise
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.8
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- J. R. Schmid
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-03-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: slop
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: tiny_color
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: progressbar
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: 'Romanise Arabic script, arabicise romanisations of Arabic script back
|
56
|
+
into Latin script '
|
57
|
+
email:
|
58
|
+
- jrs+git@weitnahbei.de
|
59
|
+
executables:
|
60
|
+
- precise
|
61
|
+
extensions: []
|
62
|
+
extra_rdoc_files: []
|
63
|
+
files:
|
64
|
+
- Gemfile
|
65
|
+
- Gemfile.lock
|
66
|
+
- LICENSE.md
|
67
|
+
- README.md
|
68
|
+
- Rakefile
|
69
|
+
- TODO.md
|
70
|
+
- exe/precise
|
71
|
+
- lib/precise.rb
|
72
|
+
- lib/precise/core_extensions.rb
|
73
|
+
- lib/precise/debugging.rb
|
74
|
+
- lib/precise/error_classes.rb
|
75
|
+
- lib/precise/transcription.rb
|
76
|
+
- lib/precise/transcription_a2r.rb
|
77
|
+
- lib/precise/transcription_r2a.rb
|
78
|
+
- lib/precise/types_list.rb
|
79
|
+
- lib/precise/version.rb
|
80
|
+
- precise.gemspec
|
81
|
+
homepage: https://rubygems.org/gems/precise
|
82
|
+
licenses: []
|
83
|
+
metadata:
|
84
|
+
homepage_uri: https://rubygems.org/gems/precise
|
85
|
+
source_code_uri: https://github.com/sixtyfive/precise.git
|
86
|
+
post_install_message:
|
87
|
+
rdoc_options: []
|
88
|
+
require_paths:
|
89
|
+
- lib
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
requirements:
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: 2.7.0
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
requirements: []
|
101
|
+
rubygems_version: 3.3.25
|
102
|
+
signing_key:
|
103
|
+
specification_version: 4
|
104
|
+
summary: Arabic to DMG-like (but more precise) and back
|
105
|
+
test_files: []
|