precise 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: de5da64054f1063eb0129a6aa14d03bc457ba1cdc68d9748618210e0c414f001
4
+ data.tar.gz: 738931cb458919672e14d9af523e18bfc9aecbde2b3efecefc4b85daa0e44446
5
+ SHA512:
6
+ metadata.gz: 7d6b2d48809192ad96abcc2401feb6598fdaed2a30f039b2d0f0da13fd98d9ac76b4b6233fbaf1e17e2fe4825c1530e97a692f0d9332380d67f0dbb22d295f9b
7
+ data.tar.gz: 896f8c9cf2f58415d623e26ec69491f5e697166057faa7b5a692c127a227dd73999ab96c0e40e2bb4f267bb7441cba78a62e669ddb71e4ec5f3f2603bff1fabf
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # this gem's dependencies are specified in precise.gemspec
4
+ gemspec
5
+ # below are development dependencies only
6
+ gem 'rake'
7
+ gem 'minitest'
8
+ gem 'standard'
9
+ gem 'pry'
data/Gemfile.lock ADDED
@@ -0,0 +1,64 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ precise (0.1.8)
5
+ progressbar
6
+ slop
7
+ tiny_color
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ ast (2.4.2)
13
+ coderay (1.1.3)
14
+ json (2.6.3)
15
+ language_server-protocol (3.17.0.3)
16
+ method_source (1.0.0)
17
+ minitest (5.18.0)
18
+ parallel (1.22.1)
19
+ parser (3.2.1.1)
20
+ ast (~> 2.4.1)
21
+ progressbar (1.13.0)
22
+ pry (0.14.2)
23
+ coderay (~> 1.1)
24
+ method_source (~> 1.0)
25
+ rainbow (3.1.1)
26
+ rake (13.0.6)
27
+ regexp_parser (2.7.0)
28
+ rexml (3.2.5)
29
+ rubocop (1.44.1)
30
+ json (~> 2.3)
31
+ parallel (~> 1.10)
32
+ parser (>= 3.2.0.0)
33
+ rainbow (>= 2.2.2, < 4.0)
34
+ regexp_parser (>= 1.8, < 3.0)
35
+ rexml (>= 3.2.5, < 4.0)
36
+ rubocop-ast (>= 1.24.1, < 2.0)
37
+ ruby-progressbar (~> 1.7)
38
+ unicode-display_width (>= 2.4.0, < 3.0)
39
+ rubocop-ast (1.27.0)
40
+ parser (>= 3.2.1.0)
41
+ rubocop-performance (1.15.2)
42
+ rubocop (>= 1.7.0, < 2.0)
43
+ rubocop-ast (>= 0.4.0)
44
+ ruby-progressbar (1.13.0)
45
+ slop (4.10.1)
46
+ standard (1.24.3)
47
+ language_server-protocol (~> 3.17.0.2)
48
+ rubocop (= 1.44.1)
49
+ rubocop-performance (= 1.15.2)
50
+ tiny_color (1.2.2)
51
+ unicode-display_width (2.4.2)
52
+
53
+ PLATFORMS
54
+ x86_64-linux
55
+
56
+ DEPENDENCIES
57
+ minitest
58
+ precise!
59
+ pry
60
+ rake
61
+ standard
62
+
63
+ BUNDLED WITH
64
+ 2.4.1
data/LICENSE.md ADDED
@@ -0,0 +1,163 @@
1
+ GNU Lesser General Public License
2
+ =================================
3
+
4
+ _Version 3, 29 June 2007_
5
+ _Copyright © 2007 Free Software Foundation, Inc. &lt;<http://fsf.org/>&gt;_
6
+
7
+ Everyone is permitted to copy and distribute verbatim copies
8
+ of this license document, but changing it is not allowed.
9
+
10
+
11
+ This version of the GNU Lesser General Public License incorporates
12
+ the terms and conditions of version 3 of the GNU General Public
13
+ License, supplemented by the additional permissions listed below.
14
+
15
+ ### 0. Additional Definitions
16
+
17
+ As used herein, “this License” refers to version 3 of the GNU Lesser
18
+ General Public License, and the “GNU GPL” refers to version 3 of the GNU
19
+ General Public License.
20
+
21
+ “The Library” refers to a covered work governed by this License,
22
+ other than an Application or a Combined Work as defined below.
23
+
24
+ An “Application” is any work that makes use of an interface provided
25
+ by the Library, but which is not otherwise based on the Library.
26
+ Defining a subclass of a class defined by the Library is deemed a mode
27
+ of using an interface provided by the Library.
28
+
29
+ A “Combined Work” is a work produced by combining or linking an
30
+ Application with the Library. The particular version of the Library
31
+ with which the Combined Work was made is also called the “Linked
32
+ Version”.
33
+
34
+ The “Minimal Corresponding Source” for a Combined Work means the
35
+ Corresponding Source for the Combined Work, excluding any source code
36
+ for portions of the Combined Work that, considered in isolation, are
37
+ based on the Application, and not on the Linked Version.
38
+
39
+ The “Corresponding Application Code” for a Combined Work means the
40
+ object code and/or source code for the Application, including any data
41
+ and utility programs needed for reproducing the Combined Work from the
42
+ Application, but excluding the System Libraries of the Combined Work.
43
+
44
+ ### 1. Exception to Section 3 of the GNU GPL
45
+
46
+ You may convey a covered work under sections 3 and 4 of this License
47
+ without being bound by section 3 of the GNU GPL.
48
+
49
+ ### 2. Conveying Modified Versions
50
+
51
+ If you modify a copy of the Library, and, in your modifications, a
52
+ facility refers to a function or data to be supplied by an Application
53
+ that uses the facility (other than as an argument passed when the
54
+ facility is invoked), then you may convey a copy of the modified
55
+ version:
56
+
57
+ * **a)** under this License, provided that you make a good faith effort to
58
+ ensure that, in the event an Application does not supply the
59
+ function or data, the facility still operates, and performs
60
+ whatever part of its purpose remains meaningful, or
61
+
62
+ * **b)** under the GNU GPL, with none of the additional permissions of
63
+ this License applicable to that copy.
64
+
65
+ ### 3. Object Code Incorporating Material from Library Header Files
66
+
67
+ The object code form of an Application may incorporate material from
68
+ a header file that is part of the Library. You may convey such object
69
+ code under terms of your choice, provided that, if the incorporated
70
+ material is not limited to numerical parameters, data structure
71
+ layouts and accessors, or small macros, inline functions and templates
72
+ (ten or fewer lines in length), you do both of the following:
73
+
74
+ * **a)** Give prominent notice with each copy of the object code that the
75
+ Library is used in it and that the Library and its use are
76
+ covered by this License.
77
+ * **b)** Accompany the object code with a copy of the GNU GPL and this license
78
+ document.
79
+
80
+ ### 4. Combined Works
81
+
82
+ You may convey a Combined Work under terms of your choice that,
83
+ taken together, effectively do not restrict modification of the
84
+ portions of the Library contained in the Combined Work and reverse
85
+ engineering for debugging such modifications, if you also do each of
86
+ the following:
87
+
88
+ * **a)** Give prominent notice with each copy of the Combined Work that
89
+ the Library is used in it and that the Library and its use are
90
+ covered by this License.
91
+
92
+ * **b)** Accompany the Combined Work with a copy of the GNU GPL and this license
93
+ document.
94
+
95
+ * **c)** For a Combined Work that displays copyright notices during
96
+ execution, include the copyright notice for the Library among
97
+ these notices, as well as a reference directing the user to the
98
+ copies of the GNU GPL and this license document.
99
+
100
+ * **d)** Do one of the following:
101
+ - **0)** Convey the Minimal Corresponding Source under the terms of this
102
+ License, and the Corresponding Application Code in a form
103
+ suitable for, and under terms that permit, the user to
104
+ recombine or relink the Application with a modified version of
105
+ the Linked Version to produce a modified Combined Work, in the
106
+ manner specified by section 6 of the GNU GPL for conveying
107
+ Corresponding Source.
108
+ - **1)** Use a suitable shared library mechanism for linking with the
109
+ Library. A suitable mechanism is one that **(a)** uses at run time
110
+ a copy of the Library already present on the user's computer
111
+ system, and **(b)** will operate properly with a modified version
112
+ of the Library that is interface-compatible with the Linked
113
+ Version.
114
+
115
+ * **e)** Provide Installation Information, but only if you would otherwise
116
+ be required to provide such information under section 6 of the
117
+ GNU GPL, and only to the extent that such information is
118
+ necessary to install and execute a modified version of the
119
+ Combined Work produced by recombining or relinking the
120
+ Application with a modified version of the Linked Version. (If
121
+ you use option **4d0**, the Installation Information must accompany
122
+ the Minimal Corresponding Source and Corresponding Application
123
+ Code. If you use option **4d1**, you must provide the Installation
124
+ Information in the manner specified by section 6 of the GNU GPL
125
+ for conveying Corresponding Source.)
126
+
127
+ ### 5. Combined Libraries
128
+
129
+ You may place library facilities that are a work based on the
130
+ Library side by side in a single library together with other library
131
+ facilities that are not Applications and are not covered by this
132
+ License, and convey such a combined library under terms of your
133
+ choice, if you do both of the following:
134
+
135
+ * **a)** Accompany the combined library with a copy of the same work based
136
+ on the Library, uncombined with any other library facilities,
137
+ conveyed under the terms of this License.
138
+ * **b)** Give prominent notice with the combined library that part of it
139
+ is a work based on the Library, and explaining where to find the
140
+ accompanying uncombined form of the same work.
141
+
142
+ ### 6. Revised Versions of the GNU Lesser General Public License
143
+
144
+ The Free Software Foundation may publish revised and/or new versions
145
+ of the GNU Lesser General Public License from time to time. Such new
146
+ versions will be similar in spirit to the present version, but may
147
+ differ in detail to address new problems or concerns.
148
+
149
+ Each version is given a distinguishing version number. If the
150
+ Library as you received it specifies that a certain numbered version
151
+ of the GNU Lesser General Public License “or any later version”
152
+ applies to it, you have the option of following the terms and
153
+ conditions either of that published version or of any later version
154
+ published by the Free Software Foundation. If the Library as you
155
+ received it does not specify a version number of the GNU Lesser
156
+ General Public License, you may choose any version of the GNU Lesser
157
+ General Public License ever published by the Free Software Foundation.
158
+
159
+ If the Library as you received it specifies that a proxy can decide
160
+ whether future versions of the GNU Lesser General Public License shall
161
+ apply, that proxy's public statement of acceptance of any version is
162
+ permanent authorization for you to choose that version for the
163
+ Library.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ ## Command line usage
2
+
3
+ Install the gem and see the help message by executing:
4
+
5
+ $ gem install precise
6
+ $ precise -h
7
+
8
+ Arabicise a string of Romanisation:
9
+
10
+ $ precise -T 'bi-smi llāhi al-raḥmani al-raḥīm' # -T removes Tashkeel
11
+
12
+ Romanise a string of Arabic (experimental):
13
+
14
+ $ precise 'بسم الله الرحمن الرحيم' # (not able to infer Tashkeel!)
15
+
16
+ ## Usage inside of another application
17
+
18
+ Install the gem and add to the application's Gemfile by executing:
19
+
20
+ $ bundle add precise
21
+ $ bundle install
22
+
23
+ You can then access the API like so:
24
+
25
+ ```ruby
26
+ require 'precise'
27
+ Precise::Transcription.reverse 'bi-smi llāhi al-raḥmani al-raḥīm'
28
+ Precise::Transcription.transcribe 'بسم الله الرحمن الرحيم'
29
+ ```
30
+
31
+ ## Development
32
+
33
+ After checking out the repository, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
34
+
35
+ To install this gem onto your local machine, run `bundle exec rake install`.
36
+
37
+ Issues and PRs are welcome!
38
+
39
+ ## Funding
40
+
41
+ This Gem was developed within the long-term research project [Bibliotheca Arabica](http://www.bibliotheca-arabica.de) hosted at the Saxon Academy of the Sciences and Humanities in Leipzig, Germany. _Bibliotheca Arabica_ is part of the [German Academies’ Programme](https://www.akademienunion.de/en/research/the-academies-programme) and funded by the Federal Republic of Germany and the Free State of Saxony.
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler'
4
+ Bundler.require
5
+
6
+ require 'bundler/gem_tasks'
7
+ require 'rake/testtask'
8
+
9
+ Rake::TestTask.new(:test) do |t|
10
+ t.libs << 'test'
11
+ t.libs << 'lib'
12
+ t.test_files = FileList['test/**/test_*.rb']
13
+ end
14
+
15
# Rake task that increments the LAST dot-separated segment of Precise::VERSION
# (the patch level of a major.minor.patch version) and rewrites the version
# file in place. The task description previously claimed to bump the "minor"
# number, which is not what the code does.
# NOTE(review): Precise::VERSION is presumably made available by the
# `Bundler.require` call earlier in this Rakefile - confirm.
desc 'increase patch version number by one'
task :bump do
  current = Precise::VERSION
  segments = current.split('.')
  # only the final segment changes; all leading segments are kept verbatim
  segments[-1] = (segments[-1].to_i + 1).to_s
  bumped = segments.join('.')
  version_file = 'lib/precise/version.rb'
  File.write(version_file, File.read(version_file).gsub(current, bumped))
end
24
+
25
+ task :default do; system 'rake -T'; end
data/TODO.md ADDED
@@ -0,0 +1,42 @@
1
+ Refactor to follow the following pattern:
2
+
3
+ ```ruby
4
+
5
+ # short, romanised root <=> word list for learners: https://wahiduddin.net/words/arabic_glossary.htm
6
+ # commercial root <=> word dict: http://www.arabicroot.com/Home/Introduction
7
+ # possibly a good idea to OCR wehr 5 and make a dict from that?
8
+
9
+ def arabic_roots(opts); ['ʿwl','msʾ'].reject{|r| !r.include? opts[:with_letter]}.compact; end #
10
+
11
+ # 2005: http://jeffcoombs.com/isri/Taghva2005b.pdf
12
+ # 2006: NN-based: https://ieeexplore.ieee.org/document/4115547
13
+ # 2007: https://ieeexplore.ieee.org/document/4230974/
14
+ # 2014: https://journals.sagepub.com/doi/abs/10.1177/0165551514526348?journalCode=jisb
15
+ # 2016: https://www.sciencedirect.com/science/article/pii/S1319157815001342
16
+ # 2015: https://www.sciencedirect.com/science/article/pii/S1319157815000166
17
+ # metastudy (also 2015): https://www.sciencedirect.com/science/article/pii/S1319157815000166
18
+ # 2017: https://www.accentsjournals.org/PaperDirectory/Journal/IJACR/2018/3/3.pdf
19
+ # anything newer???
20
+ # some of the above testable at: http://arabic.emi.ac.ma:8080/SafarWeb/faces/safar/morphology/stemmer.xhtml
21
+
22
+ def extract_root(word); {'ʿāʾila':'ʿwl','masāʾikà':'msʾ'}[word.to_sym]; end
23
+
24
+ # with the above two in place:
25
+
26
+ arabic = %w[ʿāʾila masāʾikà].map{|s|
27
+ words = s.split ' '
28
+ words.map{|w|
29
+ w.gsub! /āʾi/, arabic_roots(with_letter: 'ʾ').include?(extract_root(w)) ? 'āSTANDALONE_HAMZAi' : 'āYA_AS_HAMZA_CARRIERi'
30
+ [
31
+ {'YA_AS_HAMZA_CARRIER':'ﺉ', 'STANDALONE_HAMZA':'ﺀ'},
32
+ {'ʿ':'ﻉ', 'ā':'ﺍ', 'i':'ِ◌', 'l':'ﻝ', 'a':'َ◌', 'm':'ﻡ', 's':'ﺱ', 'k':'ﻙ', 'à':'َ◌'}
33
+ ].each{|list| list.each{|k,v| w.gsub! k.to_s, v}}
34
+ w.gsub! /◌$/, 'ﺓ'
35
+ }
36
+ words.join(' ').gsub('◌','')
37
+ }
38
+
39
+ # use actual tests from current code instead; also generate more from existing known-good data!
40
+
41
+ tests = (arabic == ["ﻉﺎﺌِﻟَﺓ", "ﻢَﺳﺍﺀِﻙَﺓ"])
42
+ ```
data/exe/precise ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Decide where to load the library from: when this script's own path ends in
# "exe/<name>", the sibling lib/ directory is used directly; otherwise the
# library is required through the regular load path.
script_name = File.basename(__FILE__, '.rb')
run_from_exe_dir = __FILE__.match?(/exe\/#{script_name}$/)

if run_from_exe_dir
  require_relative "../lib/#{script_name}"
else
  require script_name
end

# hand control over to the command line interface
Precise::CLI.start
@@ -0,0 +1,24 @@
1
# Generic refinements of core classes; activate with `using CoreExtensions`.
module CoreExtensions
  refine String do
    # Returns a copy of the string with its "first letter" upcased.
    #
    # - strings starting with a, i or u are returned unchanged
    # - otherwise the first character is upcased unless the string starts with "al"
    # - if the first character is ʾ or ʿ, the second character is upcased as well
    def precise_titlecase
      letters = chars
      starts_with_short_vowel = %w[a i u].include?(letters[0])
      letters.each_with_index.map do |letter, pos|
        next letter if starts_with_short_vowel

        leading_letter   = pos.zero? && self[0..1] != 'al'
        after_half_ring  = pos == 1 && %w[ʾ ʿ].include?(letters[0])
        (leading_letter || after_half_ring) ? letter.upcase : letter
      end.join
    end
  end

  refine Array do
    # Returns a new array holding the UTF-8 encoded string form of every element.
    def each_utf8_encode
      map do |element|
        element.to_s.encode('utf-8')
      end
    end
  end

  refine Hash do
    # Returns a new hash in which keys and values are strings; values that are
    # exactly of class Array are turned into arrays of strings instead.
    def keys_and_values_to_s
      each_with_object({}) do |(key, value), stringified|
        stringified[key.to_s] =
          if value.instance_of?(Array)
            value.map(&:to_s)
          else
            value.to_s
          end
      end
    end
  end
end
@@ -0,0 +1 @@
1
# Minimal debug facility: `dbg "message"` prints the message only while the
# global verbosity level $dbg is greater than zero.
#
# A top-level `def` creates a PRIVATE method on Object, which `respond_to?`
# ignores unless its second argument is true - without it the guard below never
# detects an existing definition. The level itself is only initialised when it
# has not been set yet, so loading this file again does not reset it.
$dbg ||= 0

unless respond_to?(:dbg, true)
  def dbg(str)
    puts str if $dbg > 0
  end
end
@@ -0,0 +1,19 @@
1
module Precise
  # Base class of all errors defined by this gem, so that callers can rescue
  # everything with a single `rescue Precise::Error`.
  class Error < StandardError; end

  # Raised when an input string cannot be transcribed.
  #
  # msg            - human-readable description (has a default)
  # exception_type - symbol classifying the failure, exposed via #exception_type
  class TranscriptionError < Error
    attr_reader :exception_type

    def initialize(msg="unable to transcribe input string", exception_type=:untranscribable)
      @exception_type = exception_type
      super(msg)
    end
  end

  # Raised when the input is not (entirely) a romanisation of Arabic.
  #
  # msg            - human-readable description (has a default)
  # exception_type - symbol classifying the failure, exposed via #exception_type
  class NotATranscriptionError < Error
    attr_reader :exception_type

    def initialize(msg="input string is not (entirely) a romanisation of Arabic", exception_type=:untranscribable)
      @exception_type = exception_type
      super(msg)
    end
  end
end
@@ -0,0 +1,11 @@
1
module Precise
  class Transcription
    # Sets up a transcriber.
    #
    # opts - option hash merged over the defaults {punctuation: true, verbosity: 0}.
    #        A :verbose key is always removed from the stored options; when its
    #        value is exactly `true`, the verbosity is raised by 2.
    #
    # Side effect: the resulting verbosity is ADDED to the global debug level $dbg.
    def initialize(opts = {})
      @opts = {punctuation: true, verbosity: 0}.merge(opts)
      verbose_flag = @opts.delete(:verbose)
      @opts[:verbosity] += 2 if verbose_flag == true
      $dbg += @opts[:verbosity]
      # romanised output is collected here chunk by chunk
      @out_chunks = []
    end
  end
end
@@ -0,0 +1,123 @@
1
+ module Precise
2
+
3
+ using CoreExtensions # the more generic ones
4
+
5
+ class Transcription
6
# Builds the final romanised string from the chunks collected in @out_chunks.
#
# Every chunk first goes through a fixed, ordered list of line-anchored
# replacements; the chunks are then joined with single spaces, the Arabic
# question mark and comma are swapped for "?" and ",", whitespace in front of
# punctuation is removed and the spacing around "(" is rearranged.
def transcription
  anchored_expansions = [
    [/^m$/, 'mīlādī'],
    [/^h$/, 'hijrī'],
    [/^wāltī$/, 'wa-l-lātī'],
    [/^wālḏī$/, 'wa-l-lāḏī'],
    [/^hy$/, 'hiya'],
    [/^ʿlá$/, 'ʿalá'],
    [/^mn$/, 'min'],
    [/^yd$/, 'yad'],
    [/^fy$/, 'fī'],
    [/^lhā$/, 'lahā']
  ]

  expanded = @out_chunks.map do |chunk|
    anchored_expansions.inject(chunk) do |text, (pattern, full_form)|
      text.gsub(pattern, full_form)
    end
  end

  sentence = expanded.join(' ')
  sentence = sentence.gsub('؟', '?')
  sentence = sentence.gsub('،', ',')
  # pull punctuation up against the preceding word
  sentence = sentence.gsub(/\s+([[:punct:]]+)/, '\1')
  sentence.gsub(/(?!(\s+|^))\(\s+/, ' (')
end
25
+
26
+ A2R = A2RTable = {
27
+ "ال": "al-",
28
+ "ء": "ʾ",
29
+ "آ": "ʾā",
30
+ "أ": "ʾa",
31
+ "أُ": "ʾu",
32
+ "إ": "ʾi",
33
+ "ا": "ā",
34
+ "ب": "b",
35
+ "ة": "a",
36
+ "ت": "t",
37
+ "ث": "ṯ",
38
+ "ج": "ǧ",
39
+ "ح": "ḥ",
40
+ "خ": "ḫ",
41
+ "د": "d",
42
+ "ذ": "ḏ",
43
+ "ر": "r",
44
+ "ز": "z",
45
+ "س": "s",
46
+ "ش": "š",
47
+ "ص": "ṣ",
48
+ "ض": "ḍ",
49
+ "ط": "ṭ",
50
+ "ظ": "ẓ",
51
+ "ع": "ʿ",
52
+ "غ": "ġ",
53
+ "ف": "f",
54
+ "ق": "q",
55
+ "ك": "k",
56
+ "ل": "l",
57
+ "م": "m",
58
+ "ن": "n",
59
+ "ه": "h",
60
+ "و": ["ū", "w"],
61
+ "ى": "á",
62
+ "ي": ["ī", "y"],
63
+ "َ": "a",
64
+ "ُ": "u",
65
+ "ِ": "i",
66
+ "پ": "p",
67
+ "چ": "č",
68
+ "ژ": "ž",
69
+ "گ": "g",
70
+ "٠": "0",
71
+ "١": "1",
72
+ "٢": "2",
73
+ "٣": "3",
74
+ "٤": "4",
75
+ "٥": "5",
76
+ "٦": "6",
77
+ "٧": "7",
78
+ "٨": "8",
79
+ "٩": "9",
80
+ }.map{|k,v| [k.to_s, v]}.to_h
81
+ SHADDA=' ّ'.strip
82
+
83
# Romanises an Arabic-script string chunk by chunk, appending to @out_chunks.
#
# arabic - the string to romanise
#
# The input is split on runs of whitespace, ASCII digits and punctuation.
# Non-word runs are stored stripped; every word gets one output string that is
# filled character by character from the A2R table:
# - a word-initial "ال" is emitted as the article and consumes two characters
# - table entries that are arrays (long vowel first, consonant last) yield the
#   consonant at the start or end of a word and the long vowel elsewhere
# - a shadda repeats the romanisation of the preceding character
# - characters missing from the table are dropped
#
# Returns the array of input chunks (as before).
def transcribe(arabic)
  non_word_rgx = /([\s\d[:punct:]]+)/
  arabic.split(non_word_rgx).each do |chunk|
    next if chunk.strip.empty?
    if chunk.match?(non_word_rgx)
      @out_chunks << chunk.strip
      next
    end

    chars = chunk.chars
    romanized = +'' # unary plus: stays mutable even under frozen string literals
    @out_chunks << romanized
    skip = 0
    chars.each_with_index do |ch, j|
      if skip > 0
        skip -= 1
        next
      end
      if j == 0 && chunk.match?(/^ال/)
        romanized << A2R['ال']
        skip += 1 # the lam has been consumed together with the alif
        next
      end

      mapped = A2R[ch]
      if mapped.is_a?(Array)
        at_word_edge = (j == 0 || j + 1 == chunk.length)
        mapped = at_word_edge ? mapped.last : mapped.first
      end

      # Shadda doubles the preceding letter. Guard against a leading shadda
      # (index -1 would wrap round to the last character), against preceding
      # characters missing from the table (nil) and against array entries, for
      # which the consonantal reading is the one being doubled.
      if ch == SHADDA && j > 0
        doubled = A2R[chars[j - 1]]
        doubled = doubled.last if doubled.is_a?(Array)
        romanized << doubled if doubled
      end

      romanized << mapped if mapped
    end
  end
end
114
+
115
# Convenience entry point: romanises +arabic+ with a fresh instance built from
# +opts+ and returns the resulting string. Two notices about the limits of the
# romanisation are written to standard error first.
def self.transcribe(arabic, opts={})
  [
    "Romanisation is incomplete.",
    "Consider adding short vowels by hand as needed."
  ].each { |notice| warn notice.yellow }

  transcriber = new(opts)
  transcriber.transcribe(arabic)
  transcriber.transcription
end
122
+ end
123
+ end
@@ -0,0 +1,447 @@
1
+ module Precise
2
+
3
+ using CoreExtensions # the more generic ones
4
+
5
module CoreExtensions # the ones specific to this module
  refine String do
    # Returns a stripped copy of the string with the unwanted character groups
    # removed or normalised.
    #
    # opts - option hash merged over the defaults
    #        {punctuation: true, tashkeel: true, alif_variants: true};
    #        default output is "with everything", so a group is only touched
    #        once its option is set to a falsy value:
    #        :punctuation   - remove all punctuation characters
    #        :tashkeel      - remove the characters listed in
    #                         Precise::Transcription::Tashkeel and ::Nonprintables
    #        :alif_variants - replace every character listed in
    #                         Precise::Transcription::AlifVariants by a plain alif
    #        Other keys are ignored.
    def apply_options(opts)
      defaults = {punctuation: true, tashkeel: true, alif_variants: true}
      opts = defaults.merge opts
      s = self.dup

      s = s.gsub(/[[:punct:]]+/, '') unless opts[:punctuation]

      unless opts[:tashkeel]
        tashkeel = Precise::Transcription::Tashkeel
        nonprintables = Precise::Transcription::Nonprintables
        extraneous_chars = (tashkeel + nonprintables).join
        s = s.gsub(/[#{extraneous_chars}]/, '')
      end

      unless opts[:alif_variants]
        # The list must be joined before it is interpolated: interpolating the
        # Array itself inserts its inspect form, which puts quotes, commas and
        # spaces into the character class and turns those into alifs as well.
        alif_variants = Precise::Transcription::AlifVariants.join
        s = s.gsub(/[#{alif_variants}]/, 'ا')
      end

      s.strip
    end
  end
end
34
+
35
+ class Transcription
36
+ using Precise::CoreExtensions
37
+
38
+ # Ruby would have been fine with these in the file verbatim (on their own),
39
+ # alas, my editor's syntax highlighting can't cope, so doing it 1990s-style
40
+ Fatha, Kasra, Damma, Shadda = ["\u064e", "\u0650", "\u064f", "\u0651"].each_utf8_encode
41
+ # nonprintables
42
+ R2LM, L2RM, ZWNJ = ["\u200f", "\u200e", "\u200c"].each_utf8_encode
43
+ # typographic modifiers, ligatures, oft-used words
44
+ Tatweel, Allah = ["ـ", "الله‎"]
45
+ # the various forms of alif, ya and waw
46
+ AlifVariants = ['أ', 'إ', 'آ', 'ا', 'ٱ']
47
+ AlifHamzaAbove, AlifHamzaBelow, AlifMadda, Alif, AlifWasla = AlifVariants
48
+ YaHamzaAbove, Ya = ['ئ', 'ي']
49
+ WawHamzaAbove, Waw = ['ؤ', 'و']
50
+ # other character lists
51
+ Tashkeel = ("064B".to_i(16).."065B".to_i(16)).map{|dec| hex=("%04x" % dec); eval("char=\"\\u#{hex}\"; char")}
52
+ Nonprintables = [R2LM, L2RM]
53
+
54
+ R2ATables = {
55
+ # Adapted from the Transcription in the Brill PDF's "Note to the Indices":
56
+ # - a dash, depending on its position, denotes the start or end of the word
57
+ # - an array denotes the requirement for a choice to be made from context
58
+ # - any characters that are being replaced by DMG characters have been omitted
59
+ common: {
60
+ ʾ: :ء,
61
+ b: :ب,
62
+ p: :پ,
63
+ t: :ت,
64
+ ḥ: :ح,
65
+ d: :د,
66
+ r: :ر,
67
+ z: :ز,
68
+ s: :س,
69
+ ṣ: :ص,
70
+ ḍ: :ض,
71
+ ṭ: :ط,
72
+ ẓ: :ظ,
73
+ ʿ: :ع,
74
+ f: :ف,
75
+ q: :ق,
76
+ k: :ك,
77
+ g: :گ,
78
+ l: :ل,
79
+ m: :م,
80
+ n: :ن,
81
+ h: :ه,
82
+ w: :و,
83
+ y: :ي,
84
+ ā: :ا,
85
+ ū: :و,
86
+ ī: :ي,
87
+ },
88
+ vowels: {
89
+ a: Fatha,
90
+ à: Fatha, # at word-end only
91
+ u: Damma,
92
+ i: Kasra,
93
+ },
94
+ combos: {
95
+ aw: :َو,
96
+ ay: :َي
97
+ },
98
+ brockelmann: {
99
+ '-a': :ة, # "-" = at word-end
100
+ '-at': :ة, # "-" = at word-end
101
+ 'al-': :ال, # "-" = at word-start
102
+ },
103
+ dmg: {
104
+ ṯ: :ث,
105
+ ǧ: :ج,
106
+ č: :چ,
107
+ ḫ: :خ,
108
+ ḏ: :ذ,
109
+ ž: :ژ,
110
+ š: :ش,
111
+ ġ: :غ
112
+ },
113
+ uppercase: {
114
+ A: :أَ,
115
+ I: :إِ,
116
+ U: :أُ,
117
+ Y: :ي
118
+ },
119
+ farsi: {
120
+ v: :و, # always? what, e.g. about "Divbandi"?
121
+ e: [:ه, Fatha] # word-end, mid-word
122
+ },
123
+ turkic: {
124
+ ö: :و,
125
+ ü: Damma, # ???
126
+ ı: Kasra, # ???
127
+ E: :ا
128
+ },
129
+ indic: {
130
+ ō: :و # things like "Bh" => "بْ" would go here, too
131
+ },
132
+ romanic: {
133
+ c: :ث, # or should this rather be a س?
134
+ o: :و,
135
+ Ė: :إي,
136
+ x: :كس
137
+ },
138
+ semitic: {
139
+ ē: :ﺍ # is that always so?
140
+ },
141
+ finnic: {
142
+ ä: Fatha # in e.g. Mänglī
143
+ },
144
+ precise: {
145
+ á: :ى,
146
+ Ā: :آ, # don't add 'ʾĀ' here - it is considered an error in the input!
147
+ 'ʾā': :آ # same but lowercase - alif madda in the middle of the word
148
+ }
149
+ }
150
+
151
+ PostR2AWordReplacements = {
152
+ /^(.*)لّاه/ => '\1 الله', # names ending in "allah"
153
+ /(ب\.|إبن|إِبن)/ => 'بن', # "son of"
154
+ /أَبي/ => 'أبي', # "father of" (gen.)
155
+ /أَبو/ => 'أبو', # "father of" (nom.)
156
+ /بَكر/ => 'بكر', # the name "bakr"
157
+ /عَلي/ => 'علي', # the name "ali"
158
+ /عَبد/ => 'عبد', # the name-part "abd"
159
+ /افندي/ => 'افندی' # ottoman/turkish effendi
160
+ # /([یي]زاده$)/ => ZWNJ+'ی'+ZWNJ+'زاده', # names ending in "-azade" # removed at DK's request
161
+ }
162
+
163
+ PostR2AContextReplacements = {
164
+ /((^|\.\s+)بن(\s+))/ => 'ابن\3', # exception: son-of in beginning of sentence
165
+ /(تِ|تُ|تَ)(\s+)/ => 'ة ', # this'll lose the case ending, but that's for the better
166
+ /داوود/ => 'داود' # not sure if this might actually hold true for all ...wū...?
167
+ }
168
+
169
+ PunctSepRgx = /[ \.\-\(\)\?\&=,;:]/
170
+
171
+ R2A = R2ATables.values.inject(:merge) # just one level is enough now
172
+ .keys_and_values_to_s # more convenient to work with
173
+
174
+ SunLetters = %w[t ṯ d ḏ r z s š ṣ ḍ ṭ ẓ l n]
175
+ RomanizedShortVowels = %w[a i u]
176
+ RomanizedLongVowels = %w[ā ū ī]
177
+ # "a" here because of ta'marbouta, "á" because of alif maqsoura, "ā" because of word-final alif mamdouda
178
+ RomanizedConsonantals = SunLetters + %w[m l k q f ġ ʿ ḫ ḥ h ǧ b ʾ a á]
179
+ ArabicScriptVowels = %w[ا ي و]
180
+ ArabicScriptConsonants = %w[ا ب ت ث ج ح خ س ش ص ض ط ظ ع غ ف ق ك ل م ن ه ي ئ ة ى أ إ ؤ ئ آ]
181
+
182
+ LatinChars = R2A.map{|l,a| l unless l.size != 1}.compact
183
+ TranslitChars_lowercase = 'ʾʿḏḥṣḍṭẓāūīṯǧčḫžšġōĖēáäüöü'
184
+ TranslitChars = (TranslitChars_lowercase + TranslitChars_lowercase.upcase).chars.uniq.join
185
+
186
# Returns the word of +str+ that the character at position +idx+ belongs to.
#
# The part before +idx+ contributes its trailing run of non-whitespace
# characters; the part from +idx+ onwards contributes its leading run of
# transliteration and ASCII word characters (or nothing if there is none).
def this_word(str, idx)
  head = str[0...idx][/\S*\z/]
  tail = str[idx..-1][/\A[#{TranslitChars}\w]+/]
  head + (tail || '')
end
189
+
190
# Returns the word that the character at +idx+ belongs to, followed by the
# next word - or +str+ itself when it contains no whitespace at all.
#
# first part:  from the beginning of the string to the index position, the
#              trailing run of non-whitespace characters
# second part: from the index position to the end of the string, the rest of
#              the current word, the separating whitespace and the next word;
#              contributes nothing when there is no next word
#
# The character classes are built from the TranslitChars constant, exactly as
# in #this_word. They used to interpolate @translit_chars, which is never
# assigned, so only ASCII word characters were recognised.
def this_word_and_the_next(str, idx)
  return str unless str.match?(/\s+/)

  head = str[0...idx][/\S*\z/]
  tail = str[idx..-1][/\A[#{TranslitChars}\w]+\s+[#{TranslitChars}\w]+/i]
  head + (tail || '')
end
201
+
202
# Picks the Arabic spelling of a hamza from the vowel that FOLLOWS it.
#
# ch                   - the romanised vowel after the hamza
# pch                  - the romanised character before the hamza (may be nil)
# first_letter_of_word - truthy when the hamza opens a word
#
# Returns the Arabic string, or nil when no rule applies to +ch+.
def hamza_before_following(ch, pch, first_letter_of_word = false)
  vowel = ch.to_sym

  # long ī and ū are spelled the same way in every position
  return "#{YaHamzaAbove}#{R2A[ch]}" if vowel == :ī
  return "#{WawHamzaAbove}#{R2A[ch]}" if vowel == :ū

  if first_letter_of_word
    {
      a: AlifHamzaAbove,
      u: AlifHamzaAbove,
      i: AlifHamzaBelow,
      ā: AlifMadda
    }[vowel]
  elsif %w[y ī].include?(pch)
    # what PRECEDED the hamza is taken into account here as well
    {
      a: YaHamzaAbove,
      i: YaHamzaAbove,
      u: WawHamzaAbove
    }[vowel]
  elsif vowel == :a
    AlifHamzaAbove
  elsif vowel == :i
    YaHamzaAbove
  elsif vowel == :u
    pch == 'ū' ? R2A['ʾ'] : WawHamzaAbove
  end
end
233
+
234
# Picks the Arabic spelling of a hamza from the short vowel that PRECEDES it.
#
# ch                   - the romanised vowel before the hamza; matched
#                        case-insensitively, because the caller selects it with
#                        `ch.downcase` and may therefore pass an uppercase
#                        letter, which used to match no rule
# first_letter_of_word - truthy when +ch+ opens a word
#
# Returns the Arabic string, or nil when no rule applies to +ch+.
def hamza_after_preceding(ch, first_letter_of_word = false)
  vowel = ch.to_s.downcase.to_sym

  if first_letter_of_word
    case vowel
    when :a then AlifHamzaAbove
    when :u then R2A['ā']+Damma+WawHamzaAbove
    when :i then R2A['ā']+YaHamzaAbove
    end
  else
    case vowel
    when :a then AlifHamzaAbove
    when :i, :ī then YaHamzaAbove
    when :u then WawHamzaAbove
    end
  end
end
250
+
251
# Decides which alif a word starting with "i" is written with.
#
# word - the romanised word
#
# The word is compared against a few fixed shapes of the same length, in which
#   C = any consonantal (RomanizedConsonantals)
#   s = any short vowel (RomanizedShortVowels)
#   l = any long vowel  (RomanizedLongVowels)
#   anything else = that exact character
# A word that fits a shape and does not start with "ist" (case-insensitively)
# gets AlifHamzaBelow; every other word gets the plain Alif.
def alif_for_word_initial_kasra(word)
  shapes = %w[iCClC iCCiCClC iClCC]
  letter_classes = {
    'C' => RomanizedConsonantals,
    's' => RomanizedShortVowels,
    'l' => RomanizedLongVowels
  }
  letters = word.chars

  fits_a_shape = shapes.any? do |shape|
    next false unless word.size == shape.size

    shape_matches = letters.each_with_index.all? do |letter, pos|
      slot = shape[pos]
      letter_classes.key?(slot) ? letter_classes[slot].include?(letter) : letter == slot
    end
    # استـ introduces
    shape_matches && !word.downcase.match?(/^ist/)
  end

  alif = fits_a_shape ? AlifHamzaBelow : Alif
  puts "\t\tfor #{word}: word-initial #{alif}".light_blue if $dbg > 1
  alif
end
285
+
286
# Returns a cleaned-up copy of +str+:
# - the non-printable characters U+200C (ZWNJ) and U+200F are removed
#   FIXME: the erroneous_chars replacement table should have already taken care of this?!
# - every letter directly following ʿ or ʾ is made lowercase
#
# The argument itself is left untouched. The previous implementation removed
# the characters with gsub!, which modified the caller's string and raised
# FrozenError for frozen input containing one of them.
def sanitize(str)
  cleaned = str.delete("\u200c\u200f")
  previous = ''
  cleaned.each_char.map do |c|
    c = c.downcase if previous.match?(/[ʿʾ]/)
    previous = c
  end.join
end
293
+
294
+ # input: valid Precise string
295
+ # example: (al-)ʿAbbādī Muḥammad Ibn Aḥmad Ibn Muḥammad al-Harawī
296
+ # output: Arabic string
297
+ # example: العَبّادي مُحَمَّد بن أَحمَد بن مُحَمَّد الهَرَوي
298
+ def reverse(romanized)
299
+ raise Precise::NotATranscriptionError if romanized.nil?
300
+
301
+ # sure, it's called "Precise", but it should still be
302
+ # as tolerant as possible in what it accepts as input...
303
+ romanized = sanitize(romanized)
304
+ arabic = '' # we start with an empty string and go character by character
305
+
306
+ puts "- (#{romanized.size}) [#{romanized}]".light_green if $dbg > 1
307
+
308
+ # next, turn strings into character arrays
309
+ romanized = romanized.chars
310
+ arabic = arabic.chars
311
+ # to be able to merge 2 romanized characters into 1 arabic character
312
+ skip = false
313
+ # print string like so: ʿ·A·b·b·ā·d·ī· ·M·u·ḥ·a·m·m·a·d· ·I·b·n· ·A·ḥ·m·a·d· ·I·b·n· ...
314
+ puts "- (#{romanized.size}) [#{romanized.join('·')}]".light_green if $dbg > 1
315
+
316
+ # loop over the romanized character array, filling the arabic one up as we go
317
+ romanized.each_with_index do |ch,i|
318
+ # a little bit of context
319
+ pch = i == 0 ? nil : romanized[i-1]
320
+ fch = romanized[i+1]
321
+ ffch = romanized[i+2]
322
+
323
+ # multi-letter skip-aheads
324
+ if skip
325
+ dbg "\t\tskipping #{ch}"
326
+ if !(pch=='a' && fch=='-') # we're in the middle of "al-" (word-start)
327
+ skip=false; end; next; end
328
+
329
+ # symbols to remove from input
330
+ (dbg "\tskipping unprintable symbol"; next) if [ZWNJ].include?(ch)
331
+
332
+ # deal with alif madda before "normal" hamza rules follow
333
+ if ("#{ch}#{fch}".match?(/ʾā/) || "#{pch}#{ch}".match?(/^Ā/))
334
+ (dbg "\talif madda #{R2A['ʾā']}"; arabic << R2A['ʾā']; skip=true; next); end
335
+
336
+ # hamza followed by a short or long vowel
337
+ if ch == 'ʾ' && %w[a i u ā ī ū].include?(fch.to_s.downcase)
338
+ is_first_letter_of_word = (pch.nil? || pch.match(/\s+/))
339
+ (dbg "\t#{ch} with following #{fch}";
340
+ arabic << hamza_before_following(fch, pch, is_first_letter_of_word);
341
+ skip=true unless this_word(romanized.join, i).match?(/(a$|at($|\s))/)
342
+ next); end
343
+ # hamza preceded by a short vowel
344
+ # (beware of a possible alif madda (would be dealt with above, on the next round))
345
+ if fch.to_s == 'ʾ' && !ffch.to_s.match?(/[āĀ]/) && %w[a i u].include?(ch.downcase)
346
+ is_first_letter_of_word = (pch.nil? || pch.match(/\s+/))
347
+ (dbg "\t#{fch} carried on or following preceding #{ch}"
348
+ arabic << hamza_after_preceding(ch, is_first_letter_of_word); skip=true; next); end
349
+
350
+ # find the article "al", marked by having a dash appended to it
351
+ (dbg "\tarticle al- #{R2A['al-']}"; arabic << R2A['al-']; skip=true; next) if ("#{ch}#{fch}#{ffch}" == 'al-')
352
+
353
+ # unconditionally add spaces, dots and dashes to the output
354
+ (dbg "\tinitial only (#{pch}#{ch})"; arabic << ch; next) if ch=='.' && (fch.nil? || fch.match(/\s+/))
355
+ (dbg "\tnon-letter (#{ch})"; arabic << ch; next) if ch.match(PunctSepRgx) # white space or punctuation
356
+
357
+ # a word-initial "a" or "u" must always be preceded by "ʾ"; only "i" can possibly *not* have one
358
+
359
+ # deal with word-initial special cases
360
+ if pch.to_s.strip.empty? # either beginning of string or of word
361
+ if %w[a u].include?(ch)
362
+ (dbg "\tprepending #{ch} with hamza"; arabic << R2A[ch.upcase]; next); end
363
+ if ch == 'i'
364
+ (dbg "\thamza-less alif?"
365
+ context = this_word(romanized.join, i)
366
+ arabic << alif_for_word_initial_kasra(context.split(/^w?al-/).last)
367
+ next); end; end
368
+
369
+ # perform tashdeed
370
+ (out=R2A[ch]+Shadda; dbg "\ttashdeed of #{ch} #{out}"; arabic << out; skip = true; next) if R2A[ch] && ch==fch
371
+
372
+ # should there be a ta'marbouta or not at the end of the word?
373
+ context1 = this_word(romanized.join,i)
374
+ context2 = this_word_and_the_next(romanized.join,i)
375
+ if context1 == context2 # single word
376
+ if (i == context1.length-2 && "#{ch}#{fch}".match?(/at$/)) \
377
+ || (i == context1.length-1 && "#{ch}#{fch}".match?(/a$/))
378
+ arabic << R2A['-at']+' '; skip=true; next
379
+ end
380
+ else # multiple words
381
+ if (i == context1.length-2 && "#{ch}#{fch}#{ffch}".match?(/at\s/))
382
+ arabic << R2A['-a']+' '; skip = true; next
383
+ elsif (i == context1.length-1 && "#{ch}#{fch}".match?(/a\s/))
384
+ arabic << R2A['-a']+' '; next
385
+ end
386
+ end
387
+
388
+ # letter ayn followed by uppercase vowel
389
+ if ch == 'ʿ'
390
+ (skip=true; ar=R2A[ch]) if %w[A I U].include?(fch)
391
+ case fch # ayn+following vowel at beginning of word
392
+ when 'A' then ar+=Fatha
393
+ when 'I' then ar+=Kasra
394
+ when 'U' then ar+=Damma; end; end
395
+ (dbg "\tayn+vowel #{ch}#{fch} #{ar}"; arabic << ar; next) if ar && ar.size==2
396
+
397
+ # long "a" at word-end: alif maqsoorah, otherwise normal alif
398
+ # "e" at word-end: letter hah, otherwise just a fatha
399
+ if R2A[ch].class == Array
400
+ choice = (fch.nil? || fch==' ') ? R2A[ch].first : R2A[ch].last
401
+ (dbg "\tcontextual #{ch} #{choice}"; arabic << choice; next); end
402
+
403
+ # exact match (pure transliteration, no transcription effort required)
404
+ (dbg "\tfrom table #{ch}→#{R2A[ch]}"; arabic << R2A[ch]; next) if R2A[ch]
405
+
406
+ # no luck yet; might be a regular uppercase letter
407
+ (dbg "\tuppercased #{ch} #{R2A[ch.downcase]}"; arabic << R2A[ch.downcase]; next) if R2A[ch.downcase]
408
+
409
+ # still no luck; last shot is punctuation
410
+ (dbg "\tinterpunctuation #{ch}"; arabic << ch; next) if ch.match?(/[[:punct:]]/)
411
+
412
+ # mark unknown characters as such; the philosophy here being that input to
413
+ # Precise should be pre-processed enough for this to never have to happen…
414
+ warn "Warning: character '#{ch}' is unknown to Precise and will be substituted by placeholder only".yellow
415
+ arabic << '�'
416
+ end
417
+
418
+ # character-array to word-array
419
+ arabic = arabic.compact.join.split
420
+ # العأَبّادي محمّد إِبن أَحمد إِبن محمّد للهروي (but with () around "al")
421
+ puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1
422
+
423
+ # dragnet replacement of special words, such as changing "ibn" into "bin"
424
+ 2.times.each_with_index do |i|
425
+ puts "#{' '*6}(postprocessing round #{i+1})".light_green if $dbg > 1
426
+ PostR2AWordReplacements.each{|rgx,subst|
427
+ arabic.map!{|w|
428
+ puts "#{' '*8}word match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (w.match(rgx) && $dbg > 1)
429
+ w.gsub(/-/, '') # dashes not needed anymore now
430
+ .gsub(rgx, subst)} }
431
+ end
432
+
433
+ # some rules apply only in the context of words, not letters
434
+ puts "- (#{arabic.join(' ').size-1}) [#{L2RM+arabic.join(' ')+L2RM}]".light_green if $dbg > 1
435
+ arabic = arabic.join(' ')
436
+ PostR2AContextReplacements.each{|rgx,subst|
437
+ puts "#{' '*8}context match: #{L2RM}#{rgx.inspect} #{L2RM}=> #{L2RM}'#{subst}'".green if (arabic.match(rgx) && $dbg > 1)
438
+ arabic.gsub!(rgx, subst) }
439
+
440
+ return arabic.apply_options(@opts)
441
+ end
442
+
443
# Convenience entry point: converts +romanized+ with a fresh instance built
# from +opts+ and returns whatever the instance method #reverse returns.
def self.reverse(romanized, opts={})
  transcriber = new(opts)
  transcriber.reverse(romanized)
end
446
+ end
447
+ end
@@ -0,0 +1,37 @@
1
+ require 'fileutils'
2
+
3
+ module Precise
4
+ using CoreExtensions
5
+
6
# List of known word forms ("types") loaded from res/types.lst, used to
# report which share of the words in a string appears in that list.
class TypesList
  # Process-wide cache: the list file is read and indexed once and then
  # shared by every instance.
  @@types = nil

  # Loads the types list, downloading it first if res/types.lst is missing.
  # Subsequent instances reuse the already loaded list.
  def initialize
    @types = (@@types ||= load_types)
  end

  # Fetches the types list from GitHub and stores it at +path+.
  #
  # The data is written to "<path>.part" first and only renamed to +path+
  # once the transfer has finished, so an interrupted download can never be
  # mistaken for a complete list on the next run.
  def download(path)
    puts 'downloading types database (only needed once)...'
    require 'open-uri'
    url = 'https://raw.githubusercontent.com/sixtyfive/arabic-types/main/types.lst'
    partial = "#{path}.part"
    begin
      # block form closes the remote stream as soon as copying is done
      URI.open(url) { |remote| IO.copy_stream(remote, partial) }
      File.rename(partial, path)
    ensure
      FileUtils.rm_f(partial) # no-op after a successful rename
    end
  end

  # Returns the percentage (Float, 0.0..100.0) of whitespace-separated words
  # in +string+ that are present in the types list. A string without any
  # words yields 0.0 (instead of NaN from a division by zero).
  def percentage_of_tokens_present(string)
    words = string.split
    return 0.0 if words.empty?

    n_present = words.count { |w| @types.include?(w) }
    100.0 / words.length * n_present
  end

  # Convenience wrapper around #percentage_of_tokens_present that takes care
  # of instantiation.
  def self.percentage_of_tokens_present(string)
    new.percentage_of_tokens_present(string)
  end

  private

  # Ensures the list file exists, then reads it into a Set (one entry per
  # line) so that membership tests do not scan the whole list per word.
  def load_types
    require 'set' # Set is only autoloaded from Ruby 3.2 on
    resdir = File.join(__dir__, '..', '..', 'res')
    FileUtils.mkdir_p(resdir)
    typesfile = File.absolute_path(File.join(resdir, 'types.lst'))
    download(typesfile) unless File.exist?(typesfile)
    # read explicitly as UTF-8 so lookups also work under non-UTF-8 locales
    File.readlines(typesfile, chomp: true, encoding: 'UTF-8').to_set
  end
end
37
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Precise
  # Release number of the gem; precise.gemspec assigns it to spec.version.
  VERSION = '0.1.8'
end
data/lib/precise.rb ADDED
@@ -0,0 +1,67 @@
1
+ require 'pp'
2
+ require 'slop'
3
+ require 'yaml'
4
+ require 'tiny_color'
5
+
6
+ deps = %w[version debugging error_classes core_extensions transcription transcription_r2a transcription_a2r types_list]
7
+ deps.each{|d| require_relative File.join(__dir__,'..','lib','precise',d)}
8
+
9
+ module Precise
10
# Command-line front end: parses ARGV, picks the transcription direction
# from the script of the input and prints the result to stdout.
class CLI
  # Runs the whole command-line session. Exits early (via #usage or #rules)
  # for --help, --show-rules, invalid options or missing input strings.
  def initialize
    parser = Slop::Parser.new(option_definitions)

    begin
      @opts = parser.parse(ARGV)
    rescue Slop::Error => e
      # Only option-parsing problems end up here; anything else is a real
      # error and must not be disguised as a usage message.
      warn e.message
      @opts = parser.parse([]) # #usage needs a result object to print
      usage(1)
    end

    usage if @opts[:help]
    rules if @opts.to_h[:show_rules]
    usage if @opts.arguments.empty? # nothing to transcribe

    puts transcribe(@opts.arguments.join(' ')).pretty_inspect.gsub(/(^"|"$)/, "").strip
  end

  # Prints the option summary to stderr and terminates the process with
  # +status+ (0 by default, as before; 1 is used for invalid options).
  def usage(status = 0)
    warn @opts
    exit status
  end

  # Dumps the romanised-to-Arabic rule tables as YAML and terminates.
  def rules
    puts Precise::Transcription::R2ATables.map{|k,v| Hash[k.to_s,v.map{|kk,vv| Hash[kk.to_s,vv]}]}.to_yaml.gsub(/---\n/,'')
    exit
  end

  # Number of options whose parsed value is truthy.
  def nopts
    @opts.to_h.values.count { |value| value }
  end

  def self.start; new; end

  private

  # Declares all command-line switches together with the help texts.
  def option_definitions
    opts = Slop::Options.new
    opts.banner = "Usage: precise [options] <string(s)>\n"
    opts.separator " where options can be:\n"
    alif_variants = Precise::Transcription::AlifVariants
    opts.bool "-s", "--show-rules", "print the list of rules which are applied for transcription"
    opts.bool "-c", "--confidence", "also print the percentage of output words appearing in a large corpus of Arabic"
    opts.bool "-A", "--no-alif-variants", "all of #{alif_variants.join("، ")} will be merged into ا"
    opts.bool "-T", "--no-tashkeel", "diacritics (and non printables, such as tatweel) will be removed from output"
    opts.bool "-P", "--no-punctuation", "all punctuation characters will be discarded from output"
    opts.bool "-v", "--verbose", "instruct the backend classes to output debugging and plausibility information"
    opts.bool "-h", "--help", "display this message"
    opts.separator "\n Transcription direction is determined by presence of characters from the 'Arabic' Unicode block.\n" \
                   " At present, Arabic-to-Roman transcription is only rudimentary."
    opts
  end

  # Translates the parsed switches into the options hash handed to the
  # transcription backend. The :alif_variants, :tashkeel and :punctuation
  # keys are only set (to false) when the matching --no-… switch was given.
  def transcription_options
    options = {verbose: @opts[:verbose]}
    options[:alif_variants] = false if @opts.to_h[:no_alif_variants]
    options[:tashkeel] = false if @opts.to_h[:no_tashkeel]
    options[:punctuation] = false if @opts.to_h[:no_punctuation]
    options
  end

  # Transcribes +instr+: input containing Arabic-script characters goes
  # through Transcription.transcribe, everything else through
  # Transcription.reverse. With --confidence, the reverse result gets the
  # percentage of its words found in the types list appended.
  def transcribe(instr)
    options = transcription_options
    return Precise::Transcription.transcribe(instr.dup, options) if instr.match?(/\p{Arabic}/)

    outstr = Precise::Transcription.reverse(instr.dup, options)
    outstr += " (#{Precise::TypesList.percentage_of_tokens_present(outstr)}%)" if @opts[:confidence]
    outstr
  end
end
67
+ end
data/precise.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/precise/version'
4
+
5
# Gem specification for precise: identity, packaged files, executables and
# runtime dependencies.
Gem::Specification.new do |spec|
  spec.name = 'precise'
  spec.version = Precise::VERSION
  spec.authors = ['J. R. Schmid']
  spec.email = ['jrs+git@weitnahbei.de']

  spec.summary = 'Arabic to DMG-like (but more precise) and back'
  # Arabicising turns a romanisation back into Arabic script (the previous
  # text said "Latin script" and carried a trailing space).
  spec.description = 'Romanise Arabic script, and arabicise romanisations back into Arabic script'
  spec.homepage = 'https://rubygems.org/gems/precise'
  spec.required_ruby_version = '>= 2.7.0'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/sixtyfive/precise.git'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # dependencies

  spec.add_dependency 'slop'
  spec.add_dependency 'tiny_color'
  spec.add_dependency 'progressbar'
end
metadata ADDED
@@ -0,0 +1,105 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: precise
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.8
5
+ platform: ruby
6
+ authors:
7
+ - J. R. Schmid
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-03-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: slop
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: tiny_color
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: progressbar
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: 'Romanise Arabic script, arabicise romanisations of Arabic script back
56
+ into Latin script '
57
+ email:
58
+ - jrs+git@weitnahbei.de
59
+ executables:
60
+ - precise
61
+ extensions: []
62
+ extra_rdoc_files: []
63
+ files:
64
+ - Gemfile
65
+ - Gemfile.lock
66
+ - LICENSE.md
67
+ - README.md
68
+ - Rakefile
69
+ - TODO.md
70
+ - exe/precise
71
+ - lib/precise.rb
72
+ - lib/precise/core_extensions.rb
73
+ - lib/precise/debugging.rb
74
+ - lib/precise/error_classes.rb
75
+ - lib/precise/transcription.rb
76
+ - lib/precise/transcription_a2r.rb
77
+ - lib/precise/transcription_r2a.rb
78
+ - lib/precise/types_list.rb
79
+ - lib/precise/version.rb
80
+ - precise.gemspec
81
+ homepage: https://rubygems.org/gems/precise
82
+ licenses: []
83
+ metadata:
84
+ homepage_uri: https://rubygems.org/gems/precise
85
+ source_code_uri: https://github.com/sixtyfive/precise.git
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: 2.7.0
95
+ required_rubygems_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ requirements: []
101
+ rubygems_version: 3.3.25
102
+ signing_key:
103
+ specification_version: 4
104
+ summary: Arabic to DMG-like (but more precise) and back
105
+ test_files: []