sanscript 0.8.1 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +14 -2
- data/.travis.yml +3 -3
- data/Cargo.toml +4 -4
- data/Gemfile +1 -0
- data/Rakefile +1 -0
- data/lib/sanscript.rb +1 -0
- data/lib/sanscript/benchmark.rb +31 -62
- data/lib/sanscript/detect.rb +3 -0
- data/lib/sanscript/detect/constants.rb +5 -15
- data/lib/sanscript/detect/ruby24.rb +2 -5
- data/lib/sanscript/detect/ruby2x.rb +2 -5
- data/lib/sanscript/rust.rb +3 -0
- data/lib/sanscript/transliterate.rb +6 -6
- data/lib/sanscript/version.rb +2 -1
- data/sanscript.gemspec +8 -7
- data/src/detect/mod.rs +27 -33
- data/src/lib.rs +30 -0
- metadata +31 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ef6e9b7e12549b07177b807613493b8bbf5df8dab1d762d6f2cbb5db65c3a710
|
4
|
+
data.tar.gz: 9e02c19c6189ca48c95a74b6448cb5f0a2be25fa5fabea8a39082a52d4a8bb41
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 11f53ad681749ca4aed397e3401f40c9f97ef1e3f654d3f23c9dde407636739817d6af962f9ed09d18a4375021822e5fbbb99b38790b011986e362e2c4758a8f
|
7
|
+
data.tar.gz: 1296c18342ff547c831549e3b17412541a7afe18c471e6f0e1867b1f0bb0c24bc3c00e52a5e0d69241936f3ebb576504dcc5cfea4f8fcfc1f1a4484576578fc2
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -1,6 +1,5 @@
|
|
1
|
-
require: rubocop-rspec
|
2
1
|
AllCops:
|
3
|
-
TargetRubyVersion: 2.
|
2
|
+
TargetRubyVersion: 2.2
|
4
3
|
|
5
4
|
# Metrics
|
6
5
|
Metrics/AbcSize:
|
@@ -17,6 +16,10 @@ Metrics/BlockNesting:
|
|
17
16
|
Enabled: true
|
18
17
|
Max: 4
|
19
18
|
|
19
|
+
Metrics/BlockLength:
|
20
|
+
Exclude:
|
21
|
+
- spec/**/*.rb
|
22
|
+
|
20
23
|
Metrics/ClassLength:
|
21
24
|
Description: 'Avoid classes longer than 250 lines of code.'
|
22
25
|
Enabled: true
|
@@ -96,3 +99,12 @@ Style/TrailingCommaInLiteral:
|
|
96
99
|
Style/TrivialAccessors:
|
97
100
|
ExactNameMatch: true
|
98
101
|
AllowPredicates: true
|
102
|
+
|
103
|
+
# RSpec
|
104
|
+
require: rubocop-rspec
|
105
|
+
|
106
|
+
RSpec/ContextWording:
|
107
|
+
Enabled: false
|
108
|
+
|
109
|
+
RSpec/NestedGroups:
|
110
|
+
Max: 3
|
data/.travis.yml
CHANGED
data/Cargo.toml
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
[package]
|
2
2
|
name = "rusty_sanscript"
|
3
|
-
version = "0.4.
|
3
|
+
version = "0.4.1"
|
4
4
|
authors = ["Tim Bellefleur <nomoon@phoebus.ca>"]
|
5
5
|
publish = false
|
6
6
|
|
7
7
|
[dependencies]
|
8
|
-
lazy_static = "^0.2.
|
9
|
-
ruby-sys = "^0.
|
10
|
-
regex = "^0.
|
8
|
+
lazy_static = "^0.2.10"
|
9
|
+
ruby-sys = "^0.3.0"
|
10
|
+
regex = "^0.2.2"
|
11
11
|
|
12
12
|
[lib]
|
13
13
|
crate-type = ["cdylib"]
|
data/Gemfile
CHANGED
data/Rakefile
CHANGED
data/lib/sanscript.rb
CHANGED
data/lib/sanscript/benchmark.rb
CHANGED
@@ -5,7 +5,7 @@ module Sanscript
|
|
5
5
|
require "benchmark/ips"
|
6
6
|
rescue LoadError
|
7
7
|
#:nocov:
|
8
|
-
module ::Benchmark
|
8
|
+
module ::Benchmark # rubocop:disable Style/ClassAndModuleChildren
|
9
9
|
def self.ips(*)
|
10
10
|
raise NotImplementedError, "You must install the `benchmark-ips` gem first."
|
11
11
|
end
|
@@ -15,32 +15,32 @@ module Sanscript
|
|
15
15
|
|
16
16
|
# Benchmark/testing module.
|
17
17
|
module Benchmark
|
18
|
+
TEST_STRINGS = {
|
19
|
+
brahmic: {
|
20
|
+
devanagari: "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः",
|
21
|
+
malayalam: "നാനാശാസ്ത്രസുഭാഷിതാമൃതരസൈഃ ശ്രോത്രോത്സവം കുര്വതാം യേഷാം യാന്തി ദിനാനി പണ്ഡിതജനവ്യായാമഖിന്നാത്മനാമ് തേഷാം ജന്മ ച ജീവിതം ച സുകൃതം തൈര് ഏവ ഭൂര് ഭൂഷിതാ ശേഷൈഹ് കിം പശുവദ് വിവേകരഹിതൈര് ഭൂഭാരഭൂതൈര് നരഃ",
|
22
|
+
}.freeze,
|
23
|
+
roman: {
|
24
|
+
iast: "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ",
|
25
|
+
slp1: "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH",
|
26
|
+
hk: "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH",
|
27
|
+
}.freeze,
|
28
|
+
}.freeze
|
29
|
+
|
30
|
+
TEST_STRINGS_FLAT = TEST_STRINGS.reduce({}) { |a, (_, v)| a.merge(v) }.freeze
|
31
|
+
|
32
|
+
private_constant :TEST_STRINGS, :TEST_STRINGS_FLAT
|
33
|
+
|
18
34
|
module_function
|
19
35
|
|
20
36
|
# Runs benchmark-ips test on detection methods.
|
21
37
|
def detect!(time = 2, warmup = 1)
|
22
|
-
deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
|
23
|
-
malayalam_string = "നാനാശാസ്ത്രസുഭാഷിതാമൃതരസൈഃ ശ്രോത്രോത്സവം കുര്വതാം യേഷാം യാന്തി ദിനാനി പണ്ഡിതജനവ്യായാമഖിന്നാത്മനാമ് തേഷാം ജന്മ ച ജീവിതം ച സുകൃതം തൈര് ഏവ ഭൂര് ഭൂഷിതാ ശേഷൈഹ് കിം പശുവദ് വിവേകരഹിതൈര് ഭൂഭാരഭൂതൈര് നരഃ"
|
24
|
-
iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
|
25
|
-
slp1_string = "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"
|
26
|
-
hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
|
27
|
-
|
28
38
|
::Benchmark.ips do |x|
|
29
39
|
x.config(time: time, warmup: warmup)
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
Sanscript::Detect.detect_scheme(malayalam_string)
|
35
|
-
end
|
36
|
-
x.report("Detect IAST") do
|
37
|
-
Sanscript::Detect.detect_scheme(iast_string)
|
38
|
-
end
|
39
|
-
x.report("Detect SLP1") do
|
40
|
-
Sanscript::Detect.detect_scheme(slp1_string)
|
41
|
-
end
|
42
|
-
x.report("Detect HK") do
|
43
|
-
Sanscript::Detect.detect_scheme(hk_string)
|
40
|
+
TEST_STRINGS_FLAT.each do |scheme, string|
|
41
|
+
x.report("Detect #{scheme}") do
|
42
|
+
Sanscript::Detect.detect_scheme(string)
|
43
|
+
end
|
44
44
|
end
|
45
45
|
x.compare!
|
46
46
|
end
|
@@ -49,39 +49,13 @@ module Sanscript
|
|
49
49
|
|
50
50
|
# Runs benchmark-ips test on roman-source transliteration methods.
|
51
51
|
def transliterate_roman!(time = 2, warmup = 1)
|
52
|
-
iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
|
53
|
-
slp1_string = "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"
|
54
|
-
hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
|
55
|
-
|
56
52
|
::Benchmark.ips do |x|
|
57
53
|
x.config(time: time, warmup: warmup)
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
Sanscript.transliterate(iast_string, :iast, :slp1)
|
64
|
-
end
|
65
|
-
x.report("IAST==>SLP1") do
|
66
|
-
Sanscript.transliterate(iast_string, :iast, :hk)
|
67
|
-
end
|
68
|
-
x.report("SLP1==>Devanagari") do
|
69
|
-
Sanscript.transliterate(slp1_string, :slp1, :devanagari)
|
70
|
-
end
|
71
|
-
x.report("SLP1==>IAST") do
|
72
|
-
Sanscript.transliterate(slp1_string, :slp1, :iast)
|
73
|
-
end
|
74
|
-
x.report("SLP1==>HK") do
|
75
|
-
Sanscript.transliterate(slp1_string, :slp1, :hk)
|
76
|
-
end
|
77
|
-
x.report("HK==>Devanagari") do
|
78
|
-
Sanscript.transliterate(hk_string, :hk, :devanagari)
|
79
|
-
end
|
80
|
-
x.report("HK==>IAST") do
|
81
|
-
Sanscript.transliterate(hk_string, :hk, :iast)
|
82
|
-
end
|
83
|
-
x.report("HK==>SLP1") do
|
84
|
-
Sanscript.transliterate(hk_string, :hk, :slp1)
|
54
|
+
TEST_STRINGS[:roman].to_a.product(TEST_STRINGS_FLAT.keys).each do |(ak, av), bk|
|
55
|
+
next if ak == bk
|
56
|
+
x.report("#{ak} => #{bk}") do
|
57
|
+
Sanscript.transliterate(av, ak, bk)
|
58
|
+
end
|
85
59
|
end
|
86
60
|
x.compare!
|
87
61
|
end
|
@@ -90,18 +64,13 @@ module Sanscript
|
|
90
64
|
|
91
65
|
# Runs benchmark-ips test on brahmic-source transliteration methods.
|
92
66
|
def transliterate_brahmic!(time = 2, warmup = 1)
|
93
|
-
deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
|
94
|
-
|
95
67
|
::Benchmark.ips do |x|
|
96
68
|
x.config(time: time, warmup: warmup)
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
end
|
103
|
-
x.report("Devanagari==>HK") do
|
104
|
-
Sanscript.transliterate(deva_string, :devanagari, :hk)
|
69
|
+
TEST_STRINGS[:brahmic].to_a.product(TEST_STRINGS_FLAT.keys).each do |(ak, av), bk|
|
70
|
+
next if ak == bk
|
71
|
+
x.report("#{ak} => #{bk}") do
|
72
|
+
Sanscript.transliterate(av, ak, bk)
|
73
|
+
end
|
105
74
|
end
|
106
75
|
x.compare!
|
107
76
|
end
|
data/lib/sanscript/detect.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require "sanscript/detect/constants"
|
3
4
|
|
4
5
|
module Sanscript
|
@@ -6,11 +7,13 @@ module Sanscript
|
|
6
7
|
# Developed from code available @ https://github.com/sanskrit/detect.js
|
7
8
|
module Detect
|
8
9
|
if Regexp.method_defined?(:match?)
|
10
|
+
# :nocov:
|
9
11
|
require "sanscript/detect/ruby24"
|
10
12
|
extend Ruby24
|
11
13
|
else
|
12
14
|
require "sanscript/detect/ruby2x"
|
13
15
|
extend Ruby2x
|
16
|
+
# :nocov:
|
14
17
|
end
|
15
18
|
|
16
19
|
# @!method detect_scheme(text)
|
@@ -8,24 +8,14 @@ module Sanscript
|
|
8
8
|
# between Devanagari and Malayalam.
|
9
9
|
RE_BRAHMIC_RANGE = /[\u0900-\u0d7f]/
|
10
10
|
|
11
|
-
#
|
12
|
-
|
13
|
-
devanagari: /\p{Devanagari}/,
|
14
|
-
bengali: /\p{Bengali}/,
|
15
|
-
gurmukhi: /\p{Gurmukhi}/,
|
16
|
-
gujarati: /\p{Gujarati}/,
|
17
|
-
oriya: /\p{Oriya}/,
|
18
|
-
tamil: /\p{Tamil}/,
|
19
|
-
telugu: /\p{Telugu}/,
|
20
|
-
kannada: /\p{Kannada}/,
|
21
|
-
malayalam: /\p{Malayalam}/,
|
22
|
-
}.freeze
|
11
|
+
# The order of individual brahmic scripts in 128 character unicode blocks.
|
12
|
+
BRAHMIC_SCRIPTS_ORDER = %i[devanagari bengali gurmukhi gujarati oriya tamil telugu kannada malayalam].freeze
|
23
13
|
|
24
14
|
# Match on special Roman characters
|
25
|
-
RE_IAST_OR_KOLKATA_ONLY = /[
|
15
|
+
RE_IAST_OR_KOLKATA_ONLY = /[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻĀĪŪṚṜḶḸĒŌṂḤṄÑṬḌṆŚṢḺ]|[aiueoAIUEO]\u0304|[rlRL]\u0323\u0304?|[mhtdMHTD]\u0323|[nN][\u0307\u0303\u0323]|[sS][\u0301\u0323]|[lL]\u0331/
|
26
16
|
|
27
17
|
# Match on Kolkata-specific Roman characters
|
28
|
-
RE_KOLKATA_ONLY = /[
|
18
|
+
RE_KOLKATA_ONLY = /[ēōĒŌ]|[eoEO]\u0304/
|
29
19
|
|
30
20
|
# Match on ITRANS-only
|
31
21
|
RE_ITRANS_ONLY = /ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a/
|
@@ -45,7 +35,7 @@ module Sanscript
|
|
45
35
|
# Match ##...## or {#...#} control blocks.
|
46
36
|
RE_CONTROL_BLOCK = /(?<!\\)##.*?(?<!\\)##|(?<!\\)\{#.*?(?<!\\)#\}/
|
47
37
|
|
48
|
-
private_constant :RE_BRAHMIC_RANGE, :
|
38
|
+
private_constant :RE_BRAHMIC_RANGE, :BRAHMIC_SCRIPTS_ORDER, :RE_IAST_OR_KOLKATA_ONLY,
|
49
39
|
:RE_KOLKATA_ONLY, :RE_ITRANS_ONLY, :RE_SLP1_ONLY, :RE_VELTHUIS_ONLY,
|
50
40
|
:RE_ITRANS_OR_VELTHUIS_ONLY, :RE_HARVARD_KYOTO, :RE_CONTROL_BLOCK
|
51
41
|
end
|
@@ -14,11 +14,8 @@ module Sanscript
|
|
14
14
|
text = text.to_str.gsub(RE_CONTROL_BLOCK, "")
|
15
15
|
|
16
16
|
# Brahmic schemes are all within a specific range of code points.
|
17
|
-
|
18
|
-
|
19
|
-
return script if regex.match?(text)
|
20
|
-
end
|
21
|
-
end
|
17
|
+
brahmic_char = text[RE_BRAHMIC_RANGE]
|
18
|
+
return BRAHMIC_SCRIPTS_ORDER[(brahmic_char.ord - 0x0900) / 0x80] if brahmic_char
|
22
19
|
|
23
20
|
# Romanizations
|
24
21
|
if RE_IAST_OR_KOLKATA_ONLY.match?(text)
|
@@ -15,11 +15,8 @@ module Sanscript
|
|
15
15
|
# rubocop:disable Style/CaseEquality
|
16
16
|
|
17
17
|
# Brahmic schemes are all within a specific range of code points.
|
18
|
-
|
19
|
-
|
20
|
-
return script if regex === text
|
21
|
-
end
|
22
|
-
end
|
18
|
+
brahmic_char = text[RE_BRAHMIC_RANGE]
|
19
|
+
return BRAHMIC_SCRIPTS_ORDER[(brahmic_char.ord - 0x0900) / 0x80] if brahmic_char
|
23
20
|
|
24
21
|
# Romanizations
|
25
22
|
if RE_IAST_OR_KOLKATA_ONLY === text
|
data/lib/sanscript/rust.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Sanscript
|
3
4
|
module_function
|
4
5
|
|
5
6
|
# Attempts to load Rust native extension.
|
6
7
|
# @return [bool] whether the extension loaded.
|
7
8
|
def rust_load!
|
9
|
+
# :nocov:
|
8
10
|
return RUST_AVAILABLE if defined?(RUST_AVAILABLE)
|
9
11
|
require "thermite/fiddle"
|
10
12
|
Thermite::Fiddle.load_module("init_rusty_sanscript",
|
@@ -13,6 +15,7 @@ module Sanscript
|
|
13
15
|
defined?(Sanscript::Rust) ? true : false
|
14
16
|
rescue Fiddle::DLError
|
15
17
|
false
|
18
|
+
# :nocov:
|
16
19
|
end
|
17
20
|
|
18
21
|
# @return [bool] the enabled status of the Rust extension
|
@@ -128,7 +128,7 @@ module Sanscript
|
|
128
128
|
add_roman_scheme(:itrans_dravidian, itrans_dravidian)
|
129
129
|
|
130
130
|
# ensure deep freeze on alternates
|
131
|
-
@all_alternates.
|
131
|
+
@all_alternates.each_value { |alternates| alternates.deep_freeze } # rubocop:disable Style/SymbolProc
|
132
132
|
end
|
133
133
|
|
134
134
|
# Transliterate from one script to another.
|
@@ -195,14 +195,14 @@ module Sanscript
|
|
195
195
|
token_lengths.push(f.length)
|
196
196
|
token_lengths.concat(alts.map(&:length))
|
197
197
|
|
198
|
-
if group == :vowel_marks || group == :virama
|
198
|
+
if group == :vowel_marks || group == :virama # rubocop:disable MultipleComparison
|
199
199
|
marks[f] = t
|
200
200
|
alts.each { |alt| marks[alt] = t }
|
201
201
|
else
|
202
202
|
letters[f] = t
|
203
203
|
alts.each { |alt| letters[alt] = t }
|
204
204
|
|
205
|
-
if group == :consonants || group == :other
|
205
|
+
if group == :consonants || group == :other # rubocop:disable MultipleComparison
|
206
206
|
consonants[f] = t
|
207
207
|
alts.each { |alt| consonants[alt] = t }
|
208
208
|
end
|
@@ -227,7 +227,7 @@ module Sanscript
|
|
227
227
|
# @param data [String] the string to transliterate
|
228
228
|
# @param map [Hash] map data generated from {#make_map}
|
229
229
|
# @return [String] the transliterated string
|
230
|
-
def transliterate_roman(data, map, options = {})
|
230
|
+
def transliterate_roman(data, map, options = {}) # rubocop:disable MethodLength, CyclomaticComplexity
|
231
231
|
data = data.to_str.chars
|
232
232
|
buf = []
|
233
233
|
token_buffer = []
|
@@ -239,7 +239,7 @@ module Sanscript
|
|
239
239
|
until data.empty? && token_buffer.empty?
|
240
240
|
# Match all token substrings to our map.
|
241
241
|
token = data[0, max_token_length].join("")
|
242
|
-
max_token_length.downto(1) do |j|
|
242
|
+
max_token_length.downto(1) do |j| # rubocop:disable BlockLength
|
243
243
|
token = token[0, j] unless j == max_token_length
|
244
244
|
if j == 2
|
245
245
|
if !control_char && token == "##"
|
@@ -302,7 +302,7 @@ module Sanscript
|
|
302
302
|
# @param data [String] the string to transliterate
|
303
303
|
# @param map [Hash] map data generated from {#make_map}
|
304
304
|
# @return [String] the transliterated string
|
305
|
-
def transliterate_brahmic(data, map)
|
305
|
+
def transliterate_brahmic(data, map) # rubocop:disable MethodLength, CyclomaticComplexity
|
306
306
|
data = data.to_str.chars
|
307
307
|
buf = []
|
308
308
|
had_roman_consonant = false
|
data/lib/sanscript/version.rb
CHANGED
data/sanscript.gemspec
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
# coding: utf-8
|
3
|
+
|
3
4
|
lib = File.expand_path("../lib", __FILE__)
|
4
5
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
6
|
require "sanscript/version"
|
@@ -22,16 +23,16 @@ Gem::Specification.new do |spec|
|
|
22
23
|
|
23
24
|
spec.required_ruby_version = "~> 2.2"
|
24
25
|
|
26
|
+
spec.add_development_dependency "benchmark-ips", "~> 2.7"
|
25
27
|
spec.add_development_dependency "bundler", "~> 1.13"
|
26
|
-
spec.add_development_dependency "rake", "~> 12"
|
27
|
-
spec.add_development_dependency "rspec", "~> 3.5"
|
28
28
|
spec.add_development_dependency "coveralls", "~> 0.8"
|
29
|
-
spec.add_development_dependency "rubocop", "~> 0.43"
|
30
|
-
spec.add_development_dependency "rubocop-rspec", "~> 1.7"
|
31
29
|
spec.add_development_dependency "pry", "~> 0.10"
|
32
|
-
spec.add_development_dependency "
|
30
|
+
spec.add_development_dependency "rake", "~> 12"
|
31
|
+
spec.add_development_dependency "rspec", "~> 3.5"
|
32
|
+
spec.add_development_dependency "rubocop", "~> 0.51"
|
33
|
+
spec.add_development_dependency "rubocop-rspec", "~> 1.20"
|
33
34
|
spec.add_development_dependency "yard", "~> 0.9"
|
34
35
|
|
35
|
-
spec.add_runtime_dependency "ragabash", "~> 0.
|
36
|
-
spec.add_runtime_dependency "thermite", "~> 0.
|
36
|
+
spec.add_runtime_dependency "ragabash", "~> 0.3"
|
37
|
+
spec.add_runtime_dependency "thermite", "~> 0.13"
|
37
38
|
end
|
data/src/detect/mod.rs
CHANGED
@@ -14,13 +14,13 @@ lazy_static! {
|
|
14
14
|
|
15
15
|
// Match any character in the block of Brahmic scripts
|
16
16
|
// between Devanagari and Malayalam.
|
17
|
-
static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{
|
17
|
+
static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0D7F}]").unwrap();
|
18
18
|
|
19
19
|
// Match on special Roman characters
|
20
|
-
static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"
|
20
|
+
static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻĀĪŪṚṜḶḸĒŌṂḤṄÑṬḌṆŚṢḺ]|[aiueoAIUEO]\x{0304}|[rlRL]\x{0323}\x{0304}?|[mhtdMHTD]\x{0323}|[nN][\x{0307}\x{0303}\x{0323}]|[sS][\x{0301}\x{0323}]|[lL]\x{0331}").unwrap();
|
21
21
|
|
22
22
|
// Match on Kolkata-specific Roman characters
|
23
|
-
static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"
|
23
|
+
static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"[ēōĒŌ]|[eoEO]\x{0304}").unwrap();
|
24
24
|
|
25
25
|
// Match on ITRANS-only
|
26
26
|
static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
|
@@ -44,53 +44,47 @@ lazy_static! {
|
|
44
44
|
#[no_mangle]
|
45
45
|
pub extern fn detect_scheme(s: &str) -> usize {
|
46
46
|
// Clean-up string of control characters.
|
47
|
-
let r_str = &
|
48
|
-
|
47
|
+
let r_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(s, "");
|
48
|
+
let r_str = &RE_CONTROL_BLOCK.replace_all(r_str, "");
|
49
49
|
|
50
50
|
// Brahmic schemes are all within a specific range of code points.
|
51
51
|
let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
|
52
52
|
if brahmic_match != None {
|
53
53
|
let brahmic_match = brahmic_match.unwrap();
|
54
|
-
let brahmic_codepoint = r_str.chars().nth(brahmic_match.
|
54
|
+
let brahmic_codepoint = r_str.chars().nth(brahmic_match.start()).unwrap() as usize;
|
55
55
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
return 6; // Tamil
|
68
|
-
} else if brahmic_codepoint < 0x0C80 {
|
69
|
-
return 7; // Telugu
|
70
|
-
} else if brahmic_codepoint < 0x0D00 {
|
71
|
-
return 8; // Kannada
|
72
|
-
} else {
|
73
|
-
return 9; // Malayalam
|
56
|
+
return match brahmic_codepoint {
|
57
|
+
0x0900...0x097F => 1, // Devanagari
|
58
|
+
0x0980...0x09FF => 2, // Bengali
|
59
|
+
0x0A00...0x0A7F => 3, // Gurmukhi
|
60
|
+
0x0A80...0x0AFF => 4, // Gujarati
|
61
|
+
0x0B00...0x0B7F => 5, // Oriya
|
62
|
+
0x0B80...0x0BFF => 6, // Tamil
|
63
|
+
0x0C00...0x0C7F => 7, // Telugu
|
64
|
+
0x0C80...0x0CFF => 8, // Kannada
|
65
|
+
0x0D00...0x0D7F => 9, // Malayalam
|
66
|
+
_ => 0
|
74
67
|
}
|
75
68
|
}
|
76
69
|
|
77
70
|
// Romanizations
|
78
|
-
if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
|
71
|
+
return if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
|
79
72
|
if RE_KOLKATA_ONLY.is_match(r_str) {
|
80
|
-
|
73
|
+
11 // Kolkata
|
81
74
|
} else {
|
82
|
-
|
75
|
+
10 // IAST
|
83
76
|
}
|
84
77
|
} else if RE_ITRANS_ONLY.is_match(r_str) {
|
85
|
-
|
78
|
+
12 // ITRANS
|
86
79
|
} else if RE_SLP1_ONLY.is_match(r_str) {
|
87
|
-
|
80
|
+
13 // SLP1
|
88
81
|
} else if RE_VELTHUIS_ONLY.is_match(r_str) {
|
89
|
-
|
82
|
+
14 // Velthuis
|
90
83
|
} else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
|
91
|
-
|
84
|
+
12 // ITRANS
|
92
85
|
} else if RE_HARVARD_KYOTO.is_match(r_str) {
|
93
|
-
|
86
|
+
15 // HK
|
87
|
+
} else {
|
88
|
+
0 // Unknown
|
94
89
|
}
|
95
|
-
return 0; // Unknown
|
96
90
|
}
|
data/src/lib.rs
CHANGED
@@ -4,3 +4,33 @@ extern crate regex;
|
|
4
4
|
|
5
5
|
#[macro_use] mod rb;
|
6
6
|
pub mod detect;
|
7
|
+
|
8
|
+
#[cfg(test)]
|
9
|
+
mod tests {
|
10
|
+
use detect::detect_scheme;
|
11
|
+
|
12
|
+
#[test]
|
13
|
+
fn detect_devanagari() {
|
14
|
+
assert_eq!(detect_scheme("नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"), 1);
|
15
|
+
}
|
16
|
+
|
17
|
+
#[test]
|
18
|
+
fn detect_malayalam() {
|
19
|
+
assert_eq!(detect_scheme("നാനാശാസ്ത്രസുഭാഷിതാമൃതരസൈഃ ശ്രോത്രോത്സവം കുര്വതാം യേഷാം യാന്തി ദിനാനി പണ്ഡിതജനവ്യായാമഖിന്നാത്മനാമ് തേഷാം ജന്മ ച ജീവിതം ച സുകൃതം തൈര് ഏവ ഭൂര് ഭൂഷിതാ ശേഷൈഹ് കിം പശുവദ് വിവേകരഹിതൈര് ഭൂഭാരഭൂതൈര് നരഃ"), 9);
|
20
|
+
}
|
21
|
+
|
22
|
+
#[test]
|
23
|
+
fn detect_iast() {
|
24
|
+
assert_eq!(detect_scheme("nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"), 10);
|
25
|
+
}
|
26
|
+
|
27
|
+
#[test]
|
28
|
+
fn detect_slp1() {
|
29
|
+
assert_eq!(detect_scheme("nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"), 13);
|
30
|
+
}
|
31
|
+
|
32
|
+
#[test]
|
33
|
+
fn detect_hk() {
|
34
|
+
assert_eq!(detect_scheme("nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"), 15);
|
35
|
+
}
|
36
|
+
}
|
metadata
CHANGED
@@ -1,127 +1,127 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-11-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: benchmark-ips
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.7'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.7'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '1.13'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '1.13'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: coveralls
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '0.8'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '0.8'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: pry
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0.
|
61
|
+
version: '0.10'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0.
|
68
|
+
version: '0.10'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '12'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '12'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: rspec
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
89
|
+
version: '3.5'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
96
|
+
version: '3.5'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: rubocop
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '0.
|
103
|
+
version: '0.51'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '0.
|
110
|
+
version: '0.51'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
112
|
+
name: rubocop-rspec
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
117
|
+
version: '1.20'
|
118
118
|
type: :development
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: '
|
124
|
+
version: '1.20'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: yard
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,28 +142,28 @@ dependencies:
|
|
142
142
|
requirements:
|
143
143
|
- - "~>"
|
144
144
|
- !ruby/object:Gem::Version
|
145
|
-
version: '0.
|
145
|
+
version: '0.3'
|
146
146
|
type: :runtime
|
147
147
|
prerelease: false
|
148
148
|
version_requirements: !ruby/object:Gem::Requirement
|
149
149
|
requirements:
|
150
150
|
- - "~>"
|
151
151
|
- !ruby/object:Gem::Version
|
152
|
-
version: '0.
|
152
|
+
version: '0.3'
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
154
|
name: thermite
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
157
|
- - "~>"
|
158
158
|
- !ruby/object:Gem::Version
|
159
|
-
version: '0.
|
159
|
+
version: '0.13'
|
160
160
|
type: :runtime
|
161
161
|
prerelease: false
|
162
162
|
version_requirements: !ruby/object:Gem::Requirement
|
163
163
|
requirements:
|
164
164
|
- - "~>"
|
165
165
|
- !ruby/object:Gem::Version
|
166
|
-
version: '0.
|
166
|
+
version: '0.13'
|
167
167
|
description:
|
168
168
|
email:
|
169
169
|
- nomoon@phoebus.ca
|
@@ -222,7 +222,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
222
222
|
version: '0'
|
223
223
|
requirements: []
|
224
224
|
rubyforge_project:
|
225
|
-
rubygems_version: 2.
|
225
|
+
rubygems_version: 2.7.2
|
226
226
|
signing_key:
|
227
227
|
specification_version: 4
|
228
228
|
summary: Ruby port and extension of Sanscript.js transliterator by learnsanskrit.org
|