sanscript 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sanscript/benchmark.rb +49 -23
- data/lib/sanscript/detect.rb +14 -11
- data/lib/sanscript/transliterate.rb +27 -11
- data/lib/sanscript/transliterate/schemes.rb +7 -1
- data/lib/sanscript/version.rb +1 -1
- data/sanscript.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e9133bd05cf7deb7c03cec7e4c9d46bac3eabaf
|
4
|
+
data.tar.gz: e36b9dfe888ee1070d674d72a766130527ce603b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 651f1f92402d0b8507ffeda3df3d93ca9c16316f37f0903cb4034efda2a1e5d92df9c9f40810e34f43466de84e15c22d771f73404248811846b6249f5eefa4ba
|
7
|
+
data.tar.gz: c79cc7021d15d791663faf6d3895bea4e80981c5f6061e780d4d38e057df3f6b1e89ff4eab5fcf8298a889f15d2c9289cf49074a7206fbd2ecb4c75444a32c7b
|
data/lib/sanscript/benchmark.rb
CHANGED
@@ -1,53 +1,79 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require "sanscript/refinements"
|
4
|
-
|
4
|
+
begin
|
5
|
+
require "benchmark/ips"
|
6
|
+
rescue LoadError
|
7
|
+
module Benchmark
|
8
|
+
def self.ips(*)
|
9
|
+
raise NotImplementedError, "You must install the `benchmark-ips` gem first."
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
5
13
|
|
6
14
|
module Sanscript
|
7
15
|
using Refinements
|
8
16
|
module Benchmark
|
9
17
|
module_function
|
10
18
|
|
11
|
-
def
|
12
|
-
n = 100_000
|
19
|
+
def detect!
|
13
20
|
iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
|
14
21
|
deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
|
22
|
+
slp1_string = "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"
|
23
|
+
hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
|
24
|
+
malayalam_string = "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ എ ഏ ഐ ഒ ഓ ഔ"
|
15
25
|
|
16
|
-
::Benchmark.
|
26
|
+
::Benchmark.ips do |x|
|
27
|
+
x.config(time: 5, warmup: 1)
|
28
|
+
x.report("Detect Devanagari") do
|
29
|
+
Sanscript::Detect.detect_script(deva_string) == :devanagari
|
30
|
+
end
|
31
|
+
x.report("Detect Malayalam") do
|
32
|
+
Sanscript::Detect.detect_script(malayalam_string) == :malayalam
|
33
|
+
end
|
17
34
|
x.report("Detect IAST") do
|
18
|
-
|
35
|
+
Sanscript::Detect.detect_script(iast_string) == :iast
|
19
36
|
end
|
20
|
-
x.report("Detect
|
21
|
-
|
37
|
+
x.report("Detect SLP1") do
|
38
|
+
Sanscript::Detect.detect_script(slp1_string) == :slp1
|
22
39
|
end
|
40
|
+
x.report("Detect HK") do
|
41
|
+
Sanscript::Detect.detect_script(hk_string) == :hk
|
42
|
+
end
|
43
|
+
x.compare!
|
23
44
|
end
|
45
|
+
true
|
24
46
|
end
|
25
47
|
|
26
|
-
def
|
27
|
-
n = 5_000
|
48
|
+
def transliterate!
|
28
49
|
iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
|
29
|
-
|
30
50
|
deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
51
|
+
slp1_string = "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"
|
52
|
+
|
53
|
+
::Benchmark.ips do |x|
|
54
|
+
x.config(time: 5, warmup: 2)
|
55
|
+
|
35
56
|
x.report("IAST==>Devanagari") do
|
36
|
-
|
37
|
-
end
|
38
|
-
x.report("IAST**>SLP1") do
|
39
|
-
n.times { Sanscript.transliterate(iast_string, :slp1) }
|
57
|
+
Sanscript.transliterate(iast_string, :iast, :devanagari) == deva_string
|
40
58
|
end
|
41
59
|
x.report("IAST==>SLP1") do
|
42
|
-
|
60
|
+
Sanscript.transliterate(iast_string, :iast, :slp1) == slp1_string
|
61
|
+
end
|
62
|
+
x.report("SLP1==>Devanagari") do
|
63
|
+
Sanscript.transliterate(slp1_string, :slp1, :devanagari) == deva_string
|
64
|
+
end
|
65
|
+
x.report("SLP1==>IAST") do
|
66
|
+
Sanscript.transliterate(slp1_string, :slp1, :iast) == iast_string
|
43
67
|
end
|
44
|
-
x.report("Devanagari
|
45
|
-
|
68
|
+
x.report("Devanagari==>SLP1") do
|
69
|
+
Sanscript.transliterate(deva_string, :devanagari, :slp1) == slp1_string
|
46
70
|
end
|
47
|
-
x.report("Devanagari
|
48
|
-
|
71
|
+
x.report("Devanagari==>IAST") do
|
72
|
+
Sanscript.transliterate(deva_string, :devanagari, :iast) == iast_string
|
49
73
|
end
|
74
|
+
x.compare!
|
50
75
|
end
|
76
|
+
true
|
51
77
|
end
|
52
78
|
end
|
53
79
|
end
|
data/lib/sanscript/detect.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
# rubocop:disable Style/CaseEquality
|
3
|
+
|
2
4
|
#
|
3
5
|
# Developed from code available @ https://github.com/sanskrit/detect.js
|
4
6
|
#
|
@@ -31,7 +33,7 @@ module Sanscript
|
|
31
33
|
RE_ITRANS_ONLY = /ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a/
|
32
34
|
|
33
35
|
# Match on SLP1-only characters and bigrams
|
34
|
-
RE_SLP1_ONLY = /[fFxXEOCYwWqQPB]|kz|
|
36
|
+
RE_SLP1_ONLY = /[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]/
|
35
37
|
|
36
38
|
# Match on Velthuis-only characters
|
37
39
|
RE_VELTHUIS_ONLY = /\.[mhnrlntds]|"n|~s/
|
@@ -39,7 +41,7 @@ module Sanscript
|
|
39
41
|
# Match on chars shared by ITRANS and Velthuis
|
40
42
|
RE_ITRANS_OR_VELTHUIS_ONLY = /aa|ii|uu|~n/
|
41
43
|
|
42
|
-
# Match on characters
|
44
|
+
# Match on characters available in Harvard-Kyoto
|
43
45
|
RE_HARVARD_KYOTO = /[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]/
|
44
46
|
|
45
47
|
private_constant :RE_BRAHMIC_RANGE, :RE_BRAHMIC_SCRIPTS, :RE_IAST_OR_KOLKATA_ONLY,
|
@@ -50,24 +52,25 @@ module Sanscript
|
|
50
52
|
|
51
53
|
def detect_script(text)
|
52
54
|
# Brahmic schemes are all within a specific range of code points.
|
53
|
-
if
|
55
|
+
if RE_BRAHMIC_RANGE === text
|
54
56
|
RE_BRAHMIC_SCRIPTS.each do |script, regex|
|
55
|
-
return script if
|
57
|
+
return script if regex === text
|
56
58
|
end
|
57
59
|
end
|
58
60
|
|
59
61
|
# Romanizations
|
60
|
-
if
|
61
|
-
|
62
|
-
|
62
|
+
if RE_IAST_OR_KOLKATA_ONLY === text
|
63
|
+
return :kolkata if RE_KOLKATA_ONLY === text
|
64
|
+
:iast
|
65
|
+
elsif RE_ITRANS_ONLY === text
|
63
66
|
:itrans
|
64
|
-
elsif
|
67
|
+
elsif RE_SLP1_ONLY === text
|
65
68
|
:slp1
|
66
|
-
elsif
|
69
|
+
elsif RE_VELTHUIS_ONLY === text
|
67
70
|
:velthuis
|
68
|
-
elsif
|
71
|
+
elsif RE_ITRANS_OR_VELTHUIS_ONLY === text
|
69
72
|
:itrans
|
70
|
-
elsif
|
73
|
+
elsif RE_HARVARD_KYOTO === text
|
71
74
|
:hk
|
72
75
|
else
|
73
76
|
:unknown
|
data/lib/sanscript/transliterate.rb
CHANGED
@@ -14,7 +14,8 @@ module Sanscript
|
|
14
14
|
using Refinements
|
15
15
|
module Transliterate
|
16
16
|
class << self
|
17
|
-
attr_reader :defaults, :schemes, :
|
17
|
+
attr_reader :defaults, :schemes, :scheme_names, :brahmic_schemes, :roman_schemes,
|
18
|
+
:all_alternates
|
18
19
|
end
|
19
20
|
|
20
21
|
@defaults = {
|
@@ -27,12 +28,13 @@ module Sanscript
|
|
27
28
|
module_function
|
28
29
|
|
29
30
|
#
|
30
|
-
#
|
31
|
+
# Check whether the given scheme encodes Brahmic Sanskrit.
|
31
32
|
#
|
32
|
-
# @
|
33
|
+
# @param name the scheme name
|
34
|
+
# @return boolean
|
33
35
|
#
|
34
|
-
def
|
35
|
-
@
|
36
|
+
def brahmic_scheme?(name)
|
37
|
+
@brahmic_schemes.include?(name.to_sym)
|
36
38
|
end
|
37
39
|
|
38
40
|
#
|
@@ -64,7 +66,12 @@ module Sanscript
|
|
64
66
|
# described above.
|
65
67
|
#
|
66
68
|
def add_brahmic_scheme(name, scheme)
|
67
|
-
|
69
|
+
name = name.to_sym
|
70
|
+
scheme = scheme.deep_dup
|
71
|
+
@schemes[name] = scheme.deep_freeze
|
72
|
+
@brahmic_schemes.add(name)
|
73
|
+
@scheme_names.add(name)
|
74
|
+
scheme
|
68
75
|
end
|
69
76
|
|
70
77
|
#
|
@@ -82,6 +89,8 @@ module Sanscript
|
|
82
89
|
scheme[:vowel_marks] = scheme[:vowels][1..-1] unless scheme.key?(:vowel_marks)
|
83
90
|
@schemes[name] = scheme.deep_freeze
|
84
91
|
@roman_schemes.add(name)
|
92
|
+
@scheme_names.add(name)
|
93
|
+
scheme
|
85
94
|
end
|
86
95
|
|
87
96
|
#
|
@@ -93,15 +102,23 @@ module Sanscript
|
|
93
102
|
|
94
103
|
# Set up various schemes
|
95
104
|
begin
|
105
|
+
# Re-add existing Brahmic schemes in order to add them to `scheme_names`
|
106
|
+
# and to freeze them up.
|
107
|
+
brahmic_scheme_names = %i[bengali devanagari gujarati gurmukhi kannada malayalam
|
108
|
+
oriya tamil telugu]
|
109
|
+
brahmic_scheme_names.each do |name|
|
110
|
+
add_brahmic_scheme(name, @schemes[name])
|
111
|
+
end
|
112
|
+
|
96
113
|
# Set up roman schemes
|
97
114
|
kolkata = @schemes[:kolkata] = @schemes[:iast].deep_dup
|
98
|
-
|
115
|
+
roman_scheme_names = %i[iast itrans hk kolkata slp1 velthuis wx]
|
99
116
|
kolkata[:vowels] = %w[a ā i ī u ū ṛ ṝ ḷ ḹ e ē ai o ō au]
|
100
117
|
|
101
118
|
# These schemes already belong to Sanscript.schemes. But by adding
|
102
|
-
# them again with `
|
119
|
+
# them again with `add_roman_scheme`, we automatically build up
|
103
120
|
# `roman_schemes` and define a `vowel_marks` field for each one.
|
104
|
-
|
121
|
+
roman_scheme_names.each do |name|
|
105
122
|
add_roman_scheme(name, @schemes[name])
|
106
123
|
end
|
107
124
|
|
@@ -112,8 +129,7 @@ module Sanscript
|
|
112
129
|
@all_alternates[:itrans_dravidian] = @all_alternates[:itrans]
|
113
130
|
add_roman_scheme(:itrans_dravidian, itrans_dravidian)
|
114
131
|
|
115
|
-
# ensure deep freeze on
|
116
|
-
@schemes.each { |_, scheme| scheme.deep_freeze }
|
132
|
+
# ensure deep freeze on alternates
|
117
133
|
@all_alternates.each { |_, scheme| scheme.deep_freeze }
|
118
134
|
end
|
119
135
|
|
data/lib/sanscript/transliterate/schemes.rb
CHANGED
@@ -275,9 +275,15 @@ module Sanscript
|
|
275
275
|
},
|
276
276
|
}
|
277
277
|
|
278
|
-
# Set of names of schemes
|
278
|
+
# Set of names of Roman schemes
|
279
279
|
@roman_schemes = Set.new
|
280
280
|
|
281
|
+
# Set of names of Brahmic schemes
|
282
|
+
@brahmic_schemes = Set.new
|
283
|
+
|
284
|
+
# Set of names of all schemes
|
285
|
+
@scheme_names = Set.new
|
286
|
+
|
281
287
|
# Map of alternate encodings.
|
282
288
|
@all_alternates = {
|
283
289
|
itrans: {
|
data/lib/sanscript/version.rb
CHANGED
data/sanscript.gemspec
CHANGED
@@ -24,6 +24,7 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_development_dependency "rspec", "~> 3.5"
|
25
25
|
spec.add_development_dependency "codeclimate-test-reporter", "~> 0.6"
|
26
26
|
spec.add_development_dependency "pry", "~> 0.10"
|
27
|
+
spec.add_development_dependency "benchmark-ips", "~> 2.6"
|
27
28
|
|
28
29
|
spec.add_runtime_dependency "ice_nine", "~> 0.11"
|
29
30
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-07-
|
11
|
+
date: 2016-07-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0.10'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: benchmark-ips
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '2.6'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '2.6'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: ice_nine
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|