sanscript 0.8.1 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: c50d574983e399ea044b60c2d8ece2a624d561ca
4
- data.tar.gz: 7b8b068883034401079c3671f6ca60b9f777d5df
2
+ SHA256:
3
+ metadata.gz: ef6e9b7e12549b07177b807613493b8bbf5df8dab1d762d6f2cbb5db65c3a710
4
+ data.tar.gz: 9e02c19c6189ca48c95a74b6448cb5f0a2be25fa5fabea8a39082a52d4a8bb41
5
5
  SHA512:
6
- metadata.gz: 39a77f3cc096bb673ad499539f7e8bae0e6300c5401c091391b3dd1e3305f8d7ec278d5dd16b19cf8a9ac1e4144bd3f606ff07e5d1028f198b499585bab893e9
7
- data.tar.gz: 8b9a1fbe21223ca924c62e83c487cd3ea2be68735cee2207b66970644f6a3a8c307173b8287c916286bcddc1e989867abd775b4a43e4a31b14c36bf2882b29d8
6
+ metadata.gz: 11f53ad681749ca4aed397e3401f40c9f97ef1e3f654d3f23c9dde407636739817d6af962f9ed09d18a4375021822e5fbbb99b38790b011986e362e2c4758a8f
7
+ data.tar.gz: 1296c18342ff547c831549e3b17412541a7afe18c471e6f0e1867b1f0bb0c24bc3c00e52a5e0d69241936f3ebb576504dcc5cfea4f8fcfc1f1a4484576578fc2
data/.gitignore CHANGED
@@ -8,6 +8,7 @@
8
8
  /spec/reports/
9
9
  /tmp/
10
10
  /lib/librusty_sanscript.*
11
+ /lib/rusty_sanscript.*
11
12
  /target
12
13
  mkmf.log
13
14
  Cargo.lock
@@ -1,6 +1,5 @@
1
- require: rubocop-rspec
2
1
  AllCops:
3
- TargetRubyVersion: 2.3
2
+ TargetRubyVersion: 2.2
4
3
 
5
4
  # Metrics
6
5
  Metrics/AbcSize:
@@ -17,6 +16,10 @@ Metrics/BlockNesting:
17
16
  Enabled: true
18
17
  Max: 4
19
18
 
19
+ Metrics/BlockLength:
20
+ Exclude:
21
+ - spec/**/*.rb
22
+
20
23
  Metrics/ClassLength:
21
24
  Description: 'Avoid classes longer than 250 lines of code.'
22
25
  Enabled: true
@@ -96,3 +99,12 @@ Style/TrailingCommaInLiteral:
96
99
  Style/TrivialAccessors:
97
100
  ExactNameMatch: true
98
101
  AllowPredicates: true
102
+
103
+ # RSpec
104
+ require: rubocop-rspec
105
+
106
+ RSpec/ContextWording:
107
+ Enabled: false
108
+
109
+ RSpec/NestedGroups:
110
+ Max: 3
@@ -1,9 +1,9 @@
1
1
  sudo: false
2
2
  language: ruby
3
3
  rvm:
4
- - 2.4.0
5
- - 2.3.3
6
- - 2.2.6
4
+ - 2.4.2
5
+ - 2.3.5
6
+ - 2.2.8
7
7
  - ruby-head
8
8
  env:
9
9
  - RUST=yes
data/Cargo.toml CHANGED
@@ -1,13 +1,13 @@
1
1
  [package]
2
2
  name = "rusty_sanscript"
3
- version = "0.4.0"
3
+ version = "0.4.1"
4
4
  authors = ["Tim Bellefleur <nomoon@phoebus.ca>"]
5
5
  publish = false
6
6
 
7
7
  [dependencies]
8
- lazy_static = "^0.2.2"
9
- ruby-sys = "^0.2.17"
10
- regex = "^0.1.80"
8
+ lazy_static = "^0.2.10"
9
+ ruby-sys = "^0.3.0"
10
+ regex = "^0.2.2"
11
11
 
12
12
  [lib]
13
13
  crate-type = ["cdylib"]
data/Gemfile CHANGED
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  source "https://rubygems.org"
3
4
 
4
5
  # Specify your gem's dependencies in sanscript.gemspec
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  require "bundler/gem_tasks"
3
4
 
4
5
  require "thermite/tasks"
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  require "ragabash"
3
4
 
4
5
  require "sanscript/version"
@@ -5,7 +5,7 @@ module Sanscript
5
5
  require "benchmark/ips"
6
6
  rescue LoadError
7
7
  #:nocov:
8
- module ::Benchmark
8
+ module ::Benchmark # rubocop:disable Style/ClassAndModuleChildren
9
9
  def self.ips(*)
10
10
  raise NotImplementedError, "You must install the `benchmark-ips` gem first."
11
11
  end
@@ -15,32 +15,32 @@ module Sanscript
15
15
 
16
16
  # Benchmark/testing module.
17
17
  module Benchmark
18
+ TEST_STRINGS = {
19
+ brahmic: {
20
+ devanagari: "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः",
21
+ malayalam: "നാനാശാസ്ത്രസുഭാഷിതാമൃതരസൈഃ ശ്രോത്രോത്സവം കുര്വതാം യേഷാം യാന്തി ദിനാനി പണ്ഡിതജനവ്യായാമഖിന്നാത്മനാമ് തേഷാം ജന്മ ച ജീവിതം ച സുകൃതം തൈര് ഏവ ഭൂര് ഭൂഷിതാ ശേഷൈഹ് കിം പശുവദ് വിവേകരഹിതൈര് ഭൂഭാരഭൂതൈര് നരഃ",
22
+ }.freeze,
23
+ roman: {
24
+ iast: "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ",
25
+ slp1: "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH",
26
+ hk: "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH",
27
+ }.freeze,
28
+ }.freeze
29
+
30
+ TEST_STRINGS_FLAT = TEST_STRINGS.reduce({}) { |a, (_, v)| a.merge(v) }.freeze
31
+
32
+ private_constant :TEST_STRINGS, :TEST_STRINGS_FLAT
33
+
18
34
  module_function
19
35
 
20
36
  # Runs benchmark-ips test on detection methods.
21
37
  def detect!(time = 2, warmup = 1)
22
- deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
23
- malayalam_string = "നാനാശാസ്ത്രസുഭാഷിതാമൃതരസൈഃ ശ്രോത്രോത്സവം കുര്വതാം യേഷാം യാന്തി ദിനാനി പണ്ഡിതജനവ്യായാമഖിന്നാത്മനാമ് തേഷാം ജന്മ ച ജീവിതം ച സുകൃതം തൈര് ഏവ ഭൂര് ഭൂഷിതാ ശേഷൈഹ് കിം പശുവദ് വിവേകരഹിതൈര് ഭൂഭാരഭൂതൈര് നരഃ"
24
- iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
25
- slp1_string = "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"
26
- hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
27
-
28
38
  ::Benchmark.ips do |x|
29
39
  x.config(time: time, warmup: warmup)
30
- x.report("Detect Devanagari") do
31
- Sanscript::Detect.detect_scheme(deva_string)
32
- end
33
- x.report("Detect Malayalam") do
34
- Sanscript::Detect.detect_scheme(malayalam_string)
35
- end
36
- x.report("Detect IAST") do
37
- Sanscript::Detect.detect_scheme(iast_string)
38
- end
39
- x.report("Detect SLP1") do
40
- Sanscript::Detect.detect_scheme(slp1_string)
41
- end
42
- x.report("Detect HK") do
43
- Sanscript::Detect.detect_scheme(hk_string)
40
+ TEST_STRINGS_FLAT.each do |scheme, string|
41
+ x.report("Detect #{scheme}") do
42
+ Sanscript::Detect.detect_scheme(string)
43
+ end
44
44
  end
45
45
  x.compare!
46
46
  end
@@ -49,39 +49,13 @@ module Sanscript
49
49
 
50
50
  # Runs benchmark-ips test on roman-source transliteration methods.
51
51
  def transliterate_roman!(time = 2, warmup = 1)
52
- iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
53
- slp1_string = "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"
54
- hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
55
-
56
52
  ::Benchmark.ips do |x|
57
53
  x.config(time: time, warmup: warmup)
58
-
59
- x.report("IAST==>Devanagari") do
60
- Sanscript.transliterate(iast_string, :iast, :devanagari)
61
- end
62
- x.report("IAST==>SLP1") do
63
- Sanscript.transliterate(iast_string, :iast, :slp1)
64
- end
65
- x.report("IAST==>SLP1") do
66
- Sanscript.transliterate(iast_string, :iast, :hk)
67
- end
68
- x.report("SLP1==>Devanagari") do
69
- Sanscript.transliterate(slp1_string, :slp1, :devanagari)
70
- end
71
- x.report("SLP1==>IAST") do
72
- Sanscript.transliterate(slp1_string, :slp1, :iast)
73
- end
74
- x.report("SLP1==>HK") do
75
- Sanscript.transliterate(slp1_string, :slp1, :hk)
76
- end
77
- x.report("HK==>Devanagari") do
78
- Sanscript.transliterate(hk_string, :hk, :devanagari)
79
- end
80
- x.report("HK==>IAST") do
81
- Sanscript.transliterate(hk_string, :hk, :iast)
82
- end
83
- x.report("HK==>SLP1") do
84
- Sanscript.transliterate(hk_string, :hk, :slp1)
54
+ TEST_STRINGS[:roman].to_a.product(TEST_STRINGS_FLAT.keys).each do |(ak, av), bk|
55
+ next if ak == bk
56
+ x.report("#{ak} => #{bk}") do
57
+ Sanscript.transliterate(av, ak, bk)
58
+ end
85
59
  end
86
60
  x.compare!
87
61
  end
@@ -90,18 +64,13 @@ module Sanscript
90
64
 
91
65
  # Runs benchmark-ips test on brahmic-source transliteration methods.
92
66
  def transliterate_brahmic!(time = 2, warmup = 1)
93
- deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
94
-
95
67
  ::Benchmark.ips do |x|
96
68
  x.config(time: time, warmup: warmup)
97
- x.report("Devanagari==>IAST") do
98
- Sanscript.transliterate(deva_string, :devanagari, :iast)
99
- end
100
- x.report("Devanagari==>SLP1") do
101
- Sanscript.transliterate(deva_string, :devanagari, :slp1)
102
- end
103
- x.report("Devanagari==>HK") do
104
- Sanscript.transliterate(deva_string, :devanagari, :hk)
69
+ TEST_STRINGS[:brahmic].to_a.product(TEST_STRINGS_FLAT.keys).each do |(ak, av), bk|
70
+ next if ak == bk
71
+ x.report("#{ak} => #{bk}") do
72
+ Sanscript.transliterate(av, ak, bk)
73
+ end
105
74
  end
106
75
  x.compare!
107
76
  end
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  require "sanscript/detect/constants"
3
4
 
4
5
  module Sanscript
@@ -6,11 +7,13 @@ module Sanscript
6
7
  # Developed from code available @ https://github.com/sanskrit/detect.js
7
8
  module Detect
8
9
  if Regexp.method_defined?(:match?)
10
+ # :nocov:
9
11
  require "sanscript/detect/ruby24"
10
12
  extend Ruby24
11
13
  else
12
14
  require "sanscript/detect/ruby2x"
13
15
  extend Ruby2x
16
+ # :nocov:
14
17
  end
15
18
 
16
19
  # @!method detect_scheme(text)
@@ -8,24 +8,14 @@ module Sanscript
8
8
  # between Devanagari and Malayalam.
9
9
  RE_BRAHMIC_RANGE = /[\u0900-\u0d7f]/
10
10
 
11
- # Match each individual Brahmic script.
12
- RE_BRAHMIC_SCRIPTS = {
13
- devanagari: /\p{Devanagari}/,
14
- bengali: /\p{Bengali}/,
15
- gurmukhi: /\p{Gurmukhi}/,
16
- gujarati: /\p{Gujarati}/,
17
- oriya: /\p{Oriya}/,
18
- tamil: /\p{Tamil}/,
19
- telugu: /\p{Telugu}/,
20
- kannada: /\p{Kannada}/,
21
- malayalam: /\p{Malayalam}/,
22
- }.freeze
11
+ # The order of individual brahmic scripts in 128 character unicode blocks.
12
+ BRAHMIC_SCRIPTS_ORDER = %i[devanagari bengali gurmukhi gujarati oriya tamil telugu kannada malayalam].freeze
23
13
 
24
14
  # Match on special Roman characters
25
- RE_IAST_OR_KOLKATA_ONLY = /[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]/i
15
+ RE_IAST_OR_KOLKATA_ONLY = /[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻĀĪŪṚṜḶḸĒŌṂḤṄÑṬḌṆŚṢḺ]|[aiueoAIUEO]\u0304|[rlRL]\u0323\u0304?|[mhtdMHTD]\u0323|[nN][\u0307\u0303\u0323]|[sS][\u0301\u0323]|[lL]\u0331/
26
16
 
27
17
  # Match on Kolkata-specific Roman characters
28
- RE_KOLKATA_ONLY = /[ēō]/i
18
+ RE_KOLKATA_ONLY = /[ēōĒŌ]|[eoEO]\u0304/
29
19
 
30
20
  # Match on ITRANS-only
31
21
  RE_ITRANS_ONLY = /ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a/
@@ -45,7 +35,7 @@ module Sanscript
45
35
  # Match ##...## or {#...#} control blocks.
46
36
  RE_CONTROL_BLOCK = /(?<!\\)##.*?(?<!\\)##|(?<!\\)\{#.*?(?<!\\)#\}/
47
37
 
48
- private_constant :RE_BRAHMIC_RANGE, :RE_BRAHMIC_SCRIPTS, :RE_IAST_OR_KOLKATA_ONLY,
38
+ private_constant :RE_BRAHMIC_RANGE, :BRAHMIC_SCRIPTS_ORDER, :RE_IAST_OR_KOLKATA_ONLY,
49
39
  :RE_KOLKATA_ONLY, :RE_ITRANS_ONLY, :RE_SLP1_ONLY, :RE_VELTHUIS_ONLY,
50
40
  :RE_ITRANS_OR_VELTHUIS_ONLY, :RE_HARVARD_KYOTO, :RE_CONTROL_BLOCK
51
41
  end
@@ -14,11 +14,8 @@ module Sanscript
14
14
  text = text.to_str.gsub(RE_CONTROL_BLOCK, "")
15
15
 
16
16
  # Brahmic schemes are all within a specific range of code points.
17
- if RE_BRAHMIC_RANGE.match?(text)
18
- RE_BRAHMIC_SCRIPTS.each do |script, regex|
19
- return script if regex.match?(text)
20
- end
21
- end
17
+ brahmic_char = text[RE_BRAHMIC_RANGE]
18
+ return BRAHMIC_SCRIPTS_ORDER[(brahmic_char.ord - 0x0900) / 0x80] if brahmic_char
22
19
 
23
20
  # Romanizations
24
21
  if RE_IAST_OR_KOLKATA_ONLY.match?(text)
@@ -15,11 +15,8 @@ module Sanscript
15
15
  # rubocop:disable Style/CaseEquality
16
16
 
17
17
  # Brahmic schemes are all within a specific range of code points.
18
- if RE_BRAHMIC_RANGE === text
19
- RE_BRAHMIC_SCRIPTS.each do |script, regex|
20
- return script if regex === text
21
- end
22
- end
18
+ brahmic_char = text[RE_BRAHMIC_RANGE]
19
+ return BRAHMIC_SCRIPTS_ORDER[(brahmic_char.ord - 0x0900) / 0x80] if brahmic_char
23
20
 
24
21
  # Romanizations
25
22
  if RE_IAST_OR_KOLKATA_ONLY === text
@@ -1,10 +1,12 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Sanscript
3
4
  module_function
4
5
 
5
6
  # Attempts to load Rust native extension.
6
7
  # @return [bool] whether the extension loaded.
7
8
  def rust_load!
9
+ # :nocov:
8
10
  return RUST_AVAILABLE if defined?(RUST_AVAILABLE)
9
11
  require "thermite/fiddle"
10
12
  Thermite::Fiddle.load_module("init_rusty_sanscript",
@@ -13,6 +15,7 @@ module Sanscript
13
15
  defined?(Sanscript::Rust) ? true : false
14
16
  rescue Fiddle::DLError
15
17
  false
18
+ # :nocov:
16
19
  end
17
20
 
18
21
  # @return [bool] the enabled status of the Rust extension
@@ -128,7 +128,7 @@ module Sanscript
128
128
  add_roman_scheme(:itrans_dravidian, itrans_dravidian)
129
129
 
130
130
  # ensure deep freeze on alternates
131
- @all_alternates.each { |_, scheme| scheme.deep_freeze }
131
+ @all_alternates.each_value { |alternates| alternates.deep_freeze } # rubocop:disable Style/SymbolProc
132
132
  end
133
133
 
134
134
  # Transliterate from one script to another.
@@ -195,14 +195,14 @@ module Sanscript
195
195
  token_lengths.push(f.length)
196
196
  token_lengths.concat(alts.map(&:length))
197
197
 
198
- if group == :vowel_marks || group == :virama
198
+ if group == :vowel_marks || group == :virama # rubocop:disable MultipleComparison
199
199
  marks[f] = t
200
200
  alts.each { |alt| marks[alt] = t }
201
201
  else
202
202
  letters[f] = t
203
203
  alts.each { |alt| letters[alt] = t }
204
204
 
205
- if group == :consonants || group == :other
205
+ if group == :consonants || group == :other # rubocop:disable MultipleComparison
206
206
  consonants[f] = t
207
207
  alts.each { |alt| consonants[alt] = t }
208
208
  end
@@ -227,7 +227,7 @@ module Sanscript
227
227
  # @param data [String] the string to transliterate
228
228
  # @param map [Hash] map data generated from {#make_map}
229
229
  # @return [String] the transliterated string
230
- def transliterate_roman(data, map, options = {})
230
+ def transliterate_roman(data, map, options = {}) # rubocop:disable MethodLength, CyclomaticComplexity
231
231
  data = data.to_str.chars
232
232
  buf = []
233
233
  token_buffer = []
@@ -239,7 +239,7 @@ module Sanscript
239
239
  until data.empty? && token_buffer.empty?
240
240
  # Match all token substrings to our map.
241
241
  token = data[0, max_token_length].join("")
242
- max_token_length.downto(1) do |j|
242
+ max_token_length.downto(1) do |j| # rubocop:disable BlockLength
243
243
  token = token[0, j] unless j == max_token_length
244
244
  if j == 2
245
245
  if !control_char && token == "##"
@@ -302,7 +302,7 @@ module Sanscript
302
302
  # @param data [String] the string to transliterate
303
303
  # @param map [Hash] map data generated from {#make_map}
304
304
  # @return [String] the transliterated string
305
- def transliterate_brahmic(data, map)
305
+ def transliterate_brahmic(data, map) # rubocop:disable MethodLength, CyclomaticComplexity
306
306
  data = data.to_str.chars
307
307
  buf = []
308
308
  had_roman_consonant = false
@@ -1,7 +1,8 @@
1
1
  # frozen_string_literal: true
2
+
2
3
  module Sanscript
3
4
  # The version number
4
- VERSION = "0.8.1"
5
+ VERSION = "0.9.1".freeze
5
6
 
6
7
  GEM_ROOT = Pathname.new(File.realpath(File.join(__dir__, "..", "..")))
7
8
  private_constant :GEM_ROOT
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  # coding: utf-8
3
+
3
4
  lib = File.expand_path("../lib", __FILE__)
4
5
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
6
  require "sanscript/version"
@@ -22,16 +23,16 @@ Gem::Specification.new do |spec|
22
23
 
23
24
  spec.required_ruby_version = "~> 2.2"
24
25
 
26
+ spec.add_development_dependency "benchmark-ips", "~> 2.7"
25
27
  spec.add_development_dependency "bundler", "~> 1.13"
26
- spec.add_development_dependency "rake", "~> 12"
27
- spec.add_development_dependency "rspec", "~> 3.5"
28
28
  spec.add_development_dependency "coveralls", "~> 0.8"
29
- spec.add_development_dependency "rubocop", "~> 0.43"
30
- spec.add_development_dependency "rubocop-rspec", "~> 1.7"
31
29
  spec.add_development_dependency "pry", "~> 0.10"
32
- spec.add_development_dependency "benchmark-ips", "~> 2.7"
30
+ spec.add_development_dependency "rake", "~> 12"
31
+ spec.add_development_dependency "rspec", "~> 3.5"
32
+ spec.add_development_dependency "rubocop", "~> 0.51"
33
+ spec.add_development_dependency "rubocop-rspec", "~> 1.20"
33
34
  spec.add_development_dependency "yard", "~> 0.9"
34
35
 
35
- spec.add_runtime_dependency "ragabash", "~> 0.2"
36
- spec.add_runtime_dependency "thermite", "~> 0.7"
36
+ spec.add_runtime_dependency "ragabash", "~> 0.3"
37
+ spec.add_runtime_dependency "thermite", "~> 0.13"
37
38
  end
@@ -14,13 +14,13 @@ lazy_static! {
14
14
 
15
15
  // Match any character in the block of Brahmic scripts
16
16
  // between Devanagari and Malayalam.
17
- static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
17
+ static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0D7F}]").unwrap();
18
18
 
19
19
  // Match on special Roman characters
20
- static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
20
+ static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻĀĪŪṚṜḶḸĒŌṂḤṄÑṬḌṆŚṢḺ]|[aiueoAIUEO]\x{0304}|[rlRL]\x{0323}\x{0304}?|[mhtdMHTD]\x{0323}|[nN][\x{0307}\x{0303}\x{0323}]|[sS][\x{0301}\x{0323}]|[lL]\x{0331}").unwrap();
21
21
 
22
22
  // Match on Kolkata-specific Roman characters
23
- static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
23
+ static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"[ēōĒŌ]|[eoEO]\x{0304}").unwrap();
24
24
 
25
25
  // Match on ITRANS-only
26
26
  static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
@@ -44,53 +44,47 @@ lazy_static! {
44
44
  #[no_mangle]
45
45
  pub extern fn detect_scheme(s: &str) -> usize {
46
46
  // Clean-up string of control characters.
47
- let r_str = &RE_CONTROL_BLOCK.replace_all(
48
- &RE_ESCAPED_CONTROL_CHAR.replace_all(s, ""), "");
47
+ let r_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(s, "");
48
+ let r_str = &RE_CONTROL_BLOCK.replace_all(r_str, "");
49
49
 
50
50
  // Brahmic schemes are all within a specific range of code points.
51
51
  let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
52
52
  if brahmic_match != None {
53
53
  let brahmic_match = brahmic_match.unwrap();
54
- let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as usize;
54
+ let brahmic_codepoint = r_str.chars().nth(brahmic_match.start()).unwrap() as usize;
55
55
 
56
- if brahmic_codepoint < 0x0980 {
57
- return 1; // Devanagari
58
- } else if brahmic_codepoint < 0x0A00 {
59
- return 2; // Bengali
60
- } else if brahmic_codepoint < 0x0A80 {
61
- return 3; // Gurmukhi
62
- } else if brahmic_codepoint < 0x0B00 {
63
- return 4; // Gujarati
64
- } else if brahmic_codepoint < 0x0B80 {
65
- return 5; // Oriya
66
- } else if brahmic_codepoint < 0x0C00 {
67
- return 6; // Tamil
68
- } else if brahmic_codepoint < 0x0C80 {
69
- return 7; // Telugu
70
- } else if brahmic_codepoint < 0x0D00 {
71
- return 8; // Kannada
72
- } else {
73
- return 9; // Malayalam
56
+ return match brahmic_codepoint {
57
+ 0x0900...0x097F => 1, // Devanagari
58
+ 0x0980...0x09FF => 2, // Bengali
59
+ 0x0A00...0x0A7F => 3, // Gurmukhi
60
+ 0x0A80...0x0AFF => 4, // Gujarati
61
+ 0x0B00...0x0B7F => 5, // Oriya
62
+ 0x0B80...0x0BFF => 6, // Tamil
63
+ 0x0C00...0x0C7F => 7, // Telugu
64
+ 0x0C80...0x0CFF => 8, // Kannada
65
+ 0x0D00...0x0D7F => 9, // Malayalam
66
+ _ => 0
74
67
  }
75
68
  }
76
69
 
77
70
  // Romanizations
78
- if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
71
+ return if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
79
72
  if RE_KOLKATA_ONLY.is_match(r_str) {
80
- return 11; // Kolkata
73
+ 11 // Kolkata
81
74
  } else {
82
- return 10; // IAST
75
+ 10 // IAST
83
76
  }
84
77
  } else if RE_ITRANS_ONLY.is_match(r_str) {
85
- return 12; // ITRANS
78
+ 12 // ITRANS
86
79
  } else if RE_SLP1_ONLY.is_match(r_str) {
87
- return 13; // SLP1
80
+ 13 // SLP1
88
81
  } else if RE_VELTHUIS_ONLY.is_match(r_str) {
89
- return 14; // Velthuis
82
+ 14 // Velthuis
90
83
  } else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
91
- return 12; // ITRANS
84
+ 12 // ITRANS
92
85
  } else if RE_HARVARD_KYOTO.is_match(r_str) {
93
- return 15; // HK
86
+ 15 // HK
87
+ } else {
88
+ 0 // Unknown
94
89
  }
95
- return 0; // Unknown
96
90
  }
data/src/lib.rs CHANGED
@@ -4,3 +4,33 @@ extern crate regex;
4
4
 
5
5
  #[macro_use] mod rb;
6
6
  pub mod detect;
7
+
8
+ #[cfg(test)]
9
+ mod tests {
10
+ use detect::detect_scheme;
11
+
12
+ #[test]
13
+ fn detect_devanagari() {
14
+ assert_eq!(detect_scheme("नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"), 1);
15
+ }
16
+
17
+ #[test]
18
+ fn detect_malayalam() {
19
+ assert_eq!(detect_scheme("നാനാശാസ്ത്രസുഭാഷിതാമൃതരസൈഃ ശ്രോത്രോത്സവം കുര്വതാം യേഷാം യാന്തി ദിനാനി പണ്ഡിതജനവ്യായാമഖിന്നാത്മനാമ് തേഷാം ജന്മ ച ജീവിതം ച സുകൃതം തൈര് ഏവ ഭൂര് ഭൂഷിതാ ശേഷൈഹ് കിം പശുവദ് വിവേകരഹിതൈര് ഭൂഭാരഭൂതൈര് നരഃ"), 9);
20
+ }
21
+
22
+ #[test]
23
+ fn detect_iast() {
24
+ assert_eq!(detect_scheme("nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"), 10);
25
+ }
26
+
27
+ #[test]
28
+ fn detect_slp1() {
29
+ assert_eq!(detect_scheme("nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"), 13);
30
+ }
31
+
32
+ #[test]
33
+ fn detect_hk() {
34
+ assert_eq!(detect_scheme("nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"), 15);
35
+ }
36
+ }
metadata CHANGED
@@ -1,127 +1,127 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanscript
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Bellefleur
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-01-10 00:00:00.000000000 Z
11
+ date: 2017-11-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
14
+ name: benchmark-ips
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.13'
19
+ version: '2.7'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.13'
26
+ version: '2.7'
27
27
  - !ruby/object:Gem::Dependency
28
- name: rake
28
+ name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '12'
33
+ version: '1.13'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '12'
40
+ version: '1.13'
41
41
  - !ruby/object:Gem::Dependency
42
- name: rspec
42
+ name: coveralls
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '3.5'
47
+ version: '0.8'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '3.5'
54
+ version: '0.8'
55
55
  - !ruby/object:Gem::Dependency
56
- name: coveralls
56
+ name: pry
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0.8'
61
+ version: '0.10'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0.8'
68
+ version: '0.10'
69
69
  - !ruby/object:Gem::Dependency
70
- name: rubocop
70
+ name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0.43'
75
+ version: '12'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0.43'
82
+ version: '12'
83
83
  - !ruby/object:Gem::Dependency
84
- name: rubocop-rspec
84
+ name: rspec
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '1.7'
89
+ version: '3.5'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '1.7'
96
+ version: '3.5'
97
97
  - !ruby/object:Gem::Dependency
98
- name: pry
98
+ name: rubocop
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: '0.10'
103
+ version: '0.51'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: '0.10'
110
+ version: '0.51'
111
111
  - !ruby/object:Gem::Dependency
112
- name: benchmark-ips
112
+ name: rubocop-rspec
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: '2.7'
117
+ version: '1.20'
118
118
  type: :development
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: '2.7'
124
+ version: '1.20'
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: yard
127
127
  requirement: !ruby/object:Gem::Requirement
@@ -142,28 +142,28 @@ dependencies:
142
142
  requirements:
143
143
  - - "~>"
144
144
  - !ruby/object:Gem::Version
145
- version: '0.2'
145
+ version: '0.3'
146
146
  type: :runtime
147
147
  prerelease: false
148
148
  version_requirements: !ruby/object:Gem::Requirement
149
149
  requirements:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
- version: '0.2'
152
+ version: '0.3'
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: thermite
155
155
  requirement: !ruby/object:Gem::Requirement
156
156
  requirements:
157
157
  - - "~>"
158
158
  - !ruby/object:Gem::Version
159
- version: '0.7'
159
+ version: '0.13'
160
160
  type: :runtime
161
161
  prerelease: false
162
162
  version_requirements: !ruby/object:Gem::Requirement
163
163
  requirements:
164
164
  - - "~>"
165
165
  - !ruby/object:Gem::Version
166
- version: '0.7'
166
+ version: '0.13'
167
167
  description:
168
168
  email:
169
169
  - nomoon@phoebus.ca
@@ -222,7 +222,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
222
222
  version: '0'
223
223
  requirements: []
224
224
  rubyforge_project:
225
- rubygems_version: 2.6.8
225
+ rubygems_version: 2.7.2
226
226
  signing_key:
227
227
  specification_version: 4
228
228
  summary: Ruby port and extension of Sanscript.js transliterator by learnsanskrit.org