sanscript 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -3
- data/{rust/Cargo.lock → Cargo.lock} +25 -3
- data/Cargo.toml +16 -0
- data/Rakefile +26 -8
- data/lib/sanscript.rb +25 -0
- data/lib/sanscript/detect.rb +4 -18
- data/lib/sanscript/version.rb +1 -1
- data/sanscript.gemspec +2 -2
- data/src/lib.rs +122 -0
- metadata +11 -12
- data/rust/Cargo.toml +0 -12
- data/rust/extconf.rb +0 -6
- data/rust/src/lib.rs +0 -100
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7faf23075373caded757c32e4e708d85e6e39c10
|
4
|
+
data.tar.gz: 049ea58c3690d4dc6627d6a8a205f19914e69b33
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8bf83a24936460b5331580b094bac0c3aff7e6db4b7839e4fe46b282161ff4d927aa5af8e5fa76afdea3eeb72ecd5353e7f32f15db5bc49bd02153d317ae4e1
|
7
|
+
data.tar.gz: bc16c1526a2d3c89c047c7dbe4b27e5698a7aeb59822505006f2da6e6b7237fb15b7ecd3870ba6b024bb244bcb62912e5ac83cf915e0166f64d9e48d1f4ae77f
|
data/.gitignore
CHANGED
data/{rust/Cargo.lock → Cargo.lock}
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
[root]
|
2
|
-
name = "
|
3
|
-
version = "0.
|
2
|
+
name = "rusty_sanscript"
|
3
|
+
version = "0.2.0"
|
4
4
|
dependencies = [
|
5
5
|
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
6
|
-
"libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
|
7
6
|
"regex 0.1.73 (registry+https://github.com/rust-lang/crates.io-index)",
|
7
|
+
"ruru 0.7.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
8
|
+
"unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
8
9
|
]
|
9
10
|
|
10
11
|
[[package]]
|
@@ -59,6 +60,22 @@ name = "regex-syntax"
|
|
59
60
|
version = "0.3.4"
|
60
61
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
61
62
|
|
63
|
+
[[package]]
|
64
|
+
name = "ruby-sys"
|
65
|
+
version = "0.2.6"
|
66
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
67
|
+
dependencies = [
|
68
|
+
"libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
|
69
|
+
]
|
70
|
+
|
71
|
+
[[package]]
|
72
|
+
name = "ruru"
|
73
|
+
version = "0.7.8"
|
74
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
75
|
+
dependencies = [
|
76
|
+
"ruby-sys 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
77
|
+
]
|
78
|
+
|
62
79
|
[[package]]
|
63
80
|
name = "thread-id"
|
64
81
|
version = "2.0.0"
|
@@ -76,6 +93,11 @@ dependencies = [
|
|
76
93
|
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
77
94
|
]
|
78
95
|
|
96
|
+
[[package]]
|
97
|
+
name = "unicode-normalization"
|
98
|
+
version = "0.1.2"
|
99
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
100
|
+
|
79
101
|
[[package]]
|
80
102
|
name = "utf8-ranges"
|
81
103
|
version = "0.1.3"
|
data/Cargo.toml
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
[package]
|
2
|
+
name = "rusty_sanscript"
|
3
|
+
version = "0.2.0"
|
4
|
+
authors = ["Tim Bellefleur <nomoon@phoebus.ca>"]
|
5
|
+
publish = false
|
6
|
+
|
7
|
+
[dependencies]
|
8
|
+
lazy_static = "^0.2.1"
|
9
|
+
ruru = "^0.7.8"
|
10
|
+
unicode-normalization = "^0.1.2"
|
11
|
+
regex = "^0.1.73"
|
12
|
+
|
13
|
+
[package.metadata.thermite]
|
14
|
+
|
15
|
+
[lib]
|
16
|
+
crate-type = ["dylib"]
|
data/Rakefile
CHANGED
@@ -1,13 +1,31 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
require "bundler/gem_tasks"
|
3
|
-
require "rspec/core/rake_task"
|
4
3
|
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
# Add enhanced optional_build task into Thermite
|
5
|
+
require "thermite/tasks"
|
6
|
+
module Thermite
|
7
|
+
class BetterTasks < Tasks
|
8
|
+
def initialize(options = {})
|
9
|
+
super
|
10
|
+
desc "Run thermite:build task or download binaries, but skip without fail if unavailable."
|
11
|
+
task "thermite:optional_build" do
|
12
|
+
if cargo
|
13
|
+
Rake::Task["thermite:build"].invoke
|
14
|
+
elsif !download_binary
|
15
|
+
puts "Rust and downloadable binaries are not available, skipping."
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
Thermite::BetterTasks.new
|
22
|
+
task default: :"thermite:optional_build"
|
8
23
|
|
9
|
-
|
10
|
-
|
11
|
-
require "
|
12
|
-
|
24
|
+
# Ensure missing RSpec development dependency doesn't kill gem install.
|
25
|
+
begin
|
26
|
+
require "rspec/core/rake_task"
|
27
|
+
RSpec::Core::RakeTask.new(:spec)
|
28
|
+
task default: :spec
|
29
|
+
rescue LoadError
|
30
|
+
nil
|
13
31
|
end
|
data/lib/sanscript.rb
CHANGED
@@ -56,4 +56,29 @@ module Sanscript
|
|
56
56
|
end
|
57
57
|
Transliterate.transliterate(text, from, to, opts)
|
58
58
|
end
|
59
|
+
|
60
|
+
# Override
|
61
|
+
# :nocov:
|
62
|
+
begin
|
63
|
+
require "fiddle"
|
64
|
+
require "thermite/config"
|
65
|
+
|
66
|
+
toplevel_dir = File.dirname(File.dirname(__FILE__))
|
67
|
+
config = Thermite::Config.new(cargo_project_path: toplevel_dir, ruby_project_path: toplevel_dir)
|
68
|
+
library = Fiddle.dlopen(config.ruby_extension_path)
|
69
|
+
module ::RustySanscriptDetect; end # rubocop:disable Style/ClassAndModuleChildren
|
70
|
+
func = Fiddle::Function.new(library["init_rusty_sanscript"],
|
71
|
+
[], Fiddle::TYPE_VOIDP)
|
72
|
+
func.call
|
73
|
+
module Detect
|
74
|
+
extend ::RustySanscriptDetect
|
75
|
+
class << self
|
76
|
+
alias detect_scheme rust_detect_scheme
|
77
|
+
end
|
78
|
+
end
|
79
|
+
RUST_ENABLED = true
|
80
|
+
rescue Fiddle::DLError
|
81
|
+
RUST_ENABLED = false
|
82
|
+
end
|
83
|
+
# :nocov:
|
59
84
|
end
|
data/lib/sanscript/detect.rb
CHANGED
@@ -53,7 +53,8 @@ module Sanscript
|
|
53
53
|
# Attempts to detect the encoding scheme of the provided string.
|
54
54
|
#
|
55
55
|
# Uses the most efficient implementation for your ruby version
|
56
|
-
# (either {Ruby2x#
|
56
|
+
# (either {Ruby2x#ruby_detect_scheme} or {Ruby24#ruby_detect_scheme}) or
|
57
|
+
# the Rust native extension if available.
|
57
58
|
#
|
58
59
|
# @param text [String] a string of Sanskrit text
|
59
60
|
# @return [Symbol, nil] the Symbol of the scheme, or nil if no match
|
@@ -67,24 +68,9 @@ module Sanscript
|
|
67
68
|
require "sanscript/detect/ruby2x"
|
68
69
|
extend Ruby2x
|
69
70
|
end
|
70
|
-
# :nocov:
|
71
|
-
|
72
|
-
# Rust FFI
|
73
71
|
class << self
|
74
|
-
|
75
|
-
require "ffi"
|
76
|
-
extend FFI::Library
|
77
|
-
ffi_lib Dir.glob(File.join(GEM_ROOT, "rust/libsanscript.*")).first
|
78
|
-
attach_function :_rust_detect, :detect, [:string], :int
|
79
|
-
RUST_SCHEMES = %i[devanagari bengali gurmukhi gujarati oriya tamil telugu kannada malayalam iast kolkata itrans slp1 velthuis hk].unshift(nil).freeze
|
80
|
-
private_constant :RUST_SCHEMES
|
81
|
-
def rust_detect_scheme(text)
|
82
|
-
RUST_SCHEMES[_rust_detect(text)]
|
83
|
-
end
|
84
|
-
alias detect_scheme rust_detect_scheme
|
85
|
-
rescue LoadError
|
86
|
-
alias detect_scheme ruby_detect_scheme
|
87
|
-
end
|
72
|
+
alias detect_scheme ruby_detect_scheme
|
88
73
|
end
|
74
|
+
# :nocov:
|
89
75
|
end
|
90
76
|
end
|
data/lib/sanscript/version.rb
CHANGED
data/sanscript.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.bindir = "exe"
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
20
|
spec.require_paths = ["lib"]
|
21
|
-
spec.extensions =
|
21
|
+
spec.extensions = "Rakefile"
|
22
22
|
|
23
23
|
spec.required_ruby_version = "~> 2.2"
|
24
24
|
|
@@ -33,5 +33,5 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.add_development_dependency "yard", "~> 0.9"
|
34
34
|
|
35
35
|
spec.add_runtime_dependency "ragabash", "~> 0.2"
|
36
|
-
spec.add_runtime_dependency "
|
36
|
+
spec.add_runtime_dependency "thermite", "~> 0.5"
|
37
37
|
end
|
data/src/lib.rs
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
#[macro_use] extern crate lazy_static;
|
2
|
+
#[macro_use] extern crate ruru;
|
3
|
+
extern crate unicode_normalization;
|
4
|
+
extern crate regex;
|
5
|
+
|
6
|
+
use ruru::{AnyObject, Class, NilClass, Symbol, RString};
|
7
|
+
use ruru::traits::Object;
|
8
|
+
#[allow(unused_imports)] use unicode_normalization::UnicodeNormalization;
|
9
|
+
use regex::Regex;
|
10
|
+
|
11
|
+
methods! {
|
12
|
+
Class,
|
13
|
+
_itself,
|
14
|
+
fn detect(s: RString) -> AnyObject {
|
15
|
+
lazy_static! {
|
16
|
+
// # Match escaped control characters
|
17
|
+
static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|\##|\#})").unwrap();
|
18
|
+
|
19
|
+
// # Match ##...## or {#...#} control blocks.
|
20
|
+
static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();
|
21
|
+
|
22
|
+
// Match any character in the block of Brahmic scripts
|
23
|
+
// between Devanagari and Malayalam.
|
24
|
+
static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
|
25
|
+
|
26
|
+
// Match on special Roman characters
|
27
|
+
static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
|
28
|
+
|
29
|
+
// Match on Kolkata-specific Roman characters
|
30
|
+
static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
|
31
|
+
|
32
|
+
// Match on ITRANS-only
|
33
|
+
static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
|
34
|
+
|
35
|
+
// Match on SLP1-only characters and bigrams
|
36
|
+
static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();
|
37
|
+
|
38
|
+
// Match on Velthuis-only characters
|
39
|
+
static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();
|
40
|
+
|
41
|
+
// Match on chars shared by ITRANS and Velthuis
|
42
|
+
static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();
|
43
|
+
|
44
|
+
// Match on characters available in Harvard-Kyoto
|
45
|
+
static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();
|
46
|
+
|
47
|
+
static ref _DEVANAGARI: Symbol = Symbol::new("devanagari");
|
48
|
+
static ref _BENGALI: Symbol = Symbol::new("bengali");
|
49
|
+
static ref _GURMUKHI: Symbol = Symbol::new("gurmukhi");
|
50
|
+
static ref _GUJARATI: Symbol = Symbol::new("gujarati");
|
51
|
+
static ref _ORIYA: Symbol = Symbol::new("oriya");
|
52
|
+
static ref _TAMIL: Symbol = Symbol::new("tamil");
|
53
|
+
static ref _TELUGU: Symbol = Symbol::new("telugu");
|
54
|
+
static ref _KANNADA: Symbol = Symbol::new("kannada");
|
55
|
+
static ref _MALAYALAM: Symbol = Symbol::new("malayalam");
|
56
|
+
static ref _IAST: Symbol = Symbol::new("iast");
|
57
|
+
static ref _KOLKATA: Symbol = Symbol::new("kolkata");
|
58
|
+
static ref _ITRANS: Symbol = Symbol::new("itrans");
|
59
|
+
static ref _SLP1: Symbol = Symbol::new("slp1");
|
60
|
+
static ref _VELTHUIS: Symbol = Symbol::new("velthuis");
|
61
|
+
static ref _HK: Symbol = Symbol::new("hk");
|
62
|
+
static ref _NIL: NilClass = NilClass::new();
|
63
|
+
}
|
64
|
+
|
65
|
+
let r_replaced_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(&s.to_string(), "");
|
66
|
+
let r_str = &RE_CONTROL_BLOCK.replace_all(r_replaced_str, "");
|
67
|
+
|
68
|
+
// Brahmic schemes are all within a specific range of code points.
|
69
|
+
let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
|
70
|
+
if brahmic_match != None {
|
71
|
+
let brahmic_match = brahmic_match.unwrap();
|
72
|
+
let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as u32;
|
73
|
+
|
74
|
+
if brahmic_codepoint < 0x0980 {
|
75
|
+
return _DEVANAGARI.to_any_object();
|
76
|
+
} else if brahmic_codepoint < 0x0A00 {
|
77
|
+
return _BENGALI.to_any_object();
|
78
|
+
} else if brahmic_codepoint < 0x0A80 {
|
79
|
+
return _GURMUKHI.to_any_object();
|
80
|
+
} else if brahmic_codepoint < 0x0B00 {
|
81
|
+
return _GUJARATI.to_any_object();
|
82
|
+
} else if brahmic_codepoint < 0x0B80 {
|
83
|
+
return _ORIYA.to_any_object();
|
84
|
+
} else if brahmic_codepoint < 0x0C00 {
|
85
|
+
return _TAMIL.to_any_object();
|
86
|
+
} else if brahmic_codepoint < 0x0C80 {
|
87
|
+
return _TELUGU.to_any_object();
|
88
|
+
} else if brahmic_codepoint < 0x0D00 {
|
89
|
+
return _KANNADA.to_any_object();
|
90
|
+
} else {
|
91
|
+
return _MALAYALAM.to_any_object();
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
// Romanizations
|
96
|
+
if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
|
97
|
+
if RE_KOLKATA_ONLY.is_match(r_str) {
|
98
|
+
return _KOLKATA.to_any_object();
|
99
|
+
} else {
|
100
|
+
return _IAST.to_any_object();
|
101
|
+
}
|
102
|
+
} else if RE_ITRANS_ONLY.is_match(r_str) {
|
103
|
+
return _ITRANS.to_any_object();
|
104
|
+
} else if RE_SLP1_ONLY.is_match(r_str) {
|
105
|
+
return _SLP1.to_any_object();
|
106
|
+
} else if RE_VELTHUIS_ONLY.is_match(r_str) {
|
107
|
+
return _VELTHUIS.to_any_object();
|
108
|
+
} else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
|
109
|
+
return _ITRANS.to_any_object();
|
110
|
+
} else if RE_HARVARD_KYOTO.is_match(r_str) {
|
111
|
+
return _HK.to_any_object();
|
112
|
+
}
|
113
|
+
return _NIL.to_any_object();
|
114
|
+
}
|
115
|
+
}
|
116
|
+
|
117
|
+
#[no_mangle]
|
118
|
+
pub extern fn init_rusty_sanscript() {
|
119
|
+
Class::from_existing("RustySanscriptDetect").define(|itself| {
|
120
|
+
itself.def("rust_detect_scheme", detect);
|
121
|
+
});
|
122
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-08-
|
11
|
+
date: 2016-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -151,25 +151,25 @@ dependencies:
|
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: '0.2'
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
|
-
name:
|
154
|
+
name: thermite
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
|
-
- - "
|
157
|
+
- - "~>"
|
158
158
|
- !ruby/object:Gem::Version
|
159
|
-
version: '0'
|
159
|
+
version: '0.5'
|
160
160
|
type: :runtime
|
161
161
|
prerelease: false
|
162
162
|
version_requirements: !ruby/object:Gem::Requirement
|
163
163
|
requirements:
|
164
|
-
- - "
|
164
|
+
- - "~>"
|
165
165
|
- !ruby/object:Gem::Version
|
166
|
-
version: '0'
|
166
|
+
version: '0.5'
|
167
167
|
description:
|
168
168
|
email:
|
169
169
|
- nomoon@phoebus.ca
|
170
170
|
executables: []
|
171
171
|
extensions:
|
172
|
-
-
|
172
|
+
- Rakefile
|
173
173
|
extra_rdoc_files: []
|
174
174
|
files:
|
175
175
|
- ".codeclimate.yml"
|
@@ -178,6 +178,8 @@ files:
|
|
178
178
|
- ".rubocop.yml"
|
179
179
|
- ".travis.yml"
|
180
180
|
- CODE_OF_CONDUCT.md
|
181
|
+
- Cargo.lock
|
182
|
+
- Cargo.toml
|
181
183
|
- Gemfile
|
182
184
|
- LICENSE.txt
|
183
185
|
- README.md
|
@@ -193,11 +195,8 @@ files:
|
|
193
195
|
- lib/sanscript/transliterate.rb
|
194
196
|
- lib/sanscript/transliterate/schemes.rb
|
195
197
|
- lib/sanscript/version.rb
|
196
|
-
- rust/Cargo.lock
|
197
|
-
- rust/Cargo.toml
|
198
|
-
- rust/extconf.rb
|
199
|
-
- rust/src/lib.rs
|
200
198
|
- sanscript.gemspec
|
199
|
+
- src/lib.rs
|
201
200
|
homepage: https://github.com/ubcsanskrit/sanscript.rb
|
202
201
|
licenses:
|
203
202
|
- MIT
|
data/rust/Cargo.toml
DELETED
data/rust/extconf.rb
DELETED
@@ -1,6 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
require "mkmf"
|
3
|
-
unless !system("cargo --version") || !system("rustc --version")
|
4
|
-
create_makefile("sanscript")
|
5
|
-
File.write("Makefile", "all:\n\tcargo build --release\n\nclean:\n\trm -rf target\n\ninstall:\n\tmv target/release/libsanscript.* .\n\trm -rf target\n")
|
6
|
-
end
|
data/rust/src/lib.rs
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
extern crate libc;
|
2
|
-
#[macro_use] extern crate lazy_static;
|
3
|
-
extern crate regex;
|
4
|
-
|
5
|
-
use libc::{c_char, int32_t};
|
6
|
-
use std::ffi::CStr;
|
7
|
-
use std::str;
|
8
|
-
use regex::Regex;
|
9
|
-
|
10
|
-
#[no_mangle]
|
11
|
-
pub extern fn detect(s: *const c_char) -> int32_t {
|
12
|
-
let c_str = unsafe {
|
13
|
-
assert!(!s.is_null());
|
14
|
-
|
15
|
-
CStr::from_ptr(s)
|
16
|
-
};
|
17
|
-
|
18
|
-
lazy_static! {
|
19
|
-
// # Match escaped control characters
|
20
|
-
static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|\##|\#})").unwrap();
|
21
|
-
|
22
|
-
// # Match ##...## or {#...#} control blocks.
|
23
|
-
static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();
|
24
|
-
|
25
|
-
// Match any character in the block of Brahmic scripts
|
26
|
-
// between Devanagari and Malayalam.
|
27
|
-
static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
|
28
|
-
|
29
|
-
// Match on special Roman characters
|
30
|
-
static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
|
31
|
-
|
32
|
-
// Match on Kolkata-specific Roman characters
|
33
|
-
static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
|
34
|
-
|
35
|
-
// Match on ITRANS-only
|
36
|
-
static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
|
37
|
-
|
38
|
-
// Match on SLP1-only characters and bigrams
|
39
|
-
static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();
|
40
|
-
|
41
|
-
// Match on Velthuis-only characters
|
42
|
-
static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();
|
43
|
-
|
44
|
-
// Match on chars shared by ITRANS and Velthuis
|
45
|
-
static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();
|
46
|
-
|
47
|
-
// Match on characters available in Harvard-Kyoto
|
48
|
-
static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();
|
49
|
-
}
|
50
|
-
|
51
|
-
let r_replaced_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(c_str.to_str().unwrap(), "");
|
52
|
-
let r_str = &RE_CONTROL_BLOCK.replace_all(r_replaced_str, "");
|
53
|
-
|
54
|
-
// Brahmic schemes are all within a specific range of code points.
|
55
|
-
let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
|
56
|
-
if brahmic_match != None {
|
57
|
-
let brahmic_match = brahmic_match.unwrap();
|
58
|
-
let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as u32;
|
59
|
-
|
60
|
-
if brahmic_codepoint < 0x0980 {
|
61
|
-
return 1;
|
62
|
-
} else if brahmic_codepoint < 0x0A00 {
|
63
|
-
return 2;
|
64
|
-
} else if brahmic_codepoint < 0x0A80 {
|
65
|
-
return 3;
|
66
|
-
} else if brahmic_codepoint < 0x0B00 {
|
67
|
-
return 4;
|
68
|
-
} else if brahmic_codepoint < 0x0B80 {
|
69
|
-
return 5;
|
70
|
-
} else if brahmic_codepoint < 0x0C00 {
|
71
|
-
return 6;
|
72
|
-
} else if brahmic_codepoint < 0x0C80 {
|
73
|
-
return 7;
|
74
|
-
} else if brahmic_codepoint < 0x0D00 {
|
75
|
-
return 8;
|
76
|
-
} else {
|
77
|
-
return 9;
|
78
|
-
}
|
79
|
-
}
|
80
|
-
|
81
|
-
// Romanizations
|
82
|
-
if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
|
83
|
-
if RE_KOLKATA_ONLY.is_match(r_str) {
|
84
|
-
return 11;
|
85
|
-
} else {
|
86
|
-
return 10;
|
87
|
-
}
|
88
|
-
} else if RE_ITRANS_ONLY.is_match(r_str) {
|
89
|
-
return 12;
|
90
|
-
} else if RE_SLP1_ONLY.is_match(r_str) {
|
91
|
-
return 13;
|
92
|
-
} else if RE_VELTHUIS_ONLY.is_match(r_str) {
|
93
|
-
return 14;
|
94
|
-
} else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
|
95
|
-
return 12;
|
96
|
-
} else if RE_HARVARD_KYOTO.is_match(r_str) {
|
97
|
-
return 15;
|
98
|
-
}
|
99
|
-
return 0;
|
100
|
-
}
|