sanscript 0.6.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -3
- data/{rust/Cargo.lock → Cargo.lock} +25 -3
- data/Cargo.toml +16 -0
- data/Rakefile +26 -8
- data/lib/sanscript.rb +25 -0
- data/lib/sanscript/detect.rb +4 -18
- data/lib/sanscript/version.rb +1 -1
- data/sanscript.gemspec +2 -2
- data/src/lib.rs +122 -0
- metadata +11 -12
- data/rust/Cargo.toml +0 -12
- data/rust/extconf.rb +0 -6
- data/rust/src/lib.rs +0 -100
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7faf23075373caded757c32e4e708d85e6e39c10
|
4
|
+
data.tar.gz: 049ea58c3690d4dc6627d6a8a205f19914e69b33
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8bf83a24936460b5331580b094bac0c3aff7e6db4b7839e4fe46b282161ff4d927aa5af8e5fa76afdea3eeb72ecd5353e7f32f15db5bc49bd02153d317ae4e1
|
7
|
+
data.tar.gz: bc16c1526a2d3c89c047c7dbe4b27e5698a7aeb59822505006f2da6e6b7237fb15b7ecd3870ba6b024bb244bcb62912e5ac83cf915e0166f64d9e48d1f4ae77f
|
data/.gitignore
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
[root]
|
2
|
-
name = "
|
3
|
-
version = "0.
|
2
|
+
name = "rusty_sanscript"
|
3
|
+
version = "0.2.0"
|
4
4
|
dependencies = [
|
5
5
|
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
6
|
-
"libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
|
7
6
|
"regex 0.1.73 (registry+https://github.com/rust-lang/crates.io-index)",
|
7
|
+
"ruru 0.7.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
8
|
+
"unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
8
9
|
]
|
9
10
|
|
10
11
|
[[package]]
|
@@ -59,6 +60,22 @@ name = "regex-syntax"
|
|
59
60
|
version = "0.3.4"
|
60
61
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
61
62
|
|
63
|
+
[[package]]
|
64
|
+
name = "ruby-sys"
|
65
|
+
version = "0.2.6"
|
66
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
67
|
+
dependencies = [
|
68
|
+
"libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
|
69
|
+
]
|
70
|
+
|
71
|
+
[[package]]
|
72
|
+
name = "ruru"
|
73
|
+
version = "0.7.8"
|
74
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
75
|
+
dependencies = [
|
76
|
+
"ruby-sys 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
77
|
+
]
|
78
|
+
|
62
79
|
[[package]]
|
63
80
|
name = "thread-id"
|
64
81
|
version = "2.0.0"
|
@@ -76,6 +93,11 @@ dependencies = [
|
|
76
93
|
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
77
94
|
]
|
78
95
|
|
96
|
+
[[package]]
|
97
|
+
name = "unicode-normalization"
|
98
|
+
version = "0.1.2"
|
99
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
100
|
+
|
79
101
|
[[package]]
|
80
102
|
name = "utf8-ranges"
|
81
103
|
version = "0.1.3"
|
data/Cargo.toml
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
[package]
|
2
|
+
name = "rusty_sanscript"
|
3
|
+
version = "0.2.0"
|
4
|
+
authors = ["Tim Bellefleur <nomoon@phoebus.ca>"]
|
5
|
+
publish = false
|
6
|
+
|
7
|
+
[dependencies]
|
8
|
+
lazy_static = "^0.2.1"
|
9
|
+
ruru = "^0.7.8"
|
10
|
+
unicode-normalization = "^0.1.2"
|
11
|
+
regex = "^0.1.73"
|
12
|
+
|
13
|
+
[package.metadata.thermite]
|
14
|
+
|
15
|
+
[lib]
|
16
|
+
crate-type = ["dylib"]
|
data/Rakefile
CHANGED
@@ -1,13 +1,31 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
require "bundler/gem_tasks"
|
3
|
-
require "rspec/core/rake_task"
|
4
3
|
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
# Add enhanced optional_build task into Thermite
|
5
|
+
require "thermite/tasks"
|
6
|
+
module Thermite
  # Thermite task set that additionally registers a forgiving
  # `thermite:optional_build` task.
  class BetterTasks < Tasks
    # Registers the regular Thermite tasks (via +super+) and then defines
    # `thermite:optional_build`.
    #
    # @param options [Hash] forwarded unchanged to the superclass initializer
    def initialize(options = {})
      super
      desc "Run thermite:build task or download binaries, but skip without fail if unavailable."
      task "thermite:optional_build" do
        # Preferred path: `cargo` is truthy, so run the real build task.
        next Rake::Task["thermite:build"].invoke if cargo
        # Otherwise try the binary download; only report when that fails too.
        next if download_binary

        puts "Rust and downloadable binaries are not available, skipping."
      end
    end
  end
end
|
21
|
+
Thermite::BetterTasks.new
|
22
|
+
task default: :"thermite:optional_build"
|
8
23
|
|
9
|
-
|
10
|
-
|
11
|
-
require "
|
12
|
-
|
24
|
+
# Ensure missing RSpec development dependency doesn't kill gem install.
|
25
|
+
begin
|
26
|
+
require "rspec/core/rake_task"
|
27
|
+
RSpec::Core::RakeTask.new(:spec)
|
28
|
+
task default: :spec
|
29
|
+
rescue LoadError
|
30
|
+
nil
|
13
31
|
end
|
data/lib/sanscript.rb
CHANGED
@@ -56,4 +56,29 @@ module Sanscript
|
|
56
56
|
end
|
57
57
|
Transliterate.transliterate(text, from, to, opts)
|
58
58
|
end
|
59
|
+
|
60
|
+
# Override Detect.detect_scheme with the Rust native extension when it can be loaded.
|
61
|
+
# :nocov:
|
62
|
+
# Try to load the compiled Rust extension and, when that succeeds, make
# Detect.detect_scheme use the Rust implementation. RUST_ENABLED records
# which path was taken. Any failure to load leaves the pure-Ruby
# detect_scheme in place.
begin
  require "fiddle"
  require "thermite/config"

  toplevel_dir = File.dirname(File.dirname(__FILE__))
  config = Thermite::Config.new(cargo_project_path: toplevel_dir, ruby_project_path: toplevel_dir)
  library = Fiddle.dlopen(config.ruby_extension_path)
  # The Rust init function looks this constant up by name, so it must exist
  # before the init function is called.
  module ::RustySanscriptDetect; end # rubocop:disable Style/ClassAndModuleChildren
  func = Fiddle::Function.new(library["init_rusty_sanscript"],
                              [], Fiddle::TYPE_VOIDP)
  func.call
  module Detect
    extend ::RustySanscriptDetect
    class << self
      alias detect_scheme rust_detect_scheme
    end
  end
  RUST_ENABLED = true
rescue LoadError
  # fiddle or thermite could not be required. This clause must come before
  # the Fiddle::DLError clause: if Fiddle itself failed to load, evaluating
  # `Fiddle::DLError` would raise NameError.
  RUST_ENABLED = false
rescue Fiddle::DLError
  # The shared library or its init symbol is missing.
  RUST_ENABLED = false
end
|
83
|
+
# :nocov:
|
59
84
|
end
|
data/lib/sanscript/detect.rb
CHANGED
@@ -53,7 +53,8 @@ module Sanscript
|
|
53
53
|
# Attempts to detect the encoding scheme of the provided string.
|
54
54
|
#
|
55
55
|
# Uses the most efficient implementation for your ruby version
|
56
|
-
# (either {Ruby2x#
|
56
|
+
# (either {Ruby2x#ruby_detect_scheme} or {Ruby24#ruby_detect_scheme}) or
|
57
|
+
# the Rust native extension if available.
|
57
58
|
#
|
58
59
|
# @param text [String] a string of Sanskrit text
|
59
60
|
# @return [Symbol, nil] the Symbol of the scheme, or nil if no match
|
@@ -67,24 +68,9 @@ module Sanscript
|
|
67
68
|
require "sanscript/detect/ruby2x"
|
68
69
|
extend Ruby2x
|
69
70
|
end
|
70
|
-
# :nocov:
|
71
|
-
|
72
|
-
# Rust FFI
|
73
71
|
class << self
|
74
|
-
|
75
|
-
require "ffi"
|
76
|
-
extend FFI::Library
|
77
|
-
ffi_lib Dir.glob(File.join(GEM_ROOT, "rust/libsanscript.*")).first
|
78
|
-
attach_function :_rust_detect, :detect, [:string], :int
|
79
|
-
RUST_SCHEMES = %i[devanagari bengali gurmukhi gujarati oriya tamil telugu kannada malayalam iast kolkata itrans slp1 velthuis hk].unshift(nil).freeze
|
80
|
-
private_constant :RUST_SCHEMES
|
81
|
-
def rust_detect_scheme(text)
|
82
|
-
RUST_SCHEMES[_rust_detect(text)]
|
83
|
-
end
|
84
|
-
alias detect_scheme rust_detect_scheme
|
85
|
-
rescue LoadError
|
86
|
-
alias detect_scheme ruby_detect_scheme
|
87
|
-
end
|
72
|
+
alias detect_scheme ruby_detect_scheme
|
88
73
|
end
|
74
|
+
# :nocov:
|
89
75
|
end
|
90
76
|
end
|
data/lib/sanscript/version.rb
CHANGED
data/sanscript.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.bindir = "exe"
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
20
|
spec.require_paths = ["lib"]
|
21
|
-
spec.extensions =
|
21
|
+
spec.extensions = "Rakefile"
|
22
22
|
|
23
23
|
spec.required_ruby_version = "~> 2.2"
|
24
24
|
|
@@ -33,5 +33,5 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.add_development_dependency "yard", "~> 0.9"
|
34
34
|
|
35
35
|
spec.add_runtime_dependency "ragabash", "~> 0.2"
|
36
|
-
spec.add_runtime_dependency "
|
36
|
+
spec.add_runtime_dependency "thermite", "~> 0.5"
|
37
37
|
end
|
data/src/lib.rs
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
#[macro_use] extern crate lazy_static;
|
2
|
+
#[macro_use] extern crate ruru;
|
3
|
+
extern crate unicode_normalization;
|
4
|
+
extern crate regex;
|
5
|
+
|
6
|
+
use ruru::{AnyObject, Class, NilClass, Symbol, RString};
|
7
|
+
use ruru::traits::Object;
|
8
|
+
#[allow(unused_imports)] use unicode_normalization::UnicodeNormalization;
|
9
|
+
use regex::Regex;
|
10
|
+
|
11
|
+
// Ruby-callable scheme detector.
//
// `detect` receives a Ruby String, strips escaped control characters and
// `##...##` / `{#...#}` control blocks, and returns the Symbol naming the
// detected scheme, or nil when nothing matches.
methods! {
    Class,
    _itself,
    fn detect(s: RString) -> AnyObject {
        lazy_static! {
            // Match escaped control characters
            static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|\##|\#})").unwrap();

            // Match ##...## or {#...#} control blocks.
            static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();

            // Match any character in the block of Brahmic scripts
            // between Devanagari and Malayalam.
            static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();

            // Match on special Roman characters
            static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();

            // Match on Kolkata-specific Roman characters
            static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();

            // Match on ITRANS-only
            static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();

            // Match on SLP1-only characters and bigrams
            static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();

            // Match on Velthuis-only characters
            static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();

            // Match on chars shared by ITRANS and Velthuis
            static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();

            // Match on characters available in Harvard-Kyoto
            static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();

            // Result values, created once and reused on every call.
            static ref _DEVANAGARI: Symbol = Symbol::new("devanagari");
            static ref _BENGALI: Symbol = Symbol::new("bengali");
            static ref _GURMUKHI: Symbol = Symbol::new("gurmukhi");
            static ref _GUJARATI: Symbol = Symbol::new("gujarati");
            static ref _ORIYA: Symbol = Symbol::new("oriya");
            static ref _TAMIL: Symbol = Symbol::new("tamil");
            static ref _TELUGU: Symbol = Symbol::new("telugu");
            static ref _KANNADA: Symbol = Symbol::new("kannada");
            static ref _MALAYALAM: Symbol = Symbol::new("malayalam");
            static ref _IAST: Symbol = Symbol::new("iast");
            static ref _KOLKATA: Symbol = Symbol::new("kolkata");
            static ref _ITRANS: Symbol = Symbol::new("itrans");
            static ref _SLP1: Symbol = Symbol::new("slp1");
            static ref _VELTHUIS: Symbol = Symbol::new("velthuis");
            static ref _HK: Symbol = Symbol::new("hk");
            static ref _NIL: NilClass = NilClass::new();
        }

        // Strip escaped control characters first, then whole control blocks.
        let r_replaced_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(&s.to_string(), "");
        let r_str = &RE_CONTROL_BLOCK.replace_all(r_replaced_str, "");

        // Brahmic schemes are all within a specific range of code points.
        // `find` reports BYTE offsets, so slice at the byte offset and take
        // the first char there. Using the offset as a char index (as
        // `chars().nth(offset)` would) picks the wrong character, or none
        // at all, whenever multi-byte text precedes the match.
        let brahmic_char = RE_BRAHMIC_RANGE
            .find(r_str)
            .and_then(|(start, _)| r_str[start..].chars().next());

        if let Some(first_brahmic) = brahmic_char {
            let brahmic_codepoint = first_brahmic as u32;

            if brahmic_codepoint < 0x0980 {
                return _DEVANAGARI.to_any_object();
            } else if brahmic_codepoint < 0x0A00 {
                return _BENGALI.to_any_object();
            } else if brahmic_codepoint < 0x0A80 {
                return _GURMUKHI.to_any_object();
            } else if brahmic_codepoint < 0x0B00 {
                return _GUJARATI.to_any_object();
            } else if brahmic_codepoint < 0x0B80 {
                return _ORIYA.to_any_object();
            } else if brahmic_codepoint < 0x0C00 {
                return _TAMIL.to_any_object();
            } else if brahmic_codepoint < 0x0C80 {
                return _TELUGU.to_any_object();
            } else if brahmic_codepoint < 0x0D00 {
                return _KANNADA.to_any_object();
            } else {
                return _MALAYALAM.to_any_object();
            }
        }

        // Romanizations
        if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
            if RE_KOLKATA_ONLY.is_match(r_str) {
                return _KOLKATA.to_any_object();
            } else {
                return _IAST.to_any_object();
            }
        } else if RE_ITRANS_ONLY.is_match(r_str) {
            return _ITRANS.to_any_object();
        } else if RE_SLP1_ONLY.is_match(r_str) {
            return _SLP1.to_any_object();
        } else if RE_VELTHUIS_ONLY.is_match(r_str) {
            return _VELTHUIS.to_any_object();
        } else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
            return _ITRANS.to_any_object();
        } else if RE_HARVARD_KYOTO.is_match(r_str) {
            return _HK.to_any_object();
        }
        return _NIL.to_any_object();
    }
}
|
116
|
+
|
117
|
+
#[no_mangle]
|
118
|
+
pub extern fn init_rusty_sanscript() {
|
119
|
+
Class::from_existing("RustySanscriptDetect").define(|itself| {
|
120
|
+
itself.def("rust_detect_scheme", detect);
|
121
|
+
});
|
122
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-08-
|
11
|
+
date: 2016-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -151,25 +151,25 @@ dependencies:
|
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: '0.2'
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
|
-
name:
|
154
|
+
name: thermite
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
|
-
- - "
|
157
|
+
- - "~>"
|
158
158
|
- !ruby/object:Gem::Version
|
159
|
-
version: '0'
|
159
|
+
version: '0.5'
|
160
160
|
type: :runtime
|
161
161
|
prerelease: false
|
162
162
|
version_requirements: !ruby/object:Gem::Requirement
|
163
163
|
requirements:
|
164
|
-
- - "
|
164
|
+
- - "~>"
|
165
165
|
- !ruby/object:Gem::Version
|
166
|
-
version: '0'
|
166
|
+
version: '0.5'
|
167
167
|
description:
|
168
168
|
email:
|
169
169
|
- nomoon@phoebus.ca
|
170
170
|
executables: []
|
171
171
|
extensions:
|
172
|
-
-
|
172
|
+
- Rakefile
|
173
173
|
extra_rdoc_files: []
|
174
174
|
files:
|
175
175
|
- ".codeclimate.yml"
|
@@ -178,6 +178,8 @@ files:
|
|
178
178
|
- ".rubocop.yml"
|
179
179
|
- ".travis.yml"
|
180
180
|
- CODE_OF_CONDUCT.md
|
181
|
+
- Cargo.lock
|
182
|
+
- Cargo.toml
|
181
183
|
- Gemfile
|
182
184
|
- LICENSE.txt
|
183
185
|
- README.md
|
@@ -193,11 +195,8 @@ files:
|
|
193
195
|
- lib/sanscript/transliterate.rb
|
194
196
|
- lib/sanscript/transliterate/schemes.rb
|
195
197
|
- lib/sanscript/version.rb
|
196
|
-
- rust/Cargo.lock
|
197
|
-
- rust/Cargo.toml
|
198
|
-
- rust/extconf.rb
|
199
|
-
- rust/src/lib.rs
|
200
198
|
- sanscript.gemspec
|
199
|
+
- src/lib.rs
|
201
200
|
homepage: https://github.com/ubcsanskrit/sanscript.rb
|
202
201
|
licenses:
|
203
202
|
- MIT
|
data/rust/Cargo.toml
DELETED
data/rust/extconf.rb
DELETED
@@ -1,6 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
require "mkmf"
|
3
|
-
unless !system("cargo --version") || !system("rustc --version")
|
4
|
-
create_makefile("sanscript")
|
5
|
-
File.write("Makefile", "all:\n\tcargo build --release\n\nclean:\n\trm -rf target\n\ninstall:\n\tmv target/release/libsanscript.* .\n\trm -rf target\n")
|
6
|
-
end
|
data/rust/src/lib.rs
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
extern crate libc;
|
2
|
-
#[macro_use] extern crate lazy_static;
|
3
|
-
extern crate regex;
|
4
|
-
|
5
|
-
use libc::{c_char, int32_t};
|
6
|
-
use std::ffi::CStr;
|
7
|
-
use std::str;
|
8
|
-
use regex::Regex;
|
9
|
-
|
10
|
-
#[no_mangle]
|
11
|
-
pub extern fn detect(s: *const c_char) -> int32_t {
|
12
|
-
let c_str = unsafe {
|
13
|
-
assert!(!s.is_null());
|
14
|
-
|
15
|
-
CStr::from_ptr(s)
|
16
|
-
};
|
17
|
-
|
18
|
-
lazy_static! {
|
19
|
-
// # Match escaped control characters
|
20
|
-
static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|\##|\#})").unwrap();
|
21
|
-
|
22
|
-
// # Match ##...## or {#...#} control blocks.
|
23
|
-
static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();
|
24
|
-
|
25
|
-
// Match any character in the block of Brahmic scripts
|
26
|
-
// between Devanagari and Malayalam.
|
27
|
-
static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
|
28
|
-
|
29
|
-
// Match on special Roman characters
|
30
|
-
static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
|
31
|
-
|
32
|
-
// Match on Kolkata-specific Roman characters
|
33
|
-
static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
|
34
|
-
|
35
|
-
// Match on ITRANS-only
|
36
|
-
static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
|
37
|
-
|
38
|
-
// Match on SLP1-only characters and bigrams
|
39
|
-
static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();
|
40
|
-
|
41
|
-
// Match on Velthuis-only characters
|
42
|
-
static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();
|
43
|
-
|
44
|
-
// Match on chars shared by ITRANS and Velthuis
|
45
|
-
static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();
|
46
|
-
|
47
|
-
// Match on characters available in Harvard-Kyoto
|
48
|
-
static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();
|
49
|
-
}
|
50
|
-
|
51
|
-
let r_replaced_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(c_str.to_str().unwrap(), "");
|
52
|
-
let r_str = &RE_CONTROL_BLOCK.replace_all(r_replaced_str, "");
|
53
|
-
|
54
|
-
// Brahmic schemes are all within a specific range of code points.
|
55
|
-
let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
|
56
|
-
if brahmic_match != None {
|
57
|
-
let brahmic_match = brahmic_match.unwrap();
|
58
|
-
let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as u32;
|
59
|
-
|
60
|
-
if brahmic_codepoint < 0x0980 {
|
61
|
-
return 1;
|
62
|
-
} else if brahmic_codepoint < 0x0A00 {
|
63
|
-
return 2;
|
64
|
-
} else if brahmic_codepoint < 0x0A80 {
|
65
|
-
return 3;
|
66
|
-
} else if brahmic_codepoint < 0x0B00 {
|
67
|
-
return 4;
|
68
|
-
} else if brahmic_codepoint < 0x0B80 {
|
69
|
-
return 5;
|
70
|
-
} else if brahmic_codepoint < 0x0C00 {
|
71
|
-
return 6;
|
72
|
-
} else if brahmic_codepoint < 0x0C80 {
|
73
|
-
return 7;
|
74
|
-
} else if brahmic_codepoint < 0x0D00 {
|
75
|
-
return 8;
|
76
|
-
} else {
|
77
|
-
return 9;
|
78
|
-
}
|
79
|
-
}
|
80
|
-
|
81
|
-
// Romanizations
|
82
|
-
if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
|
83
|
-
if RE_KOLKATA_ONLY.is_match(r_str) {
|
84
|
-
return 11;
|
85
|
-
} else {
|
86
|
-
return 10;
|
87
|
-
}
|
88
|
-
} else if RE_ITRANS_ONLY.is_match(r_str) {
|
89
|
-
return 12;
|
90
|
-
} else if RE_SLP1_ONLY.is_match(r_str) {
|
91
|
-
return 13;
|
92
|
-
} else if RE_VELTHUIS_ONLY.is_match(r_str) {
|
93
|
-
return 14;
|
94
|
-
} else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
|
95
|
-
return 12;
|
96
|
-
} else if RE_HARVARD_KYOTO.is_match(r_str) {
|
97
|
-
return 15;
|
98
|
-
}
|
99
|
-
return 0;
|
100
|
-
}
|