sanscript 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +1 -0
- data/README.md +5 -3
- data/Rakefile +6 -0
- data/lib/sanscript/detect.rb +18 -2
- data/lib/sanscript/detect/ruby24.rb +1 -1
- data/lib/sanscript/detect/ruby2x.rb +1 -1
- data/lib/sanscript/version.rb +4 -1
- data/rust/Cargo.lock +93 -0
- data/rust/Cargo.toml +12 -0
- data/rust/extconf.rb +6 -0
- data/rust/src/lib.rs +100 -0
- data/sanscript.gemspec +5 -1
- metadata +52 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dd5f98881a344f630109b020f7614bf3c860a919
|
4
|
+
data.tar.gz: e3a1353bf994c3aabee9b779cc35e70c98f70bb3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 925e8af64d8eeed22da8b35ffa3b569b80da03956b419bc5055c7bfafc6e52f80dffbdf810f7c538ef50ec103df9431c334295f84811f861d5a78d88a30dc2c5
|
7
|
+
data.tar.gz: 82d5cba98f7a7f38899df4bdc20a72597882743ed7fc2a484da9c2d96097dea075cbcfa433b3db20a0ef15075a962d0022851c250a80f337e218d4f2411dc31d
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/README.md
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
# Sanscript.rb
|
2
2
|
|
3
|
+
[Gem Version](https://badge.fury.io/rb/sanscript)
|
4
|
+
[Dependency Status](https://gemnasium.com/github.com/ubcsanskrit/sanscript.rb)
|
3
5
|
[Build Status](https://travis-ci.org/ubcsanskrit/sanscript.rb)
|
4
|
-
[](https://codeclimate.com/github/ubcsanskrit/sanscript.rb)
|
5
6
|
[Test Coverage](https://codeclimate.com/github/ubcsanskrit/sanscript.rb/coverage)
|
6
|
-
[](https://codeclimate.com/github/ubcsanskrit/sanscript.rb)
|
8
|
+
[Inline docs](http://inch-ci.org/github/ubcsanskrit/sanscript.rb)
|
7
9
|
|
8
10
|
This gem is starting off as a mostly-straightforward port of [learnsanskrit.org's Sanscript.js](https://github.com/sanskrit/sanscript.js), and will go from there. It also incorporates transliteration scheme detection based on [learnsanskrit.org's Detect.js](https://github.com/sanskrit/detect.js).
|
9
11
|
|
@@ -33,7 +35,7 @@ Documentation is provided in YARD format and available online at [rubydoc.info](
|
|
33
35
|
|
34
36
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
35
37
|
|
36
|
-
To install this gem onto your local machine, run `bundle exec rake install`.
|
38
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
37
39
|
|
38
40
|
## Contributing
|
39
41
|
|
data/Rakefile
CHANGED
data/lib/sanscript/detect.rb
CHANGED
@@ -49,8 +49,6 @@ module Sanscript
|
|
49
49
|
:RE_KOLKATA_ONLY, :RE_ITRANS_ONLY, :RE_SLP1_ONLY, :RE_VELTHUIS_ONLY,
|
50
50
|
:RE_ITRANS_OR_VELTHUIS_ONLY, :RE_HARVARD_KYOTO, :RE_CONTROL_BLOCK
|
51
51
|
|
52
|
-
module_function
|
53
|
-
|
54
52
|
# @!method detect_scheme(text)
|
55
53
|
# Attempts to detect the encoding scheme of the provided string.
|
56
54
|
#
|
@@ -70,5 +68,23 @@ module Sanscript
|
|
70
68
|
extend Ruby2x
|
71
69
|
end
|
72
70
|
# :nocov:
|
71
|
+
|
72
|
+
# Rust FFI
|
73
|
+
class << self
  begin
    require "ffi"
    extend FFI::Library

    # Pick the platform's shared library explicitly. The glob can match any
    # `libsanscript.*` file left in rust/ by the build (possibly non-loadable
    # artifacts), and the order of Dir.glob results is not guaranteed, so
    # taking `.first` may select the wrong file or nil.
    rust_library = Dir.glob(File.join(GEM_ROOT, "rust/libsanscript.*")).find do |path|
      File.extname(path) == ".#{FFI::Platform::LIBSUFFIX}"
    end
    # Raising LoadError routes a missing library to the pure-Ruby fallback below.
    raise LoadError, "sanscript Rust library not found" unless rust_library

    ffi_lib rust_library
    attach_function :_rust_detect, :detect, [:string], :int

    # Scheme symbols indexed by the integer code returned from the native
    # `detect` function; index 0 (nil) means "no scheme matched".
    RUST_SCHEMES = [
      nil,
      *%i[devanagari bengali gurmukhi gujarati oriya tamil telugu kannada malayalam iast kolkata itrans slp1 velthuis hk],
    ].freeze
    private_constant :RUST_SCHEMES

    # Detects the transliteration scheme of +text+ using the Rust extension.
    #
    # @param text [#to_str] a string of Sanskrit text
    # @return [Symbol, nil] the Symbol of the scheme, or nil if no match
    def rust_detect_scheme(text)
      # to_str mirrors ruby_detect_scheme's coercion and keeps nil from being
      # handed to the native function as a NULL pointer.
      RUST_SCHEMES[_rust_detect(text.to_str)]
    end

    alias detect_scheme rust_detect_scheme
  rescue LoadError
    # ffi or the compiled library is unavailable: use the Ruby implementation.
    alias detect_scheme ruby_detect_scheme
  end
end
|
73
89
|
end
|
74
90
|
end
|
@@ -9,7 +9,7 @@ module Sanscript
|
|
9
9
|
#
|
10
10
|
# @param text [String] a string of Sanskrit text
|
11
11
|
# @return [Symbol, nil] the Symbol of the scheme, or nil if no match
|
12
|
-
def
|
12
|
+
def ruby_detect_scheme(text)
|
13
13
|
text = text.to_str.gsub(RE_CONTROL_BLOCK, "")
|
14
14
|
|
15
15
|
# Brahmic schemes are all within a specific range of code points.
|
@@ -8,7 +8,7 @@ module Sanscript
|
|
8
8
|
#
|
9
9
|
# @param text [String] a string of Sanskrit text
|
10
10
|
# @return [Symbol, nil] the Symbol of the scheme, or nil if no match
|
11
|
-
def
|
11
|
+
def ruby_detect_scheme(text)
|
12
12
|
text = text.to_str.gsub(RE_CONTROL_BLOCK, "")
|
13
13
|
|
14
14
|
# rubocop:disable Style/CaseEquality
|
data/lib/sanscript/version.rb
CHANGED
data/rust/Cargo.lock
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
[root]
|
2
|
+
name = "sanscript"
|
3
|
+
version = "0.1.0"
|
4
|
+
dependencies = [
|
5
|
+
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
6
|
+
"libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
|
7
|
+
"regex 0.1.73 (registry+https://github.com/rust-lang/crates.io-index)",
|
8
|
+
]
|
9
|
+
|
10
|
+
[[package]]
|
11
|
+
name = "aho-corasick"
|
12
|
+
version = "0.5.2"
|
13
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
14
|
+
dependencies = [
|
15
|
+
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
16
|
+
]
|
17
|
+
|
18
|
+
[[package]]
|
19
|
+
name = "kernel32-sys"
|
20
|
+
version = "0.2.2"
|
21
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
22
|
+
dependencies = [
|
23
|
+
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
24
|
+
"winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
25
|
+
]
|
26
|
+
|
27
|
+
[[package]]
|
28
|
+
name = "lazy_static"
|
29
|
+
version = "0.2.1"
|
30
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
31
|
+
|
32
|
+
[[package]]
|
33
|
+
name = "libc"
|
34
|
+
version = "0.2.15"
|
35
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
36
|
+
|
37
|
+
[[package]]
|
38
|
+
name = "memchr"
|
39
|
+
version = "0.1.11"
|
40
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
41
|
+
dependencies = [
|
42
|
+
"libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
|
43
|
+
]
|
44
|
+
|
45
|
+
[[package]]
|
46
|
+
name = "regex"
|
47
|
+
version = "0.1.73"
|
48
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
49
|
+
dependencies = [
|
50
|
+
"aho-corasick 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
51
|
+
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
52
|
+
"regex-syntax 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
53
|
+
"thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
54
|
+
"utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
55
|
+
]
|
56
|
+
|
57
|
+
[[package]]
|
58
|
+
name = "regex-syntax"
|
59
|
+
version = "0.3.4"
|
60
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
61
|
+
|
62
|
+
[[package]]
|
63
|
+
name = "thread-id"
|
64
|
+
version = "2.0.0"
|
65
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
66
|
+
dependencies = [
|
67
|
+
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
68
|
+
"libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
|
69
|
+
]
|
70
|
+
|
71
|
+
[[package]]
|
72
|
+
name = "thread_local"
|
73
|
+
version = "0.2.6"
|
74
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
75
|
+
dependencies = [
|
76
|
+
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
77
|
+
]
|
78
|
+
|
79
|
+
[[package]]
|
80
|
+
name = "utf8-ranges"
|
81
|
+
version = "0.1.3"
|
82
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
83
|
+
|
84
|
+
[[package]]
|
85
|
+
name = "winapi"
|
86
|
+
version = "0.2.8"
|
87
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
88
|
+
|
89
|
+
[[package]]
|
90
|
+
name = "winapi-build"
|
91
|
+
version = "0.1.1"
|
92
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
93
|
+
|
data/rust/Cargo.toml
ADDED
data/rust/extconf.rb
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "mkmf"
|
3
|
+
# Build the Rust detection library when a Rust toolchain is available.
# `system` returns nil/false when the command is missing or fails.
if system("cargo --version") && system("rustc --version")
  # create_makefile runs mkmf's usual setup; its generated Makefile is then
  # replaced with one that delegates the build to cargo.
  create_makefile("sanscript")
  File.write("Makefile", "all:\n\tcargo build --release\n\nclean:\n\trm -rf target\n\ninstall:\n\tmv target/release/libsanscript.* .\n\trm -rf target\n")
else
  # Without cargo/rustc nothing can be compiled. Write a Makefile whose targets
  # do nothing so the build step still has something to run; the library then
  # falls back to its pure-Ruby scheme detection at load time.
  warn "sanscript: cargo/rustc not found; skipping the Rust extension build."
  File.write("Makefile", "all:\n\nclean:\n\ninstall:\n")
end
|
data/rust/src/lib.rs
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
extern crate libc;
|
2
|
+
#[macro_use] extern crate lazy_static;
|
3
|
+
extern crate regex;
|
4
|
+
|
5
|
+
use libc::{c_char, int32_t};
|
6
|
+
use std::ffi::CStr;
|
7
|
+
use std::str;
|
8
|
+
use regex::Regex;
|
9
|
+
|
10
|
+
#[no_mangle]
|
11
|
+
pub extern fn detect(s: *const c_char) -> int32_t {
|
12
|
+
let c_str = unsafe {
|
13
|
+
assert!(!s.is_null());
|
14
|
+
|
15
|
+
CStr::from_ptr(s)
|
16
|
+
};
|
17
|
+
|
18
|
+
lazy_static! {
|
19
|
+
// # Match escaped control characters
|
20
|
+
static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|\##|\#})").unwrap();
|
21
|
+
|
22
|
+
// # Match ##...## or {#...#} control blocks.
|
23
|
+
static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();
|
24
|
+
|
25
|
+
// Match any character in the block of Brahmic scripts
|
26
|
+
// between Devanagari and Malayalam.
|
27
|
+
static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
|
28
|
+
|
29
|
+
// Match on special Roman characters
|
30
|
+
static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
|
31
|
+
|
32
|
+
// Match on Kolkata-specific Roman characters
|
33
|
+
static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
|
34
|
+
|
35
|
+
// Match on ITRANS-only
|
36
|
+
static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
|
37
|
+
|
38
|
+
// Match on SLP1-only characters and bigrams
|
39
|
+
static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();
|
40
|
+
|
41
|
+
// Match on Velthuis-only characters
|
42
|
+
static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();
|
43
|
+
|
44
|
+
// Match on chars shared by ITRANS and Velthuis
|
45
|
+
static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();
|
46
|
+
|
47
|
+
// Match on characters available in Harvard-Kyoto
|
48
|
+
static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();
|
49
|
+
}
|
50
|
+
|
51
|
+
let r_replaced_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(c_str.to_str().unwrap(), "");
|
52
|
+
let r_str = &RE_CONTROL_BLOCK.replace_all(r_replaced_str, "");
|
53
|
+
|
54
|
+
// Brahmic schemes are all within a specific range of code points.
|
55
|
+
let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
|
56
|
+
if brahmic_match != None {
|
57
|
+
let brahmic_match = brahmic_match.unwrap();
|
58
|
+
let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as u32;
|
59
|
+
|
60
|
+
if brahmic_codepoint < 0x0980 {
|
61
|
+
return 1;
|
62
|
+
} else if brahmic_codepoint < 0x0A00 {
|
63
|
+
return 2;
|
64
|
+
} else if brahmic_codepoint < 0x0A80 {
|
65
|
+
return 3;
|
66
|
+
} else if brahmic_codepoint < 0x0B00 {
|
67
|
+
return 4;
|
68
|
+
} else if brahmic_codepoint < 0x0B80 {
|
69
|
+
return 5;
|
70
|
+
} else if brahmic_codepoint < 0x0C00 {
|
71
|
+
return 6;
|
72
|
+
} else if brahmic_codepoint < 0x0C80 {
|
73
|
+
return 7;
|
74
|
+
} else if brahmic_codepoint < 0x0D00 {
|
75
|
+
return 8;
|
76
|
+
} else {
|
77
|
+
return 9;
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
// Romanizations
|
82
|
+
if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
|
83
|
+
if RE_KOLKATA_ONLY.is_match(r_str) {
|
84
|
+
return 11;
|
85
|
+
} else {
|
86
|
+
return 10;
|
87
|
+
}
|
88
|
+
} else if RE_ITRANS_ONLY.is_match(r_str) {
|
89
|
+
return 12;
|
90
|
+
} else if RE_SLP1_ONLY.is_match(r_str) {
|
91
|
+
return 13;
|
92
|
+
} else if RE_VELTHUIS_ONLY.is_match(r_str) {
|
93
|
+
return 14;
|
94
|
+
} else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
|
95
|
+
return 12;
|
96
|
+
} else if RE_HARVARD_KYOTO.is_match(r_str) {
|
97
|
+
return 15;
|
98
|
+
}
|
99
|
+
return 0;
|
100
|
+
}
|
data/sanscript.gemspec
CHANGED
@@ -18,6 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.bindir = "exe"
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
20
|
spec.require_paths = ["lib"]
|
21
|
+
spec.extensions = Dir["rust/extconf.rb"]
|
21
22
|
|
22
23
|
spec.required_ruby_version = "~> 2.2"
|
23
24
|
|
@@ -25,9 +26,12 @@ Gem::Specification.new do |spec|
|
|
25
26
|
spec.add_development_dependency "rake", "~> 11.2"
|
26
27
|
spec.add_development_dependency "rspec", "~> 3.5"
|
27
28
|
spec.add_development_dependency "codeclimate-test-reporter", "~> 0.6"
|
29
|
+
spec.add_development_dependency "rubocop", "~> 0.41"
|
30
|
+
spec.add_development_dependency "rubocop-rspec", "~> 1.5"
|
28
31
|
spec.add_development_dependency "pry", "~> 0.10"
|
29
32
|
spec.add_development_dependency "benchmark-ips", "~> 2.6"
|
30
33
|
spec.add_development_dependency "yard", "~> 0.9"
|
31
34
|
|
32
|
-
spec.add_runtime_dependency "ragabash", "~> 0.
|
35
|
+
spec.add_runtime_dependency "ragabash", "~> 0.2"
|
36
|
+
spec.add_runtime_dependency "ffi"
|
33
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Bellefleur
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,34 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.6'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.41'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.41'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop-rspec
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.5'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.5'
|
69
97
|
- !ruby/object:Gem::Dependency
|
70
98
|
name: pry
|
71
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -114,19 +142,34 @@ dependencies:
|
|
114
142
|
requirements:
|
115
143
|
- - "~>"
|
116
144
|
- !ruby/object:Gem::Version
|
117
|
-
version: '0.
|
145
|
+
version: '0.2'
|
118
146
|
type: :runtime
|
119
147
|
prerelease: false
|
120
148
|
version_requirements: !ruby/object:Gem::Requirement
|
121
149
|
requirements:
|
122
150
|
- - "~>"
|
123
151
|
- !ruby/object:Gem::Version
|
124
|
-
version: '0.
|
152
|
+
version: '0.2'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: ffi
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
type: :runtime
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - ">="
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0'
|
125
167
|
description:
|
126
168
|
email:
|
127
169
|
- nomoon@phoebus.ca
|
128
170
|
executables: []
|
129
|
-
extensions:
|
171
|
+
extensions:
|
172
|
+
- rust/extconf.rb
|
130
173
|
extra_rdoc_files: []
|
131
174
|
files:
|
132
175
|
- ".codeclimate.yml"
|
@@ -150,6 +193,10 @@ files:
|
|
150
193
|
- lib/sanscript/transliterate.rb
|
151
194
|
- lib/sanscript/transliterate/schemes.rb
|
152
195
|
- lib/sanscript/version.rb
|
196
|
+
- rust/Cargo.lock
|
197
|
+
- rust/Cargo.toml
|
198
|
+
- rust/extconf.rb
|
199
|
+
- rust/src/lib.rs
|
153
200
|
- sanscript.gemspec
|
154
201
|
homepage: https://github.com/ubcsanskrit/sanscript.rb
|
155
202
|
licenses:
|