sanscript 0.6.2 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 59fade123e6369dd8e65f40aef416b1f3ceb6246
4
- data.tar.gz: 4c010e1ca0999f576a8485d385e825fd3cd98a3e
3
+ metadata.gz: 78a9a4f83c7ebeabdf0a4ce0b02c0efbe2bb99bd
4
+ data.tar.gz: 1e16e165ab8587163be05d449301c182d70b8037
5
5
  SHA512:
6
- metadata.gz: 9d2fc3ef1c990703a8f4c5a2e69f6eb535dca84600b375cd030cbc8e1bb30baa2447c7de4d0634afb3303389dd24b5e3a144bce9ac6f8b883768473d8b8d5cc9
7
- data.tar.gz: 98a910e4a950ad13ae0501cb3eb9ca841a4f049a42ce46989e6f12ff7d2ad1105a5083b4ae4496878bd6f001b9f29fa1a7323833c42c7234cc23193a6edc82b5
6
+ metadata.gz: 0354c6bb4d13b5aca33b73febb968f72dfaa843a7c6231fc30482f66d04bd8d239250a2aa0f0dbc957e4fef35d46173800319cd8583e8433cb9a1127e192d519
7
+ data.tar.gz: f0131f094b3a4e88cb7bba19955c27f1337b4ea7f2d1ca419e5bb8cd61f2546b515f52c2f60b44df1bdc91494f5cd0cb3e9f7a52b4c39b680039db7a23ba6bfb
data/.gitignore CHANGED
@@ -10,3 +10,4 @@
10
10
  /lib/librusty_sanscript.*
11
11
  /target
12
12
  mkmf.log
13
+ Cargo.lock
@@ -3,12 +3,12 @@ language: ruby
3
3
  rvm:
4
4
  - 2.3.1
5
5
  - 2.2
6
- - 2.4.0-preview1
6
+ - 2.4.0-preview2
7
7
  matrix:
8
8
  allow_failures:
9
- - rvm: 2.4.0-preview1
9
+ - rvm: 2.4.0-preview2
10
10
 
11
- before_install: gem install bundler -v 1.12.5
11
+ before_install: gem install bundler -v 1.13.1
12
12
  script: bundle exec rake spec SPEC_OPTS="--format p"
13
13
  addons:
14
14
  code_climate:
data/Cargo.toml CHANGED
@@ -1,16 +1,13 @@
1
1
  [package]
2
2
  name = "rusty_sanscript"
3
- version = "0.2.0"
3
+ version = "0.3.0"
4
4
  authors = ["Tim Bellefleur <nomoon@phoebus.ca>"]
5
5
  publish = false
6
6
 
7
7
  [dependencies]
8
8
  lazy_static = "^0.2.1"
9
- ruru = "^0.7.8"
10
- unicode-normalization = "^0.1.2"
11
- regex = "^0.1.73"
12
-
13
- [package.metadata.thermite]
9
+ ruby-sys = "^0.2.13"
10
+ regex = "^0.1.77"
14
11
 
15
12
  [lib]
16
- crate-type = ["dylib"]
13
+ crate-type = ["cdylib"]
data/Rakefile CHANGED
@@ -1,25 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
  require "bundler/gem_tasks"
3
3
 
4
- # Add enhanced optional_build task into Thermite
5
4
  require "thermite/tasks"
6
- module Thermite
7
- class BetterTasks < Tasks
8
- def initialize(options = {})
9
- super
10
- desc "Run thermite:build task or download binaries, but skip without fail if unavailable."
11
- task "thermite:optional_build" do
12
- if cargo
13
- Rake::Task["thermite:build"].invoke
14
- elsif !download_binary
15
- puts "Rust and downloadable binaries are not available, skipping."
16
- end
17
- end
18
- end
19
- end
20
- end
21
- Thermite::BetterTasks.new
22
- task default: :"thermite:optional_build"
5
+ Thermite::Tasks.new(optional_rust_extension: true)
6
+ task default: :"thermite:build"
23
7
 
24
8
  # Ensure missing RSpec development dependency doesn't kill gem install.
25
9
  begin
@@ -2,6 +2,7 @@
2
2
  require "ragabash"
3
3
 
4
4
  require "sanscript/version"
5
+ require "sanscript/rust"
5
6
  require "sanscript/exceptions"
6
7
  require "sanscript/detect"
7
8
  require "sanscript/transliterate"
@@ -9,6 +10,14 @@ require "sanscript/benchmark"
9
10
 
10
11
  # Sanscript.rb detection/transliteration module for Sanskrit.
11
12
  module Sanscript
13
+ # :nocov:
14
+ if RUST_AVAILABLE && ENV["SANSCRIPT_NO_RUST"].nil?
15
+ rust_enable!
16
+ else
17
+ rust_disable!
18
+ end
19
+ # :nocov:
20
+
12
21
  module_function
13
22
 
14
23
  # Attempts to detect the encoding scheme of the provided string.
@@ -56,29 +65,4 @@ module Sanscript
56
65
  end
57
66
  Transliterate.transliterate(text, from, to, opts)
58
67
  end
59
-
60
- # Override
61
- # :nocov:
62
- begin
63
- require "fiddle"
64
- require "thermite/config"
65
-
66
- toplevel_dir = File.dirname(File.dirname(__FILE__))
67
- config = Thermite::Config.new(cargo_project_path: toplevel_dir, ruby_project_path: toplevel_dir)
68
- library = Fiddle.dlopen(config.ruby_extension_path)
69
- module ::RustySanscriptDetect; end # rubocop:disable Style/ClassAndModuleChildren
70
- func = Fiddle::Function.new(library["init_rusty_sanscript"],
71
- [], Fiddle::TYPE_VOIDP)
72
- func.call
73
- module Detect
74
- extend ::RustySanscriptDetect
75
- class << self
76
- alias detect_scheme rust_detect_scheme
77
- end
78
- end
79
- RUST_ENABLED = true
80
- rescue Fiddle::DLError
81
- RUST_ENABLED = false
82
- end
83
- # :nocov:
84
68
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
- #:nocov:
3
2
 
3
+ # :nocov:
4
4
  begin
5
5
  require "benchmark/ips"
6
6
  rescue LoadError
@@ -10,6 +10,7 @@ rescue LoadError
10
10
  end
11
11
  end
12
12
  end
13
+ # :nocov:
13
14
 
14
15
  module Sanscript
15
16
  # Benchmark/testing module.
@@ -17,7 +18,7 @@ module Sanscript
17
18
  module_function
18
19
 
19
20
  # Runs benchmark-ips test on detection methods.
20
- def detect!
21
+ def detect!(time = 2, warmup = 1)
21
22
  deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
22
23
  malayalam_string = "നാനാശാസ്ത്രസുഭാഷിതാമൃതരസൈഃ ശ്രോത്രോത്സവം കുര്വതാം യേഷാം യാന്തി ദിനാനി പണ്ഡിതജനവ്യായാമഖിന്നാത്മനാമ് തേഷാം ജന്മ ച ജീവിതം ച സുകൃതം തൈര് ഏവ ഭൂര് ഭൂഷിതാ ശേഷൈഹ് കിം പശുവദ് വിവേകരഹിതൈര് ഭൂഭാരഭൂതൈര് നരഃ"
23
24
  iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
@@ -25,21 +26,21 @@ module Sanscript
25
26
  hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
26
27
 
27
28
  ::Benchmark.ips do |x|
28
- x.config(time: 5, warmup: 1)
29
+ x.config(time: time, warmup: warmup)
29
30
  x.report("Detect Devanagari") do
30
- raise unless Sanscript::Detect.detect_scheme(deva_string) == :devanagari
31
+ Sanscript::Detect.detect_scheme(deva_string)
31
32
  end
32
33
  x.report("Detect Malayalam") do
33
- raise unless Sanscript::Detect.detect_scheme(malayalam_string) == :malayalam
34
+ Sanscript::Detect.detect_scheme(malayalam_string)
34
35
  end
35
36
  x.report("Detect IAST") do
36
- raise unless Sanscript::Detect.detect_scheme(iast_string) == :iast
37
+ Sanscript::Detect.detect_scheme(iast_string)
37
38
  end
38
39
  x.report("Detect SLP1") do
39
- raise unless Sanscript::Detect.detect_scheme(slp1_string) == :slp1
40
+ Sanscript::Detect.detect_scheme(slp1_string)
40
41
  end
41
42
  x.report("Detect HK") do
42
- raise unless Sanscript::Detect.detect_scheme(hk_string) == :hk
43
+ Sanscript::Detect.detect_scheme(hk_string)
43
44
  end
44
45
  x.compare!
45
46
  end
@@ -47,13 +48,13 @@ module Sanscript
47
48
  end
48
49
 
49
50
  # Runs benchmark-ips test on roman-source transliteration methods.
50
- def transliterate_roman!
51
+ def transliterate_roman!(time = 2, warmup = 1)
51
52
  iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
52
53
  slp1_string = "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"
53
54
  hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
54
55
 
55
56
  ::Benchmark.ips do |x|
56
- x.config(time: 3, warmup: 2)
57
+ x.config(time: time, warmup: warmup)
57
58
 
58
59
  x.report("IAST==>Devanagari") do
59
60
  Sanscript.transliterate(iast_string, :iast, :devanagari)
@@ -88,11 +89,11 @@ module Sanscript
88
89
  end
89
90
 
90
91
  # Runs benchmark-ips test on brahmic-source transliteration methods.
91
- def transliterate_brahmic!
92
+ def transliterate_brahmic!(time = 2, warmup = 1)
92
93
  deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
93
94
 
94
95
  ::Benchmark.ips do |x|
95
- x.config(time: 5, warmup: 2)
96
+ x.config(time: time, warmup: warmup)
96
97
  x.report("Devanagari==>IAST") do
97
98
  Sanscript.transliterate(deva_string, :devanagari, :iast)
98
99
  end
@@ -53,8 +53,9 @@ module Sanscript
53
53
  # Attempts to detect the encoding scheme of the provided string.
54
54
  #
55
55
  # Uses the most efficient implementation for your ruby version
56
- # (either {Ruby2x#ruby_detect_scheme} or {Ruby24#ruby_detect_scheme}) or
57
- # the Rust native extension if available.
56
+ # (either {Ruby2x#ruby_detect_scheme} or {Ruby24#ruby_detect_scheme})
57
+ # at first, which may then be overridden by the Rust native extension
58
+ # (see {Sanscript#rust_enable!} and {Sanscript#rust_disable!}).
58
59
  #
59
60
  # @param text [String] a string of Sanskrit text
60
61
  # @return [Symbol, nil] the Symbol of the scheme, or nil if no match
@@ -68,9 +69,13 @@ module Sanscript
68
69
  require "sanscript/detect/ruby2x"
69
70
  extend Ruby2x
70
71
  end
72
+ # :nocov:
71
73
  class << self
72
74
  alias detect_scheme ruby_detect_scheme
73
75
  end
74
- # :nocov:
76
+
77
+ if defined?(Rust) && Rust.instance_methods.include?(:rust_detect_scheme)
78
+ extend Rust
79
+ end
75
80
  end
76
81
  end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+ module Sanscript
3
+ begin
4
+ require "thermite/fiddle"
5
+
6
+ Thermite::Fiddle.load_module("init_rusty_sanscript",
7
+ cargo_project_path: GEM_ROOT,
8
+ ruby_project_path: GEM_ROOT)
9
+ #:nocov:#
10
+ RUST_AVAILABLE = true
11
+ rescue Fiddle::DLError
12
+ RUST_AVAILABLE = false
13
+ #:nocov:#
14
+ end
15
+
16
+ module_function
17
+
18
+ # @return [bool] the enabled status of the Rust extension
19
+ def rust_enabled?
20
+ @rust_enabled
21
+ end
22
+
23
+ # Turns on Rust extension, if available.
24
+ # @return [bool] the enabled status of the Rust extension
25
+ def rust_enable!
26
+ if RUST_AVAILABLE
27
+ Detect.module_eval do
28
+ class << self
29
+ alias_method :detect_scheme, :rust_detect_scheme
30
+ end
31
+ end
32
+ @rust_enabled = true
33
+ end
34
+ @rust_enabled
35
+ end
36
+
37
+ # Turns off Rust native extension.
38
+ # @return [bool] the enabled status of the Rust extension
39
+ def rust_disable!
40
+ Detect.module_eval do
41
+ class << self
42
+ alias_method :detect_scheme, :ruby_detect_scheme
43
+ end
44
+ end
45
+ @rust_enabled = false
46
+ end
47
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
  module Sanscript
3
3
  # The version number
4
- VERSION = "0.6.2"
4
+ VERSION = "0.7.0"
5
5
 
6
6
  GEM_ROOT = Pathname.new(File.realpath(File.join(__dir__, "..", "..")))
7
7
  private_constant :GEM_ROOT
@@ -22,16 +22,16 @@ Gem::Specification.new do |spec|
22
22
 
23
23
  spec.required_ruby_version = "~> 2.2"
24
24
 
25
- spec.add_development_dependency "bundler", "~> 1.12"
26
- spec.add_development_dependency "rake", "~> 11.2"
25
+ spec.add_development_dependency "bundler", "~> 1.13"
26
+ spec.add_development_dependency "rake", "~> 11.3"
27
27
  spec.add_development_dependency "rspec", "~> 3.5"
28
28
  spec.add_development_dependency "codeclimate-test-reporter", "~> 0.6"
29
- spec.add_development_dependency "rubocop", "~> 0.41"
30
- spec.add_development_dependency "rubocop-rspec", "~> 1.5"
29
+ spec.add_development_dependency "rubocop", "~> 0.43"
30
+ spec.add_development_dependency "rubocop-rspec", "~> 1.7"
31
31
  spec.add_development_dependency "pry", "~> 0.10"
32
- spec.add_development_dependency "benchmark-ips", "~> 2.6"
32
+ spec.add_development_dependency "benchmark-ips", "~> 2.7"
33
33
  spec.add_development_dependency "yard", "~> 0.9"
34
34
 
35
35
  spec.add_runtime_dependency "ragabash", "~> 0.2"
36
- spec.add_runtime_dependency "thermite", "~> 0.5"
36
+ spec.add_runtime_dependency "thermite", "~> 0.7"
37
37
  end
@@ -0,0 +1,96 @@
1
+ pub mod ruby;
2
+
3
+ use regex::Regex;
4
+
5
+ //
6
+ // Initialize all of the generic static variables
7
+ //
8
+ lazy_static! {
9
+ // Match escaped control characters
10
+ static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|##|#\})").unwrap();
11
+
12
+ // Match ##...## or {#...#} control blocks.
13
+ static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();
14
+
15
+ // Match any character in the block of Brahmic scripts
16
+ // between Devanagari and Malayalam.
17
+ static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
18
+
19
+ // Match on special Roman characters
20
+ static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
21
+
22
+ // Match on Kolkata-specific Roman characters
23
+ static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
24
+
25
+ // Match on ITRANS-only
26
+ static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
27
+
28
+ // Match on SLP1-only characters and bigrams
29
+ static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();
30
+
31
+ // Match on Velthuis-only characters
32
+ static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();
33
+
34
+ // Match on chars shared by ITRANS and Velthuis
35
+ static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();
36
+
37
+ // Match on characters available in Harvard-Kyoto
38
+ static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();
39
+ }
40
+
41
+ //
42
+ // The function itself!
43
+ //
44
+ #[no_mangle]
45
+ pub extern fn detect_scheme(s: &str) -> usize {
46
+ // Clean-up string of control characters.
47
+ let r_str = &RE_CONTROL_BLOCK.replace_all(
48
+ &RE_ESCAPED_CONTROL_CHAR.replace_all(s, ""), "");
49
+
50
+ // Brahmic schemes are all within a specific range of code points.
51
+ let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
52
+ if brahmic_match != None {
53
+ let brahmic_match = brahmic_match.unwrap();
54
+ let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as usize;
55
+
56
+ if brahmic_codepoint < 0x0980 {
57
+ return 1; // Devanagari
58
+ } else if brahmic_codepoint < 0x0A00 {
59
+ return 2; // Bengali
60
+ } else if brahmic_codepoint < 0x0A80 {
61
+ return 3; // Gurmukhi
62
+ } else if brahmic_codepoint < 0x0B00 {
63
+ return 4; // Gujarati
64
+ } else if brahmic_codepoint < 0x0B80 {
65
+ return 5; // Oriya
66
+ } else if brahmic_codepoint < 0x0C00 {
67
+ return 6; // Tamil
68
+ } else if brahmic_codepoint < 0x0C80 {
69
+ return 7; // Telugu
70
+ } else if brahmic_codepoint < 0x0D00 {
71
+ return 8; // Kannada
72
+ } else {
73
+ return 9; // Malayalam
74
+ }
75
+ }
76
+
77
+ // Romanizations
78
+ if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
79
+ if RE_KOLKATA_ONLY.is_match(r_str) {
80
+ return 11; // Kolkata
81
+ } else {
82
+ return 10; // IAST
83
+ }
84
+ } else if RE_ITRANS_ONLY.is_match(r_str) {
85
+ return 12; // ITRANS
86
+ } else if RE_SLP1_ONLY.is_match(r_str) {
87
+ return 13; // SLP1
88
+ } else if RE_VELTHUIS_ONLY.is_match(r_str) {
89
+ return 14; // Velthuis
90
+ } else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
91
+ return 12; // ITRANS
92
+ } else if RE_HARVARD_KYOTO.is_match(r_str) {
93
+ return 15; // HK
94
+ }
95
+ return 0; // Unknown
96
+ }
@@ -0,0 +1,42 @@
1
+ use detect;
2
+
3
+ use rb;
4
+ use rb::{CallbackPtr, Value};
5
+
6
+ // Initialize all of the Ruby-specific static variables.
7
+ lazy_static! {
8
+ // Lookup table for Ruby Results
9
+ static ref RUBY_RESULTS: [Value; 16] = [
10
+ rb::RB_NIL, // 0
11
+ rb::str_to_sym("devanagari"), // 1
12
+ rb::str_to_sym("bengali") , // 2
13
+ rb::str_to_sym("gurmukhi"), // 3
14
+ rb::str_to_sym("gujarati"), // 4
15
+ rb::str_to_sym("oriya"), // 5
16
+ rb::str_to_sym("tamil"), // 6
17
+ rb::str_to_sym("telugu"), // 7
18
+ rb::str_to_sym("kannada"), // 8
19
+ rb::str_to_sym("malayalam"), // 9
20
+ rb::str_to_sym("iast"), // 10
21
+ rb::str_to_sym("kolkata"), // 11
22
+ rb::str_to_sym("itrans"), // 12
23
+ rb::str_to_sym("slp1"), // 13
24
+ rb::str_to_sym("velthuis"), // 14
25
+ rb::str_to_sym("hk") // 15
26
+ ];
27
+ }
28
+
29
+ fn rbstr_detect_scheme(_rself: Value, s: Value) -> Value {
30
+ let r_str = rb::rbstr_to_str(&s);
31
+ let result = detect::detect_scheme(r_str);
32
+ return RUBY_RESULTS[result];
33
+ }
34
+
35
+ #[no_mangle]
36
+ pub extern fn init_rusty_sanscript() {
37
+ let m_sanscript = rb::define_module("Sanscript");
38
+ let m_detect = rb::define_module_under(&m_sanscript, "Detect");
39
+ let m_rust = rb::define_module_under(&m_detect, "Rust");
40
+ rb::define_method(&m_rust, "rust_detect_scheme",
41
+ rbstr_detect_scheme as CallbackPtr, 1);
42
+ }
data/src/lib.rs CHANGED
@@ -1,122 +1,7 @@
1
1
  #[macro_use] extern crate lazy_static;
2
- #[macro_use] extern crate ruru;
3
- extern crate unicode_normalization;
2
+ extern crate ruby_sys;
4
3
  extern crate regex;
5
4
 
6
- use ruru::{AnyObject, Class, NilClass, Symbol, RString};
7
- use ruru::traits::Object;
8
- #[allow(unused_imports)] use unicode_normalization::UnicodeNormalization;
9
- use regex::Regex;
10
-
11
- methods! {
12
- Class,
13
- _itself,
14
- fn detect(s: RString) -> AnyObject {
15
- lazy_static! {
16
- // # Match escaped control characters
17
- static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|##|#\})").unwrap();
18
-
19
- // # Match ##...## or {#...#} control blocks.
20
- static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();
21
-
22
- // Match any character in the block of Brahmic scripts
23
- // between Devanagari and Malayalam.
24
- static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
25
-
26
- // Match on special Roman characters
27
- static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
28
-
29
- // Match on Kolkata-specific Roman characters
30
- static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
31
-
32
- // Match on ITRANS-only
33
- static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
34
-
35
- // Match on SLP1-only characters and bigrams
36
- static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();
37
-
38
- // Match on Velthuis-only characters
39
- static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();
40
-
41
- // Match on chars shared by ITRANS and Velthuis
42
- static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();
43
-
44
- // Match on characters available in Harvard-Kyoto
45
- static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();
46
-
47
- static ref _DEVANAGARI: Symbol = Symbol::new("devanagari");
48
- static ref _BENGALI: Symbol = Symbol::new("bengali");
49
- static ref _GURMUKHI: Symbol = Symbol::new("gurmukhi");
50
- static ref _GUJARATI: Symbol = Symbol::new("gujarati");
51
- static ref _ORIYA: Symbol = Symbol::new("oriya");
52
- static ref _TAMIL: Symbol = Symbol::new("tamil");
53
- static ref _TELUGU: Symbol = Symbol::new("telugu");
54
- static ref _KANNADA: Symbol = Symbol::new("kannada");
55
- static ref _MALAYALAM: Symbol = Symbol::new("malayalam");
56
- static ref _IAST: Symbol = Symbol::new("iast");
57
- static ref _KOLKATA: Symbol = Symbol::new("kolkata");
58
- static ref _ITRANS: Symbol = Symbol::new("itrans");
59
- static ref _SLP1: Symbol = Symbol::new("slp1");
60
- static ref _VELTHUIS: Symbol = Symbol::new("velthuis");
61
- static ref _HK: Symbol = Symbol::new("hk");
62
- static ref _NIL: NilClass = NilClass::new();
63
- }
64
-
65
- let r_replaced_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(&s.to_string(), "");
66
- let r_str = &RE_CONTROL_BLOCK.replace_all(r_replaced_str, "");
67
-
68
- // Brahmic schemes are all within a specific range of code points.
69
- let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
70
- if brahmic_match != None {
71
- let brahmic_match = brahmic_match.unwrap();
72
- let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as u32;
73
-
74
- if brahmic_codepoint < 0x0980 {
75
- return _DEVANAGARI.to_any_object();
76
- } else if brahmic_codepoint < 0x0A00 {
77
- return _BENGALI.to_any_object();
78
- } else if brahmic_codepoint < 0x0A80 {
79
- return _GURMUKHI.to_any_object();
80
- } else if brahmic_codepoint < 0x0B00 {
81
- return _GUJARATI.to_any_object();
82
- } else if brahmic_codepoint < 0x0B80 {
83
- return _ORIYA.to_any_object();
84
- } else if brahmic_codepoint < 0x0C00 {
85
- return _TAMIL.to_any_object();
86
- } else if brahmic_codepoint < 0x0C80 {
87
- return _TELUGU.to_any_object();
88
- } else if brahmic_codepoint < 0x0D00 {
89
- return _KANNADA.to_any_object();
90
- } else {
91
- return _MALAYALAM.to_any_object();
92
- }
93
- }
94
-
95
- // Romanizations
96
- if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
97
- if RE_KOLKATA_ONLY.is_match(r_str) {
98
- return _KOLKATA.to_any_object();
99
- } else {
100
- return _IAST.to_any_object();
101
- }
102
- } else if RE_ITRANS_ONLY.is_match(r_str) {
103
- return _ITRANS.to_any_object();
104
- } else if RE_SLP1_ONLY.is_match(r_str) {
105
- return _SLP1.to_any_object();
106
- } else if RE_VELTHUIS_ONLY.is_match(r_str) {
107
- return _VELTHUIS.to_any_object();
108
- } else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
109
- return _ITRANS.to_any_object();
110
- } else if RE_HARVARD_KYOTO.is_match(r_str) {
111
- return _HK.to_any_object();
112
- }
113
- return _NIL.to_any_object();
114
- }
115
- }
116
-
117
- #[no_mangle]
118
- pub extern fn init_rusty_sanscript() {
119
- Class::from_existing("RustySanscriptDetect").define(|itself| {
120
- itself.def("rust_detect_scheme", detect);
121
- });
122
- }
5
+ mod rb;
6
+ // Exports a Sanscript::Detect::Rust module
7
+ pub mod detect;
@@ -0,0 +1,48 @@
1
+ use std::ffi::{CStr, CString};
2
+
3
+ use ruby_sys::{class, string, symbol, util};
4
+ use ruby_sys::types::c_char;
5
+ use ruby_sys::value::RubySpecialConsts::Nil;
6
+
7
+ pub use ruby_sys::types::{CallbackPtr, Value};
8
+ pub const RB_NIL: Value = Value { value: Nil as usize };
9
+
10
+ //
11
+ // Helper functions for dealing with Ruby and CStrings
12
+ //
13
+
14
+ #[inline(always)]
15
+ fn str_to_cstrp(s: &str) -> *const c_char {
16
+ CString::new(s).unwrap().as_ptr()
17
+ }
18
+
19
+ #[inline]
20
+ pub fn rbstr_to_str<'a>(s: *const Value) -> &'a str {
21
+ unsafe {
22
+ let c_strp = string::rb_string_value_cstr(s);
23
+ CStr::from_ptr(c_strp).to_str().unwrap()
24
+ }
25
+ }
26
+
27
+ pub fn str_to_sym(s: &str) -> Value {
28
+ let c_strp = str_to_cstrp(s);
29
+ unsafe {
30
+ let id = util::rb_intern(c_strp);
31
+ symbol::rb_id2sym(id)
32
+ }
33
+ }
34
+
35
+ pub fn define_module(name: &str) -> Value {
36
+ let c_strp = str_to_cstrp(name);
37
+ unsafe { class::rb_define_module(c_strp) }
38
+ }
39
+
40
+ pub fn define_module_under(parent: &Value, name: &str) -> Value {
41
+ let c_strp = str_to_cstrp(name);
42
+ unsafe { class::rb_define_module_under(*parent, c_strp) }
43
+ }
44
+
45
+ pub fn define_method(module: &Value, name: &str, method: CallbackPtr, argc: i32) {
46
+ let c_strp = str_to_cstrp(name);
47
+ unsafe { class::rb_define_method(*module, c_strp, method, argc) }
48
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanscript
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Bellefleur
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-08-16 00:00:00.000000000 Z
11
+ date: 2016-09-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.12'
19
+ version: '1.13'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.12'
26
+ version: '1.13'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '11.2'
33
+ version: '11.3'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '11.2'
40
+ version: '11.3'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -72,28 +72,28 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0.41'
75
+ version: '0.43'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0.41'
82
+ version: '0.43'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: rubocop-rspec
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '1.5'
89
+ version: '1.7'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '1.5'
96
+ version: '1.7'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: pry
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -114,14 +114,14 @@ dependencies:
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: '2.6'
117
+ version: '2.7'
118
118
  type: :development
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: '2.6'
124
+ version: '2.7'
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: yard
127
127
  requirement: !ruby/object:Gem::Requirement
@@ -156,14 +156,14 @@ dependencies:
156
156
  requirements:
157
157
  - - "~>"
158
158
  - !ruby/object:Gem::Version
159
- version: '0.5'
159
+ version: '0.7'
160
160
  type: :runtime
161
161
  prerelease: false
162
162
  version_requirements: !ruby/object:Gem::Requirement
163
163
  requirements:
164
164
  - - "~>"
165
165
  - !ruby/object:Gem::Version
166
- version: '0.5'
166
+ version: '0.7'
167
167
  description:
168
168
  email:
169
169
  - nomoon@phoebus.ca
@@ -178,7 +178,6 @@ files:
178
178
  - ".rubocop.yml"
179
179
  - ".travis.yml"
180
180
  - CODE_OF_CONDUCT.md
181
- - Cargo.lock
182
181
  - Cargo.toml
183
182
  - Gemfile
184
183
  - LICENSE.txt
@@ -192,11 +191,15 @@ files:
192
191
  - lib/sanscript/detect/ruby24.rb
193
192
  - lib/sanscript/detect/ruby2x.rb
194
193
  - lib/sanscript/exceptions.rb
194
+ - lib/sanscript/rust.rb
195
195
  - lib/sanscript/transliterate.rb
196
196
  - lib/sanscript/transliterate/schemes.rb
197
197
  - lib/sanscript/version.rb
198
198
  - sanscript.gemspec
199
+ - src/detect/mod.rs
200
+ - src/detect/ruby.rs
199
201
  - src/lib.rs
202
+ - src/rb.rs
200
203
  homepage: https://github.com/ubcsanskrit/sanscript.rb
201
204
  licenses:
202
205
  - MIT
data/Cargo.lock DELETED
@@ -1,115 +0,0 @@
1
- [root]
2
- name = "rusty_sanscript"
3
- version = "0.2.0"
4
- dependencies = [
5
- "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
6
- "regex 0.1.73 (registry+https://github.com/rust-lang/crates.io-index)",
7
- "ruru 0.7.8 (registry+https://github.com/rust-lang/crates.io-index)",
8
- "unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
9
- ]
10
-
11
- [[package]]
12
- name = "aho-corasick"
13
- version = "0.5.2"
14
- source = "registry+https://github.com/rust-lang/crates.io-index"
15
- dependencies = [
16
- "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
17
- ]
18
-
19
- [[package]]
20
- name = "kernel32-sys"
21
- version = "0.2.2"
22
- source = "registry+https://github.com/rust-lang/crates.io-index"
23
- dependencies = [
24
- "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
25
- "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
26
- ]
27
-
28
- [[package]]
29
- name = "lazy_static"
30
- version = "0.2.1"
31
- source = "registry+https://github.com/rust-lang/crates.io-index"
32
-
33
- [[package]]
34
- name = "libc"
35
- version = "0.2.15"
36
- source = "registry+https://github.com/rust-lang/crates.io-index"
37
-
38
- [[package]]
39
- name = "memchr"
40
- version = "0.1.11"
41
- source = "registry+https://github.com/rust-lang/crates.io-index"
42
- dependencies = [
43
- "libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
44
- ]
45
-
46
- [[package]]
47
- name = "regex"
48
- version = "0.1.73"
49
- source = "registry+https://github.com/rust-lang/crates.io-index"
50
- dependencies = [
51
- "aho-corasick 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
52
- "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
53
- "regex-syntax 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
54
- "thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
55
- "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
56
- ]
57
-
58
- [[package]]
59
- name = "regex-syntax"
60
- version = "0.3.4"
61
- source = "registry+https://github.com/rust-lang/crates.io-index"
62
-
63
- [[package]]
64
- name = "ruby-sys"
65
- version = "0.2.6"
66
- source = "registry+https://github.com/rust-lang/crates.io-index"
67
- dependencies = [
68
- "libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
69
- ]
70
-
71
- [[package]]
72
- name = "ruru"
73
- version = "0.7.8"
74
- source = "registry+https://github.com/rust-lang/crates.io-index"
75
- dependencies = [
76
- "ruby-sys 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
77
- ]
78
-
79
- [[package]]
80
- name = "thread-id"
81
- version = "2.0.0"
82
- source = "registry+https://github.com/rust-lang/crates.io-index"
83
- dependencies = [
84
- "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
85
- "libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
86
- ]
87
-
88
- [[package]]
89
- name = "thread_local"
90
- version = "0.2.6"
91
- source = "registry+https://github.com/rust-lang/crates.io-index"
92
- dependencies = [
93
- "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
94
- ]
95
-
96
- [[package]]
97
- name = "unicode-normalization"
98
- version = "0.1.2"
99
- source = "registry+https://github.com/rust-lang/crates.io-index"
100
-
101
- [[package]]
102
- name = "utf8-ranges"
103
- version = "0.1.3"
104
- source = "registry+https://github.com/rust-lang/crates.io-index"
105
-
106
- [[package]]
107
- name = "winapi"
108
- version = "0.2.8"
109
- source = "registry+https://github.com/rust-lang/crates.io-index"
110
-
111
- [[package]]
112
- name = "winapi-build"
113
- version = "0.1.1"
114
- source = "registry+https://github.com/rust-lang/crates.io-index"
115
-