sanscript 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 59fade123e6369dd8e65f40aef416b1f3ceb6246
4
- data.tar.gz: 4c010e1ca0999f576a8485d385e825fd3cd98a3e
3
+ metadata.gz: 78a9a4f83c7ebeabdf0a4ce0b02c0efbe2bb99bd
4
+ data.tar.gz: 1e16e165ab8587163be05d449301c182d70b8037
5
5
  SHA512:
6
- metadata.gz: 9d2fc3ef1c990703a8f4c5a2e69f6eb535dca84600b375cd030cbc8e1bb30baa2447c7de4d0634afb3303389dd24b5e3a144bce9ac6f8b883768473d8b8d5cc9
7
- data.tar.gz: 98a910e4a950ad13ae0501cb3eb9ca841a4f049a42ce46989e6f12ff7d2ad1105a5083b4ae4496878bd6f001b9f29fa1a7323833c42c7234cc23193a6edc82b5
6
+ metadata.gz: 0354c6bb4d13b5aca33b73febb968f72dfaa843a7c6231fc30482f66d04bd8d239250a2aa0f0dbc957e4fef35d46173800319cd8583e8433cb9a1127e192d519
7
+ data.tar.gz: f0131f094b3a4e88cb7bba19955c27f1337b4ea7f2d1ca419e5bb8cd61f2546b515f52c2f60b44df1bdc91494f5cd0cb3e9f7a52b4c39b680039db7a23ba6bfb
data/.gitignore CHANGED
@@ -10,3 +10,4 @@
10
10
  /lib/librusty_sanscript.*
11
11
  /target
12
12
  mkmf.log
13
+ Cargo.lock
@@ -3,12 +3,12 @@ language: ruby
3
3
  rvm:
4
4
  - 2.3.1
5
5
  - 2.2
6
- - 2.4.0-preview1
6
+ - 2.4.0-preview2
7
7
  matrix:
8
8
  allow_failures:
9
- - rvm: 2.4.0-preview1
9
+ - rvm: 2.4.0-preview2
10
10
 
11
- before_install: gem install bundler -v 1.12.5
11
+ before_install: gem install bundler -v 1.13.1
12
12
  script: bundle exec rake spec SPEC_OPTS="--format p"
13
13
  addons:
14
14
  code_climate:
data/Cargo.toml CHANGED
@@ -1,16 +1,13 @@
1
1
  [package]
2
2
  name = "rusty_sanscript"
3
- version = "0.2.0"
3
+ version = "0.3.0"
4
4
  authors = ["Tim Bellefleur <nomoon@phoebus.ca>"]
5
5
  publish = false
6
6
 
7
7
  [dependencies]
8
8
  lazy_static = "^0.2.1"
9
- ruru = "^0.7.8"
10
- unicode-normalization = "^0.1.2"
11
- regex = "^0.1.73"
12
-
13
- [package.metadata.thermite]
9
+ ruby-sys = "^0.2.13"
10
+ regex = "^0.1.77"
14
11
 
15
12
  [lib]
16
- crate-type = ["dylib"]
13
+ crate-type = ["cdylib"]
data/Rakefile CHANGED
@@ -1,25 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
  require "bundler/gem_tasks"
3
3
 
4
- # Add enhanced optional_build task into Thermite
5
4
  require "thermite/tasks"
6
- module Thermite
7
- class BetterTasks < Tasks
8
- def initialize(options = {})
9
- super
10
- desc "Run thermite:build task or download binaries, but skip without fail if unavailable."
11
- task "thermite:optional_build" do
12
- if cargo
13
- Rake::Task["thermite:build"].invoke
14
- elsif !download_binary
15
- puts "Rust and downloadable binaries are not available, skipping."
16
- end
17
- end
18
- end
19
- end
20
- end
21
- Thermite::BetterTasks.new
22
- task default: :"thermite:optional_build"
5
+ Thermite::Tasks.new(optional_rust_extension: true)
6
+ task default: :"thermite:build"
23
7
 
24
8
  # Ensure missing RSpec development dependency doesn't kill gem install.
25
9
  begin
@@ -2,6 +2,7 @@
2
2
  require "ragabash"
3
3
 
4
4
  require "sanscript/version"
5
+ require "sanscript/rust"
5
6
  require "sanscript/exceptions"
6
7
  require "sanscript/detect"
7
8
  require "sanscript/transliterate"
@@ -9,6 +10,14 @@ require "sanscript/benchmark"
9
10
 
10
11
  # Sanscript.rb detection/transliteration module for Sanskrit.
11
12
  module Sanscript
13
+ # :nocov:
14
+ if RUST_AVAILABLE && ENV["SANSCRIPT_NO_RUST"].nil?
15
+ rust_enable!
16
+ else
17
+ rust_disable!
18
+ end
19
+ # :nocov:
20
+
12
21
  module_function
13
22
 
14
23
  # Attempts to detect the encoding scheme of the provided string.
@@ -56,29 +65,4 @@ module Sanscript
56
65
  end
57
66
  Transliterate.transliterate(text, from, to, opts)
58
67
  end
59
-
60
- # Override
61
- # :nocov:
62
- begin
63
- require "fiddle"
64
- require "thermite/config"
65
-
66
- toplevel_dir = File.dirname(File.dirname(__FILE__))
67
- config = Thermite::Config.new(cargo_project_path: toplevel_dir, ruby_project_path: toplevel_dir)
68
- library = Fiddle.dlopen(config.ruby_extension_path)
69
- module ::RustySanscriptDetect; end # rubocop:disable Style/ClassAndModuleChildren
70
- func = Fiddle::Function.new(library["init_rusty_sanscript"],
71
- [], Fiddle::TYPE_VOIDP)
72
- func.call
73
- module Detect
74
- extend ::RustySanscriptDetect
75
- class << self
76
- alias detect_scheme rust_detect_scheme
77
- end
78
- end
79
- RUST_ENABLED = true
80
- rescue Fiddle::DLError
81
- RUST_ENABLED = false
82
- end
83
- # :nocov:
84
68
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
- #:nocov:
3
2
 
3
+ # :nocov:
4
4
  begin
5
5
  require "benchmark/ips"
6
6
  rescue LoadError
@@ -10,6 +10,7 @@ rescue LoadError
10
10
  end
11
11
  end
12
12
  end
13
+ # :nocov:
13
14
 
14
15
  module Sanscript
15
16
  # Benchmark/testing module.
@@ -17,7 +18,7 @@ module Sanscript
17
18
  module_function
18
19
 
19
20
  # Runs benchmark-ips test on detection methods.
20
- def detect!
21
+ def detect!(time = 2, warmup = 1)
21
22
  deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
22
23
  malayalam_string = "നാനാശാസ്ത്രസുഭാഷിതാമൃതരസൈഃ ശ്രോത്രോത്സവം കുര്വതാം യേഷാം യാന്തി ദിനാനി പണ്ഡിതജനവ്യായാമഖിന്നാത്മനാമ് തേഷാം ജന്മ ച ജീവിതം ച സുകൃതം തൈര് ഏവ ഭൂര് ഭൂഷിതാ ശേഷൈഹ് കിം പശുവദ് വിവേകരഹിതൈര് ഭൂഭാരഭൂതൈര് നരഃ"
23
24
  iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
@@ -25,21 +26,21 @@ module Sanscript
25
26
  hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
26
27
 
27
28
  ::Benchmark.ips do |x|
28
- x.config(time: 5, warmup: 1)
29
+ x.config(time: time, warmup: warmup)
29
30
  x.report("Detect Devanagari") do
30
- raise unless Sanscript::Detect.detect_scheme(deva_string) == :devanagari
31
+ Sanscript::Detect.detect_scheme(deva_string)
31
32
  end
32
33
  x.report("Detect Malayalam") do
33
- raise unless Sanscript::Detect.detect_scheme(malayalam_string) == :malayalam
34
+ Sanscript::Detect.detect_scheme(malayalam_string)
34
35
  end
35
36
  x.report("Detect IAST") do
36
- raise unless Sanscript::Detect.detect_scheme(iast_string) == :iast
37
+ Sanscript::Detect.detect_scheme(iast_string)
37
38
  end
38
39
  x.report("Detect SLP1") do
39
- raise unless Sanscript::Detect.detect_scheme(slp1_string) == :slp1
40
+ Sanscript::Detect.detect_scheme(slp1_string)
40
41
  end
41
42
  x.report("Detect HK") do
42
- raise unless Sanscript::Detect.detect_scheme(hk_string) == :hk
43
+ Sanscript::Detect.detect_scheme(hk_string)
43
44
  end
44
45
  x.compare!
45
46
  end
@@ -47,13 +48,13 @@ module Sanscript
47
48
  end
48
49
 
49
50
  # Runs benchmark-ips test on roman-source transliteration methods.
50
- def transliterate_roman!
51
+ def transliterate_roman!(time = 2, warmup = 1)
51
52
  iast_string = "nānāśāstrasubhāṣitāmṛtarasaiḥ śrotrotsavaṃ kurvatāṃ yeṣāṃ yānti dināni paṇḍitajanavyāyāmakhinnātmanām teṣāṃ janma ca jīvitaṃ ca sukṛtaṃ tair eva bhūr bhūṣitā śeṣaih kiṃ paśuvad vivekarahitair bhūbhārabhūtair naraḥ"
52
53
  slp1_string = "nAnASAstrasuBAzitAmftarasEH SrotrotsavaM kurvatAM yezAM yAnti dinAni paRqitajanavyAyAmaKinnAtmanAm tezAM janma ca jIvitaM ca sukftaM tEr eva BUr BUzitA SezEh kiM paSuvad vivekarahitEr BUBAraBUtEr naraH"
53
54
  hk_string = "nAnAzAstrasubhASitAmRtarasaiH zrotrotsavaM kurvatAM yeSAM yAnti dinAni paNDitajanavyAyAmakhinnAtmanAm teSAM janma ca jIvitaM ca sukRtaM tair eva bhUr bhUSitA zeSaih kiM pazuvad vivekarahitair bhUbhArabhUtair naraH"
54
55
 
55
56
  ::Benchmark.ips do |x|
56
- x.config(time: 3, warmup: 2)
57
+ x.config(time: time, warmup: warmup)
57
58
 
58
59
  x.report("IAST==>Devanagari") do
59
60
  Sanscript.transliterate(iast_string, :iast, :devanagari)
@@ -88,11 +89,11 @@ module Sanscript
88
89
  end
89
90
 
90
91
  # Runs benchmark-ips test on brahmic-source transliteration methods.
91
- def transliterate_brahmic!
92
+ def transliterate_brahmic!(time = 2, warmup = 1)
92
93
  deva_string = "नानाशास्त्रसुभाषितामृतरसैः श्रोत्रोत्सवं कुर्वतां येषां यान्ति दिनानि पण्डितजनव्यायामखिन्नात्मनाम् तेषां जन्म च जीवितं च सुकृतं तैर् एव भूर् भूषिता शेषैह् किं पशुवद् विवेकरहितैर् भूभारभूतैर् नरः"
93
94
 
94
95
  ::Benchmark.ips do |x|
95
- x.config(time: 5, warmup: 2)
96
+ x.config(time: time, warmup: warmup)
96
97
  x.report("Devanagari==>IAST") do
97
98
  Sanscript.transliterate(deva_string, :devanagari, :iast)
98
99
  end
@@ -53,8 +53,9 @@ module Sanscript
53
53
  # Attempts to detect the encoding scheme of the provided string.
54
54
  #
55
55
  # Uses the most efficient implementation for your ruby version
56
- # (either {Ruby2x#ruby_detect_scheme} or {Ruby24#ruby_detect_scheme}) or
57
- # the Rust native extension if available.
56
+ # (either {Ruby2x#ruby_detect_scheme} or {Ruby24#ruby_detect_scheme})
57
+ # at first, which may then be overridden by the Rust native extension
58
+ # (see {Sanscript#rust_enable!} and {Sanscript#rust_disable!}).
58
59
  #
59
60
  # @param text [String] a string of Sanskrit text
60
61
  # @return [Symbol, nil] the Symbol of the scheme, or nil if no match
@@ -68,9 +69,13 @@ module Sanscript
68
69
  require "sanscript/detect/ruby2x"
69
70
  extend Ruby2x
70
71
  end
72
+ # :nocov:
71
73
  class << self
72
74
  alias detect_scheme ruby_detect_scheme
73
75
  end
74
- # :nocov:
76
+
77
+ if defined?(Rust) && Rust.instance_methods.include?(:rust_detect_scheme)
78
+ extend Rust
79
+ end
75
80
  end
76
81
  end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+ module Sanscript
3
+ begin
4
+ require "thermite/fiddle"
5
+
6
+ Thermite::Fiddle.load_module("init_rusty_sanscript",
7
+ cargo_project_path: GEM_ROOT,
8
+ ruby_project_path: GEM_ROOT)
9
+ #:nocov:#
10
+ RUST_AVAILABLE = true
11
+ rescue Fiddle::DLError
12
+ RUST_AVAILABLE = false
13
+ #:nocov:#
14
+ end
15
+
16
+ module_function
17
+
18
+ # @return [bool] the enabled status of the Rust extension
19
+ def rust_enabled?
20
+ @rust_enabled
21
+ end
22
+
23
+ # Turns on Rust extension, if available.
24
+ # @return [bool] the enabled status of the Rust extension
25
+ def rust_enable!
26
+ if RUST_AVAILABLE
27
+ Detect.module_eval do
28
+ class << self
29
+ alias_method :detect_scheme, :rust_detect_scheme
30
+ end
31
+ end
32
+ @rust_enabled = true
33
+ end
34
+ @rust_enabled
35
+ end
36
+
37
+ # Turns off Rust native extension.
38
+ # @return [bool] the enabled status of the Rust extension
39
+ def rust_disable!
40
+ Detect.module_eval do
41
+ class << self
42
+ alias_method :detect_scheme, :ruby_detect_scheme
43
+ end
44
+ end
45
+ @rust_enabled = false
46
+ end
47
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
  module Sanscript
3
3
  # The version number
4
- VERSION = "0.6.2"
4
+ VERSION = "0.7.0"
5
5
 
6
6
  GEM_ROOT = Pathname.new(File.realpath(File.join(__dir__, "..", "..")))
7
7
  private_constant :GEM_ROOT
@@ -22,16 +22,16 @@ Gem::Specification.new do |spec|
22
22
 
23
23
  spec.required_ruby_version = "~> 2.2"
24
24
 
25
- spec.add_development_dependency "bundler", "~> 1.12"
26
- spec.add_development_dependency "rake", "~> 11.2"
25
+ spec.add_development_dependency "bundler", "~> 1.13"
26
+ spec.add_development_dependency "rake", "~> 11.3"
27
27
  spec.add_development_dependency "rspec", "~> 3.5"
28
28
  spec.add_development_dependency "codeclimate-test-reporter", "~> 0.6"
29
- spec.add_development_dependency "rubocop", "~> 0.41"
30
- spec.add_development_dependency "rubocop-rspec", "~> 1.5"
29
+ spec.add_development_dependency "rubocop", "~> 0.43"
30
+ spec.add_development_dependency "rubocop-rspec", "~> 1.7"
31
31
  spec.add_development_dependency "pry", "~> 0.10"
32
- spec.add_development_dependency "benchmark-ips", "~> 2.6"
32
+ spec.add_development_dependency "benchmark-ips", "~> 2.7"
33
33
  spec.add_development_dependency "yard", "~> 0.9"
34
34
 
35
35
  spec.add_runtime_dependency "ragabash", "~> 0.2"
36
- spec.add_runtime_dependency "thermite", "~> 0.5"
36
+ spec.add_runtime_dependency "thermite", "~> 0.7"
37
37
  end
@@ -0,0 +1,96 @@
1
+ pub mod ruby;
2
+
3
+ use regex::Regex;
4
+
5
+ //
6
+ // Initialize all of the generic static variables
7
+ //
8
+ lazy_static! {
9
+ // Match escaped control characters
10
+ static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|##|#\})").unwrap();
11
+
12
+ // Match ##...## or {#...#} control blocks.
13
+ static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();
14
+
15
+ // Match any character in the block of Brahmic scripts
16
+ // between Devanagari and Malayalam.
17
+ static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
18
+
19
+ // Match on special Roman characters
20
+ static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
21
+
22
+ // Match on Kolkata-specific Roman characters
23
+ static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
24
+
25
+ // Match on ITRANS-only
26
+ static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
27
+
28
+ // Match on SLP1-only characters and bigrams
29
+ static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();
30
+
31
+ // Match on Velthuis-only characters
32
+ static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();
33
+
34
+ // Match on chars shared by ITRANS and Velthuis
35
+ static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();
36
+
37
+ // Match on characters available in Harvard-Kyoto
38
+ static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();
39
+ }
40
+
41
+ //
42
+ // The function itself!
43
+ //
44
+ #[no_mangle]
45
+ pub extern fn detect_scheme(s: &str) -> usize {
46
+ // Clean-up string of control characters.
47
+ let r_str = &RE_CONTROL_BLOCK.replace_all(
48
+ &RE_ESCAPED_CONTROL_CHAR.replace_all(s, ""), "");
49
+
50
+ // Brahmic schemes are all within a specific range of code points.
51
+ let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
52
+ if brahmic_match != None {
53
+ let brahmic_match = brahmic_match.unwrap();
54
+ let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as usize;
55
+
56
+ if brahmic_codepoint < 0x0980 {
57
+ return 1; // Devanagari
58
+ } else if brahmic_codepoint < 0x0A00 {
59
+ return 2; // Bengali
60
+ } else if brahmic_codepoint < 0x0A80 {
61
+ return 3; // Gurmukhi
62
+ } else if brahmic_codepoint < 0x0B00 {
63
+ return 4; // Gujarati
64
+ } else if brahmic_codepoint < 0x0B80 {
65
+ return 5; // Oriya
66
+ } else if brahmic_codepoint < 0x0C00 {
67
+ return 6; // Tamil
68
+ } else if brahmic_codepoint < 0x0C80 {
69
+ return 7; // Telugu
70
+ } else if brahmic_codepoint < 0x0D00 {
71
+ return 8; // Kannada
72
+ } else {
73
+ return 9; // Malayalam
74
+ }
75
+ }
76
+
77
+ // Romanizations
78
+ if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
79
+ if RE_KOLKATA_ONLY.is_match(r_str) {
80
+ return 11; // Kolkata
81
+ } else {
82
+ return 10; // IAST
83
+ }
84
+ } else if RE_ITRANS_ONLY.is_match(r_str) {
85
+ return 12; // ITRANS
86
+ } else if RE_SLP1_ONLY.is_match(r_str) {
87
+ return 13; // SLP1
88
+ } else if RE_VELTHUIS_ONLY.is_match(r_str) {
89
+ return 14; // Velthuis
90
+ } else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
91
+ return 12; // ITRANS
92
+ } else if RE_HARVARD_KYOTO.is_match(r_str) {
93
+ return 15; // HK
94
+ }
95
+ return 0; // Unknown
96
+ }
@@ -0,0 +1,42 @@
1
+ use detect;
2
+
3
+ use rb;
4
+ use rb::{CallbackPtr, Value};
5
+
6
+ // Initialize all of the Ruby-specific static variables.
7
+ lazy_static! {
8
+ // Lookup table for Ruby Results
9
+ static ref RUBY_RESULTS: [Value; 16] = [
10
+ rb::RB_NIL, // 0
11
+ rb::str_to_sym("devanagari"), // 1
12
+ rb::str_to_sym("bengali") , // 2
13
+ rb::str_to_sym("gurmukhi"), // 3
14
+ rb::str_to_sym("gujarati"), // 4
15
+ rb::str_to_sym("oriya"), // 5
16
+ rb::str_to_sym("tamil"), // 6
17
+ rb::str_to_sym("telugu"), // 7
18
+ rb::str_to_sym("kannada"), // 8
19
+ rb::str_to_sym("malayalam"), // 9
20
+ rb::str_to_sym("iast"), // 10
21
+ rb::str_to_sym("kolkata"), // 11
22
+ rb::str_to_sym("itrans"), // 12
23
+ rb::str_to_sym("slp1"), // 13
24
+ rb::str_to_sym("velthuis"), // 14
25
+ rb::str_to_sym("hk") // 15
26
+ ];
27
+ }
28
+
29
+ fn rbstr_detect_scheme(_rself: Value, s: Value) -> Value {
30
+ let r_str = rb::rbstr_to_str(&s);
31
+ let result = detect::detect_scheme(r_str);
32
+ return RUBY_RESULTS[result];
33
+ }
34
+
35
+ #[no_mangle]
36
+ pub extern fn init_rusty_sanscript() {
37
+ let m_sanscript = rb::define_module("Sanscript");
38
+ let m_detect = rb::define_module_under(&m_sanscript, "Detect");
39
+ let m_rust = rb::define_module_under(&m_detect, "Rust");
40
+ rb::define_method(&m_rust, "rust_detect_scheme",
41
+ rbstr_detect_scheme as CallbackPtr, 1);
42
+ }
data/src/lib.rs CHANGED
@@ -1,122 +1,7 @@
1
1
  #[macro_use] extern crate lazy_static;
2
- #[macro_use] extern crate ruru;
3
- extern crate unicode_normalization;
2
+ extern crate ruby_sys;
4
3
  extern crate regex;
5
4
 
6
- use ruru::{AnyObject, Class, NilClass, Symbol, RString};
7
- use ruru::traits::Object;
8
- #[allow(unused_imports)] use unicode_normalization::UnicodeNormalization;
9
- use regex::Regex;
10
-
11
- methods! {
12
- Class,
13
- _itself,
14
- fn detect(s: RString) -> AnyObject {
15
- lazy_static! {
16
- // # Match escaped control characters
17
- static ref RE_ESCAPED_CONTROL_CHAR: Regex = Regex::new(r"\\(?:\{#|##|#\})").unwrap();
18
-
19
- // # Match ##...## or {#...#} control blocks.
20
- static ref RE_CONTROL_BLOCK: Regex = Regex::new(r"##.*?##|\{#.*?#\}").unwrap();
21
-
22
- // Match any character in the block of Brahmic scripts
23
- // between Devanagari and Malayalam.
24
- static ref RE_BRAHMIC_RANGE: Regex = Regex::new(r"[\x{0900}-\x{0d7f}]").unwrap();
25
-
26
- // Match on special Roman characters
27
- static ref RE_IAST_OR_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]").unwrap();
28
-
29
- // Match on Kolkata-specific Roman characters
30
- static ref RE_KOLKATA_ONLY: Regex = Regex::new(r"(?i)[ēō]").unwrap();
31
-
32
- // Match on ITRANS-only
33
- static ref RE_ITRANS_ONLY: Regex = Regex::new(r"ee|oo|\^[iI]|RR[iI]|L[iI]|~N|N\^|Ch|chh|JN|sh|Sh|\.a").unwrap();
34
-
35
- // Match on SLP1-only characters and bigrams
36
- static ref RE_SLP1_ONLY: Regex = Regex::new(r"[fFxXEOCYwWqQPB]|kz|N[kg]|tT|dD|S[cn]|[aAiIuUeo]R|G[yr]").unwrap();
37
-
38
- // Match on Velthuis-only characters
39
- static ref RE_VELTHUIS_ONLY: Regex = Regex::new(r"\.[mhnrlntds]|\x22n|~s").unwrap();
40
-
41
- // Match on chars shared by ITRANS and Velthuis
42
- static ref RE_ITRANS_OR_VELTHUIS_ONLY: Regex = Regex::new(r"aa|ii|uu|~n").unwrap();
43
-
44
- // Match on characters available in Harvard-Kyoto
45
- static ref RE_HARVARD_KYOTO: Regex = Regex::new(r"[aAiIuUeoRMHkgGcjJTDNtdnpbmyrlvzSsh]").unwrap();
46
-
47
- static ref _DEVANAGARI: Symbol = Symbol::new("devanagari");
48
- static ref _BENGALI: Symbol = Symbol::new("bengali");
49
- static ref _GURMUKHI: Symbol = Symbol::new("gurmukhi");
50
- static ref _GUJARATI: Symbol = Symbol::new("gujarati");
51
- static ref _ORIYA: Symbol = Symbol::new("oriya");
52
- static ref _TAMIL: Symbol = Symbol::new("tamil");
53
- static ref _TELUGU: Symbol = Symbol::new("telugu");
54
- static ref _KANNADA: Symbol = Symbol::new("kannada");
55
- static ref _MALAYALAM: Symbol = Symbol::new("malayalam");
56
- static ref _IAST: Symbol = Symbol::new("iast");
57
- static ref _KOLKATA: Symbol = Symbol::new("kolkata");
58
- static ref _ITRANS: Symbol = Symbol::new("itrans");
59
- static ref _SLP1: Symbol = Symbol::new("slp1");
60
- static ref _VELTHUIS: Symbol = Symbol::new("velthuis");
61
- static ref _HK: Symbol = Symbol::new("hk");
62
- static ref _NIL: NilClass = NilClass::new();
63
- }
64
-
65
- let r_replaced_str = &RE_ESCAPED_CONTROL_CHAR.replace_all(&s.to_string(), "");
66
- let r_str = &RE_CONTROL_BLOCK.replace_all(r_replaced_str, "");
67
-
68
- // Brahmic schemes are all within a specific range of code points.
69
- let brahmic_match = RE_BRAHMIC_RANGE.find(r_str);
70
- if brahmic_match != None {
71
- let brahmic_match = brahmic_match.unwrap();
72
- let brahmic_codepoint = r_str.chars().nth(brahmic_match.0).unwrap() as u32;
73
-
74
- if brahmic_codepoint < 0x0980 {
75
- return _DEVANAGARI.to_any_object();
76
- } else if brahmic_codepoint < 0x0A00 {
77
- return _BENGALI.to_any_object();
78
- } else if brahmic_codepoint < 0x0A80 {
79
- return _GURMUKHI.to_any_object();
80
- } else if brahmic_codepoint < 0x0B00 {
81
- return _GUJARATI.to_any_object();
82
- } else if brahmic_codepoint < 0x0B80 {
83
- return _ORIYA.to_any_object();
84
- } else if brahmic_codepoint < 0x0C00 {
85
- return _TAMIL.to_any_object();
86
- } else if brahmic_codepoint < 0x0C80 {
87
- return _TELUGU.to_any_object();
88
- } else if brahmic_codepoint < 0x0D00 {
89
- return _KANNADA.to_any_object();
90
- } else {
91
- return _MALAYALAM.to_any_object();
92
- }
93
- }
94
-
95
- // Romanizations
96
- if RE_IAST_OR_KOLKATA_ONLY.is_match(r_str) {
97
- if RE_KOLKATA_ONLY.is_match(r_str) {
98
- return _KOLKATA.to_any_object();
99
- } else {
100
- return _IAST.to_any_object();
101
- }
102
- } else if RE_ITRANS_ONLY.is_match(r_str) {
103
- return _ITRANS.to_any_object();
104
- } else if RE_SLP1_ONLY.is_match(r_str) {
105
- return _SLP1.to_any_object();
106
- } else if RE_VELTHUIS_ONLY.is_match(r_str) {
107
- return _VELTHUIS.to_any_object();
108
- } else if RE_ITRANS_OR_VELTHUIS_ONLY.is_match(r_str) {
109
- return _ITRANS.to_any_object();
110
- } else if RE_HARVARD_KYOTO.is_match(r_str) {
111
- return _HK.to_any_object();
112
- }
113
- return _NIL.to_any_object();
114
- }
115
- }
116
-
117
- #[no_mangle]
118
- pub extern fn init_rusty_sanscript() {
119
- Class::from_existing("RustySanscriptDetect").define(|itself| {
120
- itself.def("rust_detect_scheme", detect);
121
- });
122
- }
5
+ mod rb;
6
+ // Exports a Sanscript::Detect::Rust module
7
+ pub mod detect;
@@ -0,0 +1,48 @@
1
+ use std::ffi::{CStr, CString};
2
+
3
+ use ruby_sys::{class, string, symbol, util};
4
+ use ruby_sys::types::c_char;
5
+ use ruby_sys::value::RubySpecialConsts::Nil;
6
+
7
+ pub use ruby_sys::types::{CallbackPtr, Value};
8
+ pub const RB_NIL: Value = Value { value: Nil as usize };
9
+
10
+ //
11
+ // Helper functions for dealing with Ruby and CStrings
12
+ //
13
+
14
+ #[inline(always)]
15
+ fn str_to_cstrp(s: &str) -> *const c_char {
16
+ CString::new(s).unwrap().as_ptr()
17
+ }
18
+
19
+ #[inline]
20
+ pub fn rbstr_to_str<'a>(s: *const Value) -> &'a str {
21
+ unsafe {
22
+ let c_strp = string::rb_string_value_cstr(s);
23
+ CStr::from_ptr(c_strp).to_str().unwrap()
24
+ }
25
+ }
26
+
27
+ pub fn str_to_sym(s: &str) -> Value {
28
+ let c_strp = str_to_cstrp(s);
29
+ unsafe {
30
+ let id = util::rb_intern(c_strp);
31
+ symbol::rb_id2sym(id)
32
+ }
33
+ }
34
+
35
+ pub fn define_module(name: &str) -> Value {
36
+ let c_strp = str_to_cstrp(name);
37
+ unsafe { class::rb_define_module(c_strp) }
38
+ }
39
+
40
+ pub fn define_module_under(parent: &Value, name: &str) -> Value {
41
+ let c_strp = str_to_cstrp(name);
42
+ unsafe { class::rb_define_module_under(*parent, c_strp) }
43
+ }
44
+
45
+ pub fn define_method(module: &Value, name: &str, method: CallbackPtr, argc: i32) {
46
+ let c_strp = str_to_cstrp(name);
47
+ unsafe { class::rb_define_method(*module, c_strp, method, argc) }
48
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanscript
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Bellefleur
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-08-16 00:00:00.000000000 Z
11
+ date: 2016-09-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.12'
19
+ version: '1.13'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.12'
26
+ version: '1.13'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '11.2'
33
+ version: '11.3'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '11.2'
40
+ version: '11.3'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -72,28 +72,28 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0.41'
75
+ version: '0.43'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0.41'
82
+ version: '0.43'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: rubocop-rspec
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '1.5'
89
+ version: '1.7'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '1.5'
96
+ version: '1.7'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: pry
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -114,14 +114,14 @@ dependencies:
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: '2.6'
117
+ version: '2.7'
118
118
  type: :development
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: '2.6'
124
+ version: '2.7'
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: yard
127
127
  requirement: !ruby/object:Gem::Requirement
@@ -156,14 +156,14 @@ dependencies:
156
156
  requirements:
157
157
  - - "~>"
158
158
  - !ruby/object:Gem::Version
159
- version: '0.5'
159
+ version: '0.7'
160
160
  type: :runtime
161
161
  prerelease: false
162
162
  version_requirements: !ruby/object:Gem::Requirement
163
163
  requirements:
164
164
  - - "~>"
165
165
  - !ruby/object:Gem::Version
166
- version: '0.5'
166
+ version: '0.7'
167
167
  description:
168
168
  email:
169
169
  - nomoon@phoebus.ca
@@ -178,7 +178,6 @@ files:
178
178
  - ".rubocop.yml"
179
179
  - ".travis.yml"
180
180
  - CODE_OF_CONDUCT.md
181
- - Cargo.lock
182
181
  - Cargo.toml
183
182
  - Gemfile
184
183
  - LICENSE.txt
@@ -192,11 +191,15 @@ files:
192
191
  - lib/sanscript/detect/ruby24.rb
193
192
  - lib/sanscript/detect/ruby2x.rb
194
193
  - lib/sanscript/exceptions.rb
194
+ - lib/sanscript/rust.rb
195
195
  - lib/sanscript/transliterate.rb
196
196
  - lib/sanscript/transliterate/schemes.rb
197
197
  - lib/sanscript/version.rb
198
198
  - sanscript.gemspec
199
+ - src/detect/mod.rs
200
+ - src/detect/ruby.rs
199
201
  - src/lib.rs
202
+ - src/rb.rs
200
203
  homepage: https://github.com/ubcsanskrit/sanscript.rb
201
204
  licenses:
202
205
  - MIT
data/Cargo.lock DELETED
@@ -1,115 +0,0 @@
1
- [root]
2
- name = "rusty_sanscript"
3
- version = "0.2.0"
4
- dependencies = [
5
- "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
6
- "regex 0.1.73 (registry+https://github.com/rust-lang/crates.io-index)",
7
- "ruru 0.7.8 (registry+https://github.com/rust-lang/crates.io-index)",
8
- "unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
9
- ]
10
-
11
- [[package]]
12
- name = "aho-corasick"
13
- version = "0.5.2"
14
- source = "registry+https://github.com/rust-lang/crates.io-index"
15
- dependencies = [
16
- "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
17
- ]
18
-
19
- [[package]]
20
- name = "kernel32-sys"
21
- version = "0.2.2"
22
- source = "registry+https://github.com/rust-lang/crates.io-index"
23
- dependencies = [
24
- "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
25
- "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
26
- ]
27
-
28
- [[package]]
29
- name = "lazy_static"
30
- version = "0.2.1"
31
- source = "registry+https://github.com/rust-lang/crates.io-index"
32
-
33
- [[package]]
34
- name = "libc"
35
- version = "0.2.15"
36
- source = "registry+https://github.com/rust-lang/crates.io-index"
37
-
38
- [[package]]
39
- name = "memchr"
40
- version = "0.1.11"
41
- source = "registry+https://github.com/rust-lang/crates.io-index"
42
- dependencies = [
43
- "libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
44
- ]
45
-
46
- [[package]]
47
- name = "regex"
48
- version = "0.1.73"
49
- source = "registry+https://github.com/rust-lang/crates.io-index"
50
- dependencies = [
51
- "aho-corasick 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
52
- "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
53
- "regex-syntax 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
54
- "thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
55
- "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
56
- ]
57
-
58
- [[package]]
59
- name = "regex-syntax"
60
- version = "0.3.4"
61
- source = "registry+https://github.com/rust-lang/crates.io-index"
62
-
63
- [[package]]
64
- name = "ruby-sys"
65
- version = "0.2.6"
66
- source = "registry+https://github.com/rust-lang/crates.io-index"
67
- dependencies = [
68
- "libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
69
- ]
70
-
71
- [[package]]
72
- name = "ruru"
73
- version = "0.7.8"
74
- source = "registry+https://github.com/rust-lang/crates.io-index"
75
- dependencies = [
76
- "ruby-sys 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
77
- ]
78
-
79
- [[package]]
80
- name = "thread-id"
81
- version = "2.0.0"
82
- source = "registry+https://github.com/rust-lang/crates.io-index"
83
- dependencies = [
84
- "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
85
- "libc 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)",
86
- ]
87
-
88
- [[package]]
89
- name = "thread_local"
90
- version = "0.2.6"
91
- source = "registry+https://github.com/rust-lang/crates.io-index"
92
- dependencies = [
93
- "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
94
- ]
95
-
96
- [[package]]
97
- name = "unicode-normalization"
98
- version = "0.1.2"
99
- source = "registry+https://github.com/rust-lang/crates.io-index"
100
-
101
- [[package]]
102
- name = "utf8-ranges"
103
- version = "0.1.3"
104
- source = "registry+https://github.com/rust-lang/crates.io-index"
105
-
106
- [[package]]
107
- name = "winapi"
108
- version = "0.2.8"
109
- source = "registry+https://github.com/rust-lang/crates.io-index"
110
-
111
- [[package]]
112
- name = "winapi-build"
113
- version = "0.1.1"
114
- source = "registry+https://github.com/rust-lang/crates.io-index"
115
-