what_you_say 0.6.7

Sign up to get free protection for your applications and to get access to all the features.
data/Cargo.toml ADDED
@@ -0,0 +1,7 @@
1
+ # This Cargo.toml is here to let external tools (IDEs, etc.) know that this is
2
+ # a Rust project. Your extension's dependencies should be added to the Cargo.toml
3
+ # in the ext/ directory.
4
+
5
+ [workspace]
6
+ members = ["ext/what_you_say"]
7
+ resolver = "2"
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 Garen J. Torikian
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,83 @@
1
+ # WhatYouSay
2
+
3
+ Quick and easy natural language detection wrapping the [lingua-rs Rust crate](https://github.com/pemistahl/lingua-rs). Instantly identify the source language of a piece of text.
4
+
5
+ ![What you say!!](https://user-images.githubusercontent.com/64050/224237944-ceb2570c-d544-474a-8c91-41433efdee43.png)
6
+
7
+ - Supports [75+ languages](https://github.com/pemistahl/lingua-rs/tree/main#3-which-languages-are-supported)
8
+ - Core library is written in Rust; this is a Ruby wrapper to it
9
+ - Lightweight, fast, and simple
10
+
11
+ ## Installation
12
+
13
+ Install the gem and add to the application's Gemfile by executing:
14
+
15
+ $ bundle add what_you_say
16
+
17
+ If bundler is not being used to manage dependencies, install the gem by executing:
18
+
19
+ $ gem install what_you_say
20
+
21
+ ## Usage
22
+
23
+ The method to call is `detect_language`.
24
+
25
+ Pass in the text whose language you want to detect:
26
+
27
+ ```ruby
28
+ require "what_you_say"
29
+
30
+ text = "Ĉu vi ne volas eklerni Esperanton? Bonvolu! Estas unu de la plej bonaj aferoj!"
31
+
32
+ result = WhatYouSay.new.detect_language(text)
33
+
34
+ assert_equal("epo", result.code)
34
+ assert_equal("Esperanto", result.eng_name)
36
+ ```
37
+
38
+ You also have the opportunity to `inspect` some output:
39
+
40
+ ```ruby
41
+ text = "Եվ ահա ես ստանում եմ մի զանգ պատահական տղայից"
42
+ WhatYouSay.new.detect_language(text).inspect
43
+ #=> #<WhatYouSay::Lang code="hye" eng_name="armenian">
44
+ ```
45
+
46
+ Not everything in life is perfect, and neither is this lib. Sometimes language detection will be wildly mistaken. You
47
+ can attempt to correct this by passing in an `allowlist` of supported languages:
48
+
49
+ ```ruby
50
+ text = "สวัสดี Rágis hello"
51
+ result = WhatYouSay.new.detect_language(text)
52
+
53
+ assert_equal("spanish", result.eng_name)
54
+
55
+ result = WhatYouSay.new(allowlist: ["English", "Thai"]).detect_language(text)
56
+
57
+ assert_equal("eng", result.code)
58
+ ```
59
+
60
+ If a language truly cannot be detected, the `Unknown` language type is returned:
61
+
62
+ ```ruby
63
+ text = "日本語"
64
+
65
+ result = WhatYouSay.new(allowlist: ["English", "Thai"]).detect_language(text)
66
+
67
+ assert_equal("???", result.code)
68
+ assert_equal("unknown", result.eng_name)
69
+ ```
70
+
71
+ ## Development
72
+
73
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake compile test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
74
+
75
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and merge that change into `main`.
76
+
77
+ ## Contributing
78
+
79
+ Bug reports and pull requests are welcome on GitHub at https://github.com/gjtorikian/what_you_say.
80
+
81
+ ## License
82
+
83
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,14 @@
1
+ [package]
2
+ name = "what_you_say"
3
+ version = "1.0.0"
4
+ edition = "2021"
5
+ rust-version = "1.75.0"
6
+ publish = false
7
+
8
+ [dependencies]
9
+ lingua = "1.6"
10
+ magnus = "0.6"
11
+
12
+ [lib]
13
+ name = "what_you_say"
14
+ crate-type = ["cdylib"]
@@ -0,0 +1,4 @@
1
+ require "mkmf"
2
+ require "rb_sys/mkmf"
3
+
4
+ create_rust_makefile("what_you_say/what_you_say")
@@ -0,0 +1,46 @@
1
+ use lingua::Language;
2
+
3
+ #[magnus::wrap(class = "WhatYouSay::Lang")]
4
+ pub struct WhatYouSayLang {
5
+ code: String,
6
+ eng_name: String,
7
+ }
8
+
9
+ // this is safe as WhatYouSayLang does not contain any Ruby types
10
+ unsafe impl magnus::IntoValueFromNative for WhatYouSayLang {}
11
+
12
+ impl WhatYouSayLang {
13
+ pub fn new(lang: Option<Language>) -> WhatYouSayLang {
14
+ match lang {
15
+ Some(lang) => WhatYouSayLang {
16
+ code: lang.iso_code_639_3().to_string(),
17
+ eng_name: lang.to_string(),
18
+ },
19
+ None => WhatYouSayLang {
20
+ code: "???".to_string(),
21
+ eng_name: "unknown".to_string(),
22
+ },
23
+ }
24
+ }
25
+ pub fn all() -> Vec<WhatYouSayLang> {
26
+ Language::all()
27
+ .into_iter()
28
+ .map(|lang| WhatYouSayLang::new(Some(lang)))
29
+ .collect::<Vec<_>>()
30
+ }
31
+
32
+ pub fn code(&self) -> &str {
33
+ self.code.as_str()
34
+ }
35
+
36
+ pub fn eng_name(&self) -> &str {
37
+ self.eng_name.as_str()
38
+ }
39
+
40
+ pub fn inspect(&self) -> String {
41
+ format!(
42
+ "#<WhatYouSay::Lang code=\"{0}\" eng_name=\"{1}\">",
43
+ self.code, self.eng_name
44
+ )
45
+ }
46
+ }
@@ -0,0 +1,88 @@
1
+ extern crate core;
2
+
3
+ use std::str::FromStr;
4
+
5
+ use lang::WhatYouSayLang;
6
+ use lingua::{Language, LanguageDetector, LanguageDetectorBuilder};
7
+
8
+ use magnus::{
9
+ define_class, exception, function, method, scan_args, Error, Module, Object, RArray, Value,
10
+ };
11
+
12
+ #[magnus::wrap(class = "WhatYouSay")]
13
+ struct WhatYouSay {
14
+ detector: LanguageDetector,
15
+ }
16
+
17
+ impl WhatYouSay {
18
+ fn new(args: &[Value]) -> Result<Self, magnus::Error> {
19
+ let args = scan_args::scan_args::<(), (), (), (), _, ()>(args)?;
20
+
21
+ let kwargs = scan_args::get_kwargs::<_, (), (Option<RArray>,), ()>(
22
+ args.keywords,
23
+ &[],
24
+ &["allowlist"],
25
+ )?;
26
+ let (rb_allowlist,) = kwargs.optional;
27
+
28
+ let mut builder = match rb_allowlist {
29
+ Some(languages) => {
30
+ let mut allowed_languages = vec![];
31
+ for allowed in languages.each() {
32
+ let allowed = match allowed {
33
+ Ok(allowed) => allowed.to_string(),
34
+ Err(_) => {
35
+ return Err(magnus::Error::new(
36
+ exception::runtime_error(),
37
+ format!("{allowed:?}"),
38
+ ))
39
+ }
40
+ };
41
+
42
+                     // if not Ok, the language name could not be found, so it is skipped
43
+ if let Ok(language) = Language::from_str(&allowed) {
44
+ allowed_languages.push(language)
45
+ }
46
+ }
47
+ LanguageDetectorBuilder::from_languages(&allowed_languages)
48
+ }
49
+ None => LanguageDetectorBuilder::from_all_languages(),
50
+ };
51
+
52
+ // FIXME: this doesn't seem to work when tests are run in parallel
53
+ // builder.with_preloaded_language_models();
54
+
55
+ let detector = builder.build();
56
+
57
+ Ok(WhatYouSay { detector })
58
+ }
59
+
60
+ pub fn detect_text(&self, rb_text: String) -> Result<WhatYouSayLang, magnus::Error> {
61
+ match self.detector.detect_language_of(rb_text) {
62
+ Some(lang) => {
63
+ let result = WhatYouSayLang::new(Some(lang));
64
+
65
+ Ok(result)
66
+ }
67
+ None => Ok(WhatYouSayLang::new(None)),
68
+ }
69
+ }
70
+ }
71
+
72
+ #[magnus::init]
73
+ fn init() -> Result<(), Error> {
74
+ let c_whatyousay = define_class("WhatYouSay", magnus::class::object())?;
75
+
76
+ c_whatyousay.define_singleton_method("new", function!(WhatYouSay::new, -1))?;
77
+ c_whatyousay.define_method("detect_text", method!(WhatYouSay::detect_text, 1))?;
78
+
79
+ let c_lang = c_whatyousay.define_class("Lang", magnus::class::object())?;
80
+ c_lang.define_singleton_method("all", function!(WhatYouSayLang::all, 0))?;
81
+ c_lang.define_method("code", method!(WhatYouSayLang::code, 0))?;
82
+ c_lang.define_method("eng_name", method!(WhatYouSayLang::eng_name, 0))?;
83
+ c_lang.define_method("inspect", method!(WhatYouSayLang::inspect, 0))?;
84
+
85
+ Ok(())
86
+ }
87
+
88
+ pub mod lang;
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ begin
4
+ # native precompiled gems package shared libraries in <gem_dir>/lib/what_you_say/<ruby_version>
5
+ # load the precompiled extension file
6
+ ruby_version = /\d+\.\d+/.match(RUBY_VERSION)
7
+ require_relative "#{ruby_version}/what_you_say"
8
+ rescue LoadError
9
+ # fall back to the extension compiled upon installation.
10
+ # use "require" instead of "require_relative" because non-native gems will place C extension files
11
+ # in Gem::BasicSpecification#extension_dir after compilation (during normal installation), which
12
+ # is in $LOAD_PATH but not necessarily relative to this file (see nokogiri#2300)
13
+ require "what_you_say/what_you_say"
14
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ class WhatYouSay
4
+ class Lang
5
+ end
6
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ class WhatYouSay
4
+ VERSION = "0.6.7"
5
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "what_you_say/extension"
4
+
5
+ require "what_you_say/lang"
6
+ require "what_you_say/version"
7
+
8
+ if ENV.fetch("DEBUG", false)
9
+ require "debug"
10
+ end
11
+
12
+ class WhatYouSay
13
+ def detect_language(text)
14
+ raise TypeError, "text must be a String; got a #{text.class}!" unless text.is_a?(String)
15
+ raise TypeError, "text must be UTF-8 encoded; got #{text.encoding}!" unless text.encoding.name == "UTF-8"
16
+
17
+ detect_text(text)
18
+ end
19
+ end
metadata ADDED
@@ -0,0 +1,105 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: what_you_say
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.6.7
5
+ platform: ruby
6
+ authors:
7
+ - Garen J. Torikian
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-01-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ force_ruby_platform: false
28
+ - !ruby/object:Gem::Dependency
29
+ name: rake
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '13.0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '13.0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: rake-compiler
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '1.2'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '1.2'
56
+ description: Natural language detection with a focus on simplicity and performance.
57
+ Currently wraps the lingua-rs Rust crate.
58
+ email:
59
+ - gjtorikian@users.noreply.github.com
60
+ executables: []
61
+ extensions:
62
+ - ext/what_you_say/extconf.rb
63
+ extra_rdoc_files: []
64
+ files:
65
+ - Cargo.lock
66
+ - Cargo.toml
67
+ - LICENSE.txt
68
+ - README.md
69
+ - ext/what_you_say/Cargo.toml
70
+ - ext/what_you_say/extconf.rb
71
+ - ext/what_you_say/src/lang.rs
72
+ - ext/what_you_say/src/lib.rs
73
+ - lib/what_you_say.rb
74
+ - lib/what_you_say/extension.rb
75
+ - lib/what_you_say/lang.rb
76
+ - lib/what_you_say/version.rb
77
+ homepage: https://github.com/gjtorikian/what_you_say
78
+ licenses:
79
+ - MIT
80
+ metadata:
81
+ allowed_push_host: https://rubygems.org
82
+ funding_uri: https://github.com/sponsors/gjtorikian/
83
+ source_code_uri: https://github.com/gjtorikian/what_you_say
84
+ rubygems_mfa_required: 'true'
85
+ post_install_message:
86
+ rdoc_options: []
87
+ require_paths:
88
+ - lib
89
+ required_ruby_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: 3.1.0
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: 3.3.22
99
+ requirements: []
100
+ rubygems_version: 3.5.3
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: Fast and lightweight language identification library. Written in Rust, wrapped
104
+ in Ruby.
105
+ test_files: []