stemmers 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +13 -0
  3. data/CHANGELOG.md +5 -0
  4. data/CODE_OF_CONDUCT.md +132 -0
  5. data/Cargo.lock +547 -0
  6. data/Cargo.toml +7 -0
  7. data/LICENSE.txt +21 -0
  8. data/README.md +113 -0
  9. data/Rakefile +23 -0
  10. data/ext/stemmers/Cargo.toml +16 -0
  11. data/ext/stemmers/extconf.rb +6 -0
  12. data/ext/stemmers/src/lib.rs +105 -0
  13. data/lib/stemmers/stopwords/af.json +53 -0
  14. data/lib/stemmers/stopwords/ar.json +482 -0
  15. data/lib/stemmers/stopwords/bg.json +261 -0
  16. data/lib/stemmers/stopwords/bn.json +400 -0
  17. data/lib/stemmers/stopwords/br.json +1205 -0
  18. data/lib/stemmers/stopwords/ca.json +280 -0
  19. data/lib/stemmers/stopwords/cs.json +425 -0
  20. data/lib/stemmers/stopwords/da.json +172 -0
  21. data/lib/stemmers/stopwords/de.json +622 -0
  22. data/lib/stemmers/stopwords/el.json +849 -0
  23. data/lib/stemmers/stopwords/en.json +1300 -0
  24. data/lib/stemmers/stopwords/eo.json +175 -0
  25. data/lib/stemmers/stopwords/es.json +734 -0
  26. data/lib/stemmers/stopwords/et.json +37 -0
  27. data/lib/stemmers/stopwords/eu.json +100 -0
  28. data/lib/stemmers/stopwords/fa.json +801 -0
  29. data/lib/stemmers/stopwords/fi.json +849 -0
  30. data/lib/stemmers/stopwords/fr.json +693 -0
  31. data/lib/stemmers/stopwords/ga.json +111 -0
  32. data/lib/stemmers/stopwords/gl.json +162 -0
  33. data/lib/stemmers/stopwords/gu.json +226 -0
  34. data/lib/stemmers/stopwords/ha.json +41 -0
  35. data/lib/stemmers/stopwords/he.json +196 -0
  36. data/lib/stemmers/stopwords/hi.json +227 -0
  37. data/lib/stemmers/stopwords/hr.json +181 -0
  38. data/lib/stemmers/stopwords/hu.json +791 -0
  39. data/lib/stemmers/stopwords/hy.json +47 -0
  40. data/lib/stemmers/stopwords/id.json +760 -0
  41. data/lib/stemmers/stopwords/it.json +634 -0
  42. data/lib/stemmers/stopwords/ja.json +136 -0
  43. data/lib/stemmers/stopwords/ko.json +681 -0
  44. data/lib/stemmers/stopwords/ku.json +64 -0
  45. data/lib/stemmers/stopwords/la.json +51 -0
  46. data/lib/stemmers/stopwords/lt.json +476 -0
  47. data/lib/stemmers/stopwords/lv.json +163 -0
  48. data/lib/stemmers/stopwords/mr.json +101 -0
  49. data/lib/stemmers/stopwords/ms.json +477 -0
  50. data/lib/stemmers/stopwords/nl.json +415 -0
  51. data/lib/stemmers/stopwords/no.json +223 -0
  52. data/lib/stemmers/stopwords/pl.json +331 -0
  53. data/lib/stemmers/stopwords/pt.json +562 -0
  54. data/lib/stemmers/stopwords/ro.json +436 -0
  55. data/lib/stemmers/stopwords/ru.json +561 -0
  56. data/lib/stemmers/stopwords/sk.json +420 -0
  57. data/lib/stemmers/stopwords/sl.json +448 -0
  58. data/lib/stemmers/stopwords/so.json +32 -0
  59. data/lib/stemmers/stopwords/st.json +33 -0
  60. data/lib/stemmers/stopwords/sv.json +420 -0
  61. data/lib/stemmers/stopwords/sw.json +76 -0
  62. data/lib/stemmers/stopwords/th.json +118 -0
  63. data/lib/stemmers/stopwords/tl.json +149 -0
  64. data/lib/stemmers/stopwords/tr.json +506 -0
  65. data/lib/stemmers/stopwords/uk.json +75 -0
  66. data/lib/stemmers/stopwords/ur.json +519 -0
  67. data/lib/stemmers/stopwords/vi.json +647 -0
  68. data/lib/stemmers/stopwords/yo.json +62 -0
  69. data/lib/stemmers/stopwords/zh.json +796 -0
  70. data/lib/stemmers/stopwords/zu.json +31 -0
  71. data/lib/stemmers/version.rb +5 -0
  72. data/lib/stemmers.rb +91 -0
  73. data/sig/stemmers.rbs +4 -0
  74. metadata +131 -0
data/Cargo.toml ADDED
@@ -0,0 +1,7 @@
1
+ # This Cargo.toml is here to let external tools (IDEs, etc.) know that this is
2
+ # a Rust project. Your extension's dependencies should be added to the Cargo.toml
3
+ # in the ext/ directory.
4
+
5
+ [workspace]
6
+ members = ["./ext/stemmers"]
7
+ resolver = "2"
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 Nando Vieira
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,113 @@
1
+ # Stemmers
2
+
3
+ Ruby bindings for https://whatlang.org and
4
+ https://github.com/testuj-to/tantivy-stemmers for language detection and
5
+ stemming.
6
+
7
+ ## Installation
8
+
9
+ Install the gem and add to the application's Gemfile by executing:
10
+
11
+ ```bash
12
+ bundle add stemmers
13
+ ```
14
+
15
+ If bundler is not being used to manage dependencies, install the gem by
16
+ executing:
17
+
18
+ ```bash
19
+ gem install stemmers
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ The language detection works in the context of the supported stemmers. If
25
+ a language doesn't have a stemmer, then it'll return `nil`.
26
+
27
+ ```ruby
28
+ require "stemmers"
29
+
30
+ Stemmers.detect_language("Hello there!")
31
+ #=> "en"
32
+
33
+ Stemmers.detect_language("Olá, mundo!")
34
+ #=> "pt"
35
+ ```
36
+
37
+ To stem a word, you can use the `Stemmers.stem_word(word, **options)` method.
38
+
39
+ ```ruby
40
+ require "stemmers"
41
+
42
+ Stemmers.stem_word("running", language: "en")
43
+ #=> "run"
44
+
45
+ Stemmers.stem_word("correndo", language: "pt")
46
+ #=> "corr"
47
+ ```
48
+
49
+ You have a few options when stemming a word with
50
+ `Stemmers.stem_word(input, **options)`:
51
+
52
+ - `language`: The language to use for stemming. If not provided, it will try to
53
+ detect the language.
54
+ - `normalize`: If set to `true`, it will normalize the word after stemming. This
55
+ is useful for languages that have diacritics or special characters.
56
+ - `lowercase`: If set to `true`, it will lowercase the word before stemming
57
+ (stemming requires lowercase strings, but this is not done automatically to
58
+ avoid unnecessary transformations when using `Stemmers.stem(phrase)`).
59
+
60
+ To stem a phrase, you can use `Stemmers.stem(input, **options)`.
61
+
62
+ ```ruby
63
+ require "stemmers"
64
+
65
+ Stemmers.stem("Testing this phrase", language: "en")
66
+ #=> ["test", "this", "phrase"]
67
+ ```
68
+
69
+ The `Stemmers.stem(input, **options)` method has the following options:
70
+
71
+ - `language`: The language to use for stemming. If not provided, it will try to
72
+ detect the language.
73
+ - `normalize`: If set to `true`, it will normalize the word after stemming. This
74
+ is useful for languages that have diacritics or special characters.
75
+ - `lowercase`: If set to `true`, it will lowercase the word before stemming
76
+ (stemming requires lowercase strings, but this is not done automatically to
77
+ avoid unnecessary transformations when using `Stemmers.stem(phrase)`).
78
+ - `clean`: If set to `true`, it will remove stop words from the phrase (beware
79
+ that you may end up with an empty array). It uses the list of stop words from
80
+ <https://github.com/stopwords-iso/stopwords-iso> (it's not a great list—it has
81
+ too many surprising words that shouldn't be in the list, but I couldn't find
82
+ anything better).
83
+
84
+ ## Development
85
+
86
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run
87
+ `rake test` to run the tests. You can also run `bin/console` for an interactive
88
+ prompt that will allow you to experiment.
89
+
90
+ To install this gem onto your local machine, run `bundle exec rake install`. To
91
+ release a new version, update the version number in `version.rb`, and then run
92
+ `bundle exec rake release`, which will create a git tag for the version, push
93
+ git commits and the created tag, and push the `.gem` file to
94
+ [rubygems.org](https://rubygems.org).
95
+
96
+ ## Contributing
97
+
98
+ Bug reports and pull requests are welcome on GitHub at
99
+ https://github.com/fnando/stemmers. This project is intended to be a safe,
100
+ welcoming space for collaboration, and contributors are expected to adhere to
101
+ the
102
+ [code of conduct](https://github.com/fnando/stemmers/blob/main/CODE_OF_CONDUCT.md).
103
+
104
+ ## License
105
+
106
+ The gem is available as open source under the terms of the
107
+ [MIT License](https://opensource.org/licenses/MIT).
108
+
109
+ ## Code of Conduct
110
+
111
+ Everyone interacting in the Stemmers project's codebases, issue trackers, chat
112
+ rooms and mailing lists is expected to follow the
113
+ [code of conduct](https://github.com/fnando/stemmers/blob/main/CODE_OF_CONDUCT.md).
data/Rakefile ADDED
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "minitest/test_task"
5
+
6
+ Minitest::TestTask.create
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ require "rb_sys/extensiontask"
13
+
14
+ desc "Compile the extension"
15
+ task build: :compile
16
+
17
+ GEMSPEC = Gem::Specification.load("stemmers.gemspec")
18
+
19
+ RbSys::ExtensionTask.new("stemmers", GEMSPEC) do |ext|
20
+ ext.lib_dir = "lib/stemmers"
21
+ end
22
+
23
+ task default: %i[compile test rubocop]
@@ -0,0 +1,16 @@
1
+ [package]
2
+ name = "stemmers"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ authors = ["Nando Vieira <me@fnando.com>"]
6
+ license = "MIT"
7
+ publish = false
8
+
9
+ [lib]
10
+ crate-type = ["cdylib"]
11
+
12
+ [dependencies]
13
+ magnus = { version = "0.6.2" }
14
+ rust-stemmers = "1.2.0"
15
+ tantivy-stemmers = { version = "0.4.0", features = ["arabic", "armenian_mkrtchyan", "catalan", "czech_dolamic_light", "danish", "dutch", "estonian_freienthal", "finnish", "french", "german", "greek", "hindi_lightweight", "hungarian", "indonesian_tala", "italian", "lithuanian_jocas", "nepali", "norwegian_bokmal", "polish_yarovoy_unaccented", "portuguese", "romanian", "russian", "spanish", "swedish", "turkish_cilden", "yiddish_urieli"] }
16
+ whatlang = "0.16.4"
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+ require "rb_sys/mkmf"
5
+
6
+ create_rust_makefile("stemmers/stemmers")
@@ -0,0 +1,105 @@
1
+ use magnus::{exception, function, prelude::*, Error, Ruby};
2
+ use tantivy_stemmers::algorithms::{
3
+ arabic, armenian_mkrtchyan, catalan, czech_dolamic_light, danish, dutch, english_porter_2,
4
+ estonian_freienthal, finnish, french, german, greek, hindi_lightweight, hungarian,
5
+ indonesian_tala, italian, lithuanian_jocas, nepali, norwegian_bokmal,
6
+ polish_yarovoy_unaccented, portuguese, romanian, russian, spanish, swedish, turkish_cilden,
7
+ yiddish_urieli,
8
+ };
9
+ use whatlang::{detect, Lang};
10
+
11
+ fn lang_to_code(lang: Lang) -> Option<String> {
12
+ match lang {
13
+ Lang::Ara => Some("ar".into()),
14
+ Lang::Cat => Some("ca".into()),
15
+ Lang::Ces => Some("cs".into()),
16
+ Lang::Dan => Some("da".into()),
17
+ Lang::Deu => Some("de".into()),
18
+ Lang::Ell => Some("el".into()),
19
+ Lang::Eng => Some("en".into()),
20
+ Lang::Est => Some("et".into()),
21
+ Lang::Fin => Some("fi".into()),
22
+ Lang::Fra => Some("fr".into()),
23
+ Lang::Hin => Some("hi".into()),
24
+ Lang::Hun => Some("hu".into()),
25
+ Lang::Hye => Some("hy".into()),
26
+ Lang::Ind => Some("id".into()),
27
+ Lang::Ita => Some("it".into()),
28
+ Lang::Lit => Some("lt".into()),
29
+ Lang::Nep => Some("ne".into()),
30
+ Lang::Nld => Some("nl".into()),
31
+ Lang::Nob => Some("no".into()),
32
+ Lang::Pol => Some("pl".into()),
33
+ Lang::Por => Some("pt".into()),
34
+ Lang::Ron => Some("ro".into()),
35
+ Lang::Rus => Some("ru".into()),
36
+ Lang::Spa => Some("es".into()),
37
+ Lang::Swe => Some("sv".into()),
38
+ Lang::Tam => Some("ta".into()),
39
+ Lang::Tur => Some("tr".into()),
40
+ Lang::Yid => Some("yi".into()),
41
+ _ => None,
42
+ }
43
+ }
44
+
45
+ fn detect_language(text: String) -> Option<String> {
46
+ let Some(info) = detect(&text) else {
47
+ return None;
48
+ };
49
+
50
+ lang_to_code(info.lang())
51
+ }
52
+
53
+ fn stem_word(word: String, language: String) -> Result<String, Error> {
54
+ match language.as_str() {
55
+ "ar" => Ok(arabic(&word).to_string()),
56
+ "ca" => Ok(catalan(&word).to_string()),
57
+ "cs" => Ok(czech_dolamic_light(&word).to_string()),
58
+ "da" => Ok(danish(&word).to_string()),
59
+ "de" => Ok(german(&word).to_string()),
60
+ "el" => Ok(greek(&word).to_string()),
61
+ "en" => Ok(english_porter_2(&word).to_string()),
62
+ "es" => Ok(spanish(&word).to_string()),
63
+ "et" => Ok(estonian_freienthal(&word).to_string()),
64
+ "fi" => Ok(finnish(&word).to_string()),
65
+ "fr" => Ok(french(&word).to_string()),
66
+ "hi" => Ok(hindi_lightweight(&word).to_string()),
67
+ "hu" => Ok(hungarian(&word).to_string()),
68
+ "hy" => Ok(armenian_mkrtchyan(&word).to_string()),
69
+ "id" => Ok(indonesian_tala(&word).to_string()),
70
+ "it" => Ok(italian(&word).to_string()),
71
+ "lt" => Ok(lithuanian_jocas(&word).to_string()),
72
+ "no" => Ok(norwegian_bokmal(&word).to_string()),
73
+ "ne" => Ok(nepali(&word).to_string()),
74
+ "nl" => Ok(dutch(&word).to_string()),
75
+ "pl" => Ok(polish_yarovoy_unaccented(&word).to_string()),
76
+ "pt" => Ok(portuguese(&word).to_string()),
77
+ "ro" => Ok(romanian(&word).to_string()),
78
+ "ru" => Ok(russian(&word).to_string()),
79
+ "sv" => Ok(swedish(&word).to_string()),
80
+ "tr" => Ok(turkish_cilden(&word).to_string()),
81
+ "yi" => Ok(yiddish_urieli(&word).to_string()),
82
+ _ => Err(Error::new(
83
+ exception::arg_error(),
84
+ format!("Unsupported language: {language}"),
85
+ )),
86
+ }
87
+ }
88
+
89
/// Reports whether `language` is one of the two-letter codes this extension
/// can stem.
///
/// The comparison is exact: codes are lowercase and no trimming or case
/// folding is applied.
fn is_supported_language(language: String) -> bool {
    matches!(
        language.as_str(),
        "ar" | "ca"
            | "cs"
            | "da"
            | "de"
            | "el"
            | "en"
            | "es"
            | "et"
            | "fi"
            | "fr"
            | "hi"
            | "hu"
            | "hy"
            | "id"
            | "it"
            | "lt"
            | "no"
            | "ne"
            | "nl"
            | "pl"
            | "pt"
            | "ro"
            | "ru"
            | "sv"
            | "tr"
            | "yi"
    )
}
96
+
97
+ #[magnus::init]
98
+ fn init(ruby: &Ruby) -> Result<(), Error> {
99
+ let root = ruby.define_module("Stemmers")?;
100
+ let module = root.define_module("Bindings")?;
101
+ module.define_singleton_method("stem_word", function!(stem_word, 2))?;
102
+ module.define_singleton_method("detect_language", function!(detect_language, 1))?;
103
+ module.define_singleton_method("supported_language?", function!(is_supported_language, 1))?;
104
+ Ok(())
105
+ }
@@ -0,0 +1,53 @@
1
+ [
2
+ "'n",
3
+ "aan",
4
+ "af",
5
+ "al",
6
+ "as",
7
+ "baie",
8
+ "by",
9
+ "daar",
10
+ "dag",
11
+ "dat",
12
+ "die",
13
+ "dit",
14
+ "een",
15
+ "ek",
16
+ "en",
17
+ "gaan",
18
+ "gesê",
19
+ "haar",
20
+ "het",
21
+ "hom",
22
+ "hulle",
23
+ "hy",
24
+ "in",
25
+ "is",
26
+ "jou",
27
+ "jy",
28
+ "kan",
29
+ "kom",
30
+ "ma",
31
+ "maar",
32
+ "met",
33
+ "my",
34
+ "na",
35
+ "nie",
36
+ "om",
37
+ "ons",
38
+ "op",
39
+ "saam",
40
+ "sal",
41
+ "se",
42
+ "sien",
43
+ "so",
44
+ "sy",
45
+ "te",
46
+ "toe",
47
+ "uit",
48
+ "van",
49
+ "vir",
50
+ "was",
51
+ "wat",
52
+ "ʼn"
53
+ ]