stemmers 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +13 -0
  3. data/CHANGELOG.md +5 -0
  4. data/CODE_OF_CONDUCT.md +132 -0
  5. data/Cargo.lock +547 -0
  6. data/Cargo.toml +7 -0
  7. data/LICENSE.txt +21 -0
  8. data/README.md +113 -0
  9. data/Rakefile +23 -0
  10. data/ext/stemmers/Cargo.toml +16 -0
  11. data/ext/stemmers/extconf.rb +6 -0
  12. data/ext/stemmers/src/lib.rs +105 -0
  13. data/lib/stemmers/stopwords/af.json +53 -0
  14. data/lib/stemmers/stopwords/ar.json +482 -0
  15. data/lib/stemmers/stopwords/bg.json +261 -0
  16. data/lib/stemmers/stopwords/bn.json +400 -0
  17. data/lib/stemmers/stopwords/br.json +1205 -0
  18. data/lib/stemmers/stopwords/ca.json +280 -0
  19. data/lib/stemmers/stopwords/cs.json +425 -0
  20. data/lib/stemmers/stopwords/da.json +172 -0
  21. data/lib/stemmers/stopwords/de.json +622 -0
  22. data/lib/stemmers/stopwords/el.json +849 -0
  23. data/lib/stemmers/stopwords/en.json +1300 -0
  24. data/lib/stemmers/stopwords/eo.json +175 -0
  25. data/lib/stemmers/stopwords/es.json +734 -0
  26. data/lib/stemmers/stopwords/et.json +37 -0
  27. data/lib/stemmers/stopwords/eu.json +100 -0
  28. data/lib/stemmers/stopwords/fa.json +801 -0
  29. data/lib/stemmers/stopwords/fi.json +849 -0
  30. data/lib/stemmers/stopwords/fr.json +693 -0
  31. data/lib/stemmers/stopwords/ga.json +111 -0
  32. data/lib/stemmers/stopwords/gl.json +162 -0
  33. data/lib/stemmers/stopwords/gu.json +226 -0
  34. data/lib/stemmers/stopwords/ha.json +41 -0
  35. data/lib/stemmers/stopwords/he.json +196 -0
  36. data/lib/stemmers/stopwords/hi.json +227 -0
  37. data/lib/stemmers/stopwords/hr.json +181 -0
  38. data/lib/stemmers/stopwords/hu.json +791 -0
  39. data/lib/stemmers/stopwords/hy.json +47 -0
  40. data/lib/stemmers/stopwords/id.json +760 -0
  41. data/lib/stemmers/stopwords/it.json +634 -0
  42. data/lib/stemmers/stopwords/ja.json +136 -0
  43. data/lib/stemmers/stopwords/ko.json +681 -0
  44. data/lib/stemmers/stopwords/ku.json +64 -0
  45. data/lib/stemmers/stopwords/la.json +51 -0
  46. data/lib/stemmers/stopwords/lt.json +476 -0
  47. data/lib/stemmers/stopwords/lv.json +163 -0
  48. data/lib/stemmers/stopwords/mr.json +101 -0
  49. data/lib/stemmers/stopwords/ms.json +477 -0
  50. data/lib/stemmers/stopwords/nl.json +415 -0
  51. data/lib/stemmers/stopwords/no.json +223 -0
  52. data/lib/stemmers/stopwords/pl.json +331 -0
  53. data/lib/stemmers/stopwords/pt.json +562 -0
  54. data/lib/stemmers/stopwords/ro.json +436 -0
  55. data/lib/stemmers/stopwords/ru.json +561 -0
  56. data/lib/stemmers/stopwords/sk.json +420 -0
  57. data/lib/stemmers/stopwords/sl.json +448 -0
  58. data/lib/stemmers/stopwords/so.json +32 -0
  59. data/lib/stemmers/stopwords/st.json +33 -0
  60. data/lib/stemmers/stopwords/sv.json +420 -0
  61. data/lib/stemmers/stopwords/sw.json +76 -0
  62. data/lib/stemmers/stopwords/th.json +118 -0
  63. data/lib/stemmers/stopwords/tl.json +149 -0
  64. data/lib/stemmers/stopwords/tr.json +506 -0
  65. data/lib/stemmers/stopwords/uk.json +75 -0
  66. data/lib/stemmers/stopwords/ur.json +519 -0
  67. data/lib/stemmers/stopwords/vi.json +647 -0
  68. data/lib/stemmers/stopwords/yo.json +62 -0
  69. data/lib/stemmers/stopwords/zh.json +796 -0
  70. data/lib/stemmers/stopwords/zu.json +31 -0
  71. data/lib/stemmers/version.rb +5 -0
  72. data/lib/stemmers.rb +91 -0
  73. data/sig/stemmers.rbs +4 -0
  74. metadata +131 -0
@@ -0,0 +1,31 @@
1
+ [
2
+ "futhi",
3
+ "kahle",
4
+ "kakhulu",
5
+ "kanye",
6
+ "khona",
7
+ "kodwa",
8
+ "kungani",
9
+ "kusho",
10
+ "la",
11
+ "lakhe",
12
+ "lapho",
13
+ "mina",
14
+ "ngesikhathi",
15
+ "nje",
16
+ "phansi",
17
+ "phezulu",
18
+ "u",
19
+ "ukuba",
20
+ "ukuthi",
21
+ "ukuze",
22
+ "uma",
23
+ "wahamba",
24
+ "wakhe",
25
+ "wami",
26
+ "wase",
27
+ "wathi",
28
+ "yakhe",
29
+ "zakhe",
30
+ "zonke"
31
+ ]
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the stemmers gem.
module Stemmers
  # Version string of this gem release.
  VERSION = "0.0.1"
end
data/lib/stemmers.rb ADDED
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
require "json"

require_relative "stemmers/version"
require_relative "stemmers/stemmers"
5
+
6
module Stemmers
  # Shape a language identifier must have before it is interpolated into a
  # stop-word file path. Rejecting anything else keeps values such as
  # "../config" from reaching files outside the bundled stopwords directory.
  STOP_WORDS_KEY = /\A[\w-]+\z/
  private_constant :STOP_WORDS_KEY

  # Detects the language of the given text by delegating to the native
  # bindings. If the language cannot be detected, it returns nil.
  #
  # @param text [String] The text to be analyzed.
  # @return [String, nil] The detected language code or nil if undetectable.
  def self.detect_language(text)
    Bindings.detect_language(text)
  end

  # Checks whether the language is supported by the stemmers.
  #
  # @param language [String] The language to check.
  # @return [Boolean] True if the language is supported, false otherwise.
  def self.supported_language?(language)
    Bindings.supported_language?(language)
  end

  # Stems the given word in the specified language.
  #
  # @param word [String] The word to be stemmed.
  # @param language [String] The language of the word.
  # @param normalize [Boolean] If true, removes accents from the stem, i.e.
  #   after stemming.
  # @param lowercase [Boolean] If true, converts the word to lowercase before
  #   stemming.
  # @return [String] The stemmed word.
  # @raise [ArgumentError] If the language is not supported (raised by the
  #   native binding).
  def self.stem_word(word, language:, normalize: false, lowercase: false)
    word = word.downcase if lowercase
    stem = Bindings.stem_word(word, language)

    normalize ? normalize_word(stem) : stem
  end

  # Stems the given phrase in the specified language. The phrase is always
  # lowercased and split on whitespace before the words are stemmed.
  #
  # @param phrase [String] The phrase to be stemmed.
  # @param language [String] The language of the phrase.
  # @param clean [Boolean] If true, removes stop words before stemming.
  # @param normalize [Boolean] If true, removes accents from each stem.
  # @return [Array<String>] An array of stemmed words.
  # @raise [ArgumentError] If the language is not supported (raised by the
  #   native binding).
  def self.stem(phrase, language:, clean: false, normalize: false)
    words = phrase.downcase.strip.split(/\s+/)

    # Array#- filters through a hash lookup instead of scanning the stop-word
    # list once per word.
    words -= stop_words(language) if clean

    words.map { |word| stem_word(word, language:, normalize:) }
  end

  # Returns the stop words for the specified language.
  # If no stop-word list exists for the language, an empty list is returned.
  #
  # @param language [String] The language for which to retrieve stop words.
  # @return [Array<String>] An array of stop words.
  def self.stop_words(language)
    stop_words_cache[language]
  end

  # Normalizes a word by removing accents and diacritics: the word is
  # decomposed (NFKD) and every combining mark is stripped.
  # This is useful for languages where accents do not change the meaning
  # of the word, such as Portuguese.
  #
  # @param word [String] The word to be normalized.
  # @return [String] The normalized word with accents removed.
  def self.normalize_word(word)
    word.unicode_normalize(:nfkd).gsub(/\p{M}/, "")
  end

  # Returns the stop-word cache. Each language's list is read from
  # `stemmers/stopwords/<language>.json` the first time it is requested and
  # then kept for later calls; a missing file is cached as an empty list.
  #
  # @return [Hash<String, Array<String>>] A hash mapping language codes to
  #   arrays of stop words.
  def self.stop_words_cache
    @stop_words_cache ||= Hash.new do |cache, language|
      # Never touch the file system (nor grow the cache) for identifiers that
      # could not name a bundled stop-word file.
      next [] unless STOP_WORDS_KEY.match?(language.to_s)

      path = File.join(__dir__, "stemmers/stopwords/#{language}.json")
      cache[language] = File.file?(path) ? JSON.load_file(path) : []
    end
  end
end
data/sig/stemmers.rbs ADDED
@@ -0,0 +1,4 @@
1
# Type signatures for the public API defined in lib/stemmers.rb.
# See the writing guide of rbs: https://github.com/ruby/rbs#guides
module Stemmers
  VERSION: String

  def self.detect_language: (String text) -> String?
  def self.supported_language?: (String language) -> bool
  def self.stem_word: (String word, language: String, ?normalize: bool, ?lowercase: bool) -> String
  def self.stem: (String phrase, language: String, ?clean: bool, ?normalize: bool) -> Array[String]
  def self.stop_words: (String language) -> Array[String]
  def self.normalize_word: (String word) -> String
  def self.stop_words_cache: () -> Hash[String, Array[String]]
end
metadata ADDED
@@ -0,0 +1,131 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: stemmers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nando Vieira
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rb_sys
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: 0.9.91
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: 0.9.91
26
+ description: Bindings for some popular Snowball stemming algorithms
27
+ email:
28
+ - me@fnando.com
29
+ executables: []
30
+ extensions:
31
+ - ext/stemmers/extconf.rb
32
+ extra_rdoc_files: []
33
+ files:
34
+ - ".rubocop.yml"
35
+ - CHANGELOG.md
36
+ - CODE_OF_CONDUCT.md
37
+ - Cargo.lock
38
+ - Cargo.toml
39
+ - LICENSE.txt
40
+ - README.md
41
+ - Rakefile
42
+ - ext/stemmers/Cargo.toml
43
+ - ext/stemmers/extconf.rb
44
+ - ext/stemmers/src/lib.rs
45
+ - lib/stemmers.rb
46
+ - lib/stemmers/stopwords/af.json
47
+ - lib/stemmers/stopwords/ar.json
48
+ - lib/stemmers/stopwords/bg.json
49
+ - lib/stemmers/stopwords/bn.json
50
+ - lib/stemmers/stopwords/br.json
51
+ - lib/stemmers/stopwords/ca.json
52
+ - lib/stemmers/stopwords/cs.json
53
+ - lib/stemmers/stopwords/da.json
54
+ - lib/stemmers/stopwords/de.json
55
+ - lib/stemmers/stopwords/el.json
56
+ - lib/stemmers/stopwords/en.json
57
+ - lib/stemmers/stopwords/eo.json
58
+ - lib/stemmers/stopwords/es.json
59
+ - lib/stemmers/stopwords/et.json
60
+ - lib/stemmers/stopwords/eu.json
61
+ - lib/stemmers/stopwords/fa.json
62
+ - lib/stemmers/stopwords/fi.json
63
+ - lib/stemmers/stopwords/fr.json
64
+ - lib/stemmers/stopwords/ga.json
65
+ - lib/stemmers/stopwords/gl.json
66
+ - lib/stemmers/stopwords/gu.json
67
+ - lib/stemmers/stopwords/ha.json
68
+ - lib/stemmers/stopwords/he.json
69
+ - lib/stemmers/stopwords/hi.json
70
+ - lib/stemmers/stopwords/hr.json
71
+ - lib/stemmers/stopwords/hu.json
72
+ - lib/stemmers/stopwords/hy.json
73
+ - lib/stemmers/stopwords/id.json
74
+ - lib/stemmers/stopwords/it.json
75
+ - lib/stemmers/stopwords/ja.json
76
+ - lib/stemmers/stopwords/ko.json
77
+ - lib/stemmers/stopwords/ku.json
78
+ - lib/stemmers/stopwords/la.json
79
+ - lib/stemmers/stopwords/lt.json
80
+ - lib/stemmers/stopwords/lv.json
81
+ - lib/stemmers/stopwords/mr.json
82
+ - lib/stemmers/stopwords/ms.json
83
+ - lib/stemmers/stopwords/nl.json
84
+ - lib/stemmers/stopwords/no.json
85
+ - lib/stemmers/stopwords/pl.json
86
+ - lib/stemmers/stopwords/pt.json
87
+ - lib/stemmers/stopwords/ro.json
88
+ - lib/stemmers/stopwords/ru.json
89
+ - lib/stemmers/stopwords/sk.json
90
+ - lib/stemmers/stopwords/sl.json
91
+ - lib/stemmers/stopwords/so.json
92
+ - lib/stemmers/stopwords/st.json
93
+ - lib/stemmers/stopwords/sv.json
94
+ - lib/stemmers/stopwords/sw.json
95
+ - lib/stemmers/stopwords/th.json
96
+ - lib/stemmers/stopwords/tl.json
97
+ - lib/stemmers/stopwords/tr.json
98
+ - lib/stemmers/stopwords/uk.json
99
+ - lib/stemmers/stopwords/ur.json
100
+ - lib/stemmers/stopwords/vi.json
101
+ - lib/stemmers/stopwords/yo.json
102
+ - lib/stemmers/stopwords/zh.json
103
+ - lib/stemmers/stopwords/zu.json
104
+ - lib/stemmers/version.rb
105
+ - sig/stemmers.rbs
106
+ homepage: https://github.com/fnando/stemmers
107
+ licenses:
108
+ - MIT
109
+ metadata:
110
+ homepage_uri: https://github.com/fnando/stemmers
111
+ source_code_uri: https://github.com/fnando/stemmers
112
+ changelog_uri: https://github.com/fnando/stemmers
113
+ rubygems_mfa_required: 'true'
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ version: 3.4.0
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: 3.3.11
127
+ requirements: []
128
+ rubygems_version: 3.6.7
129
+ specification_version: 4
130
+ summary: Bindings for some popular Snowball stemming algorithms
131
+ test_files: []