stemmers 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +13 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/Cargo.lock +547 -0
- data/Cargo.toml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +113 -0
- data/Rakefile +23 -0
- data/ext/stemmers/Cargo.toml +16 -0
- data/ext/stemmers/extconf.rb +6 -0
- data/ext/stemmers/src/lib.rs +105 -0
- data/lib/stemmers/stopwords/af.json +53 -0
- data/lib/stemmers/stopwords/ar.json +482 -0
- data/lib/stemmers/stopwords/bg.json +261 -0
- data/lib/stemmers/stopwords/bn.json +400 -0
- data/lib/stemmers/stopwords/br.json +1205 -0
- data/lib/stemmers/stopwords/ca.json +280 -0
- data/lib/stemmers/stopwords/cs.json +425 -0
- data/lib/stemmers/stopwords/da.json +172 -0
- data/lib/stemmers/stopwords/de.json +622 -0
- data/lib/stemmers/stopwords/el.json +849 -0
- data/lib/stemmers/stopwords/en.json +1300 -0
- data/lib/stemmers/stopwords/eo.json +175 -0
- data/lib/stemmers/stopwords/es.json +734 -0
- data/lib/stemmers/stopwords/et.json +37 -0
- data/lib/stemmers/stopwords/eu.json +100 -0
- data/lib/stemmers/stopwords/fa.json +801 -0
- data/lib/stemmers/stopwords/fi.json +849 -0
- data/lib/stemmers/stopwords/fr.json +693 -0
- data/lib/stemmers/stopwords/ga.json +111 -0
- data/lib/stemmers/stopwords/gl.json +162 -0
- data/lib/stemmers/stopwords/gu.json +226 -0
- data/lib/stemmers/stopwords/ha.json +41 -0
- data/lib/stemmers/stopwords/he.json +196 -0
- data/lib/stemmers/stopwords/hi.json +227 -0
- data/lib/stemmers/stopwords/hr.json +181 -0
- data/lib/stemmers/stopwords/hu.json +791 -0
- data/lib/stemmers/stopwords/hy.json +47 -0
- data/lib/stemmers/stopwords/id.json +760 -0
- data/lib/stemmers/stopwords/it.json +634 -0
- data/lib/stemmers/stopwords/ja.json +136 -0
- data/lib/stemmers/stopwords/ko.json +681 -0
- data/lib/stemmers/stopwords/ku.json +64 -0
- data/lib/stemmers/stopwords/la.json +51 -0
- data/lib/stemmers/stopwords/lt.json +476 -0
- data/lib/stemmers/stopwords/lv.json +163 -0
- data/lib/stemmers/stopwords/mr.json +101 -0
- data/lib/stemmers/stopwords/ms.json +477 -0
- data/lib/stemmers/stopwords/nl.json +415 -0
- data/lib/stemmers/stopwords/no.json +223 -0
- data/lib/stemmers/stopwords/pl.json +331 -0
- data/lib/stemmers/stopwords/pt.json +562 -0
- data/lib/stemmers/stopwords/ro.json +436 -0
- data/lib/stemmers/stopwords/ru.json +561 -0
- data/lib/stemmers/stopwords/sk.json +420 -0
- data/lib/stemmers/stopwords/sl.json +448 -0
- data/lib/stemmers/stopwords/so.json +32 -0
- data/lib/stemmers/stopwords/st.json +33 -0
- data/lib/stemmers/stopwords/sv.json +420 -0
- data/lib/stemmers/stopwords/sw.json +76 -0
- data/lib/stemmers/stopwords/th.json +118 -0
- data/lib/stemmers/stopwords/tl.json +149 -0
- data/lib/stemmers/stopwords/tr.json +506 -0
- data/lib/stemmers/stopwords/uk.json +75 -0
- data/lib/stemmers/stopwords/ur.json +519 -0
- data/lib/stemmers/stopwords/vi.json +647 -0
- data/lib/stemmers/stopwords/yo.json +62 -0
- data/lib/stemmers/stopwords/zh.json +796 -0
- data/lib/stemmers/stopwords/zu.json +31 -0
- data/lib/stemmers/version.rb +5 -0
- data/lib/stemmers.rb +91 -0
- data/sig/stemmers.rbs +4 -0
- metadata +131 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
[
|
2
|
+
"futhi",
|
3
|
+
"kahle",
|
4
|
+
"kakhulu",
|
5
|
+
"kanye",
|
6
|
+
"khona",
|
7
|
+
"kodwa",
|
8
|
+
"kungani",
|
9
|
+
"kusho",
|
10
|
+
"la",
|
11
|
+
"lakhe",
|
12
|
+
"lapho",
|
13
|
+
"mina",
|
14
|
+
"ngesikhathi",
|
15
|
+
"nje",
|
16
|
+
"phansi",
|
17
|
+
"phezulu",
|
18
|
+
"u",
|
19
|
+
"ukuba",
|
20
|
+
"ukuthi",
|
21
|
+
"ukuze",
|
22
|
+
"uma",
|
23
|
+
"wahamba",
|
24
|
+
"wakhe",
|
25
|
+
"wami",
|
26
|
+
"wase",
|
27
|
+
"wathi",
|
28
|
+
"yakhe",
|
29
|
+
"zakhe",
|
30
|
+
"zonke"
|
31
|
+
]
|
data/lib/stemmers.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "json"

require_relative "stemmers/version"
require_relative "stemmers/stemmers"
|
5
|
+
|
6
|
+
module Stemmers
  # Detects the language of the given text.
  # If the language cannot be detected, it returns nil.
  #
  # @param text [String] The text to be analyzed.
  # @return [String, nil] The detected language code or nil if undetectable.
  def self.detect_language(text)
    Bindings.detect_language(text)
  end

  # Detects if the language is supported by the stemmers.
  #
  # @param language [String] The language to check.
  # @return [Boolean] True if the language is supported, false otherwise.
  def self.supported_language?(language)
    Bindings.supported_language?(language)
  end

  # Stems the given word in the specified language.
  # If the language is not supported, it raises an `ArgumentError`.
  #
  # @param word [String] The word to be stemmed.
  # @param language [String] The language of the word.
  # @param lowercase [Boolean] If true, converts the word to lowercase before
  #                            stemming.
  # @param normalize [Boolean] If true, removes accents from the word after
  #                            stemming.
  # @return [String] The stemmed word.
  def self.stem_word(word, language:, normalize: false, lowercase: false)
    word = word.downcase if lowercase
    stem = Bindings.stem_word(word, language)

    # Accents are stripped from the stem, not from the input word, so the
    # native stemmer always receives the original spelling.
    normalize ? normalize_word(stem) : stem
  end

  # Stems the given phrase in the specified language.
  # The phrase is lowercased and split on whitespace before stemming.
  # If the language is not supported, it raises an `ArgumentError`.
  #
  # @param phrase [String] The phrase to be stemmed.
  # @param language [String] The language of the phrase.
  # @param clean [Boolean] If true, removes stop words before stemming.
  # @param normalize [Boolean] If true, removes accents from the phrase
  #                            after stemming.
  # @return [Array<String>] An array of stemmed words.
  def self.stem(phrase, language:, clean: false, normalize: false)
    words = phrase.downcase.strip.split(/\s+/)

    # Array difference drops every stop word while keeping the order and the
    # duplicates of the remaining words.
    words -= stop_words(language) if clean

    words.map {|word| stem_word(word, language:, normalize:) }
  end

  # Returns the stop words for the specified language.
  # If the language is not supported, an empty list is returned.
  #
  # @param language [String] The language for which to retrieve stop words.
  # @return [Array<String>] An array of stop words.
  def self.stop_words(language)
    stop_words_cache[language]
  end

  # Normalizes a word by removing accents and diacritics.
  # This is useful for languages where accents do not change the meaning
  # of the word, such as Portuguese.
  #
  # @param word [String] The word to be normalized.
  # @return [String] The normalized word with accents removed.
  def self.normalize_word(word)
    # NFKD splits accented characters into base character + combining marks;
    # the combining marks (\p{M}) are then removed.
    word.unicode_normalize(:nfkd).gsub(/\p{M}/, "")
  end

  # Returns a cache of stop words, loaded lazily from one JSON file per
  # language. Each language is read from disk at most once; the result
  # (including the empty list for unknown languages) is stored and reused.
  #
  # @return [Hash<String, Array<String>>] A hash mapping language codes to
  #                                       arrays of stop words.
  def self.stop_words_cache
    @stop_words_cache ||= Hash.new do |hash, key|
      hash[key] = load_stop_words(key)
    end
  end

  # Reads the bundled stop word list for a language.
  # Only plain names made of ASCII letters, digits, `_` and `-` are looked
  # up; anything else (e.g. a value containing `/` or `..`) yields an empty
  # list, so the language can never point outside the stop words directory.
  #
  # @param language [String] The language code to look up.
  # @return [Array<String>] The stop words, or an empty array when no list
  #                         exists for the language.
  def self.load_stop_words(language)
    code = language.to_s
    return [] unless code.match?(/\A[\w-]+\z/)

    path = File.join(__dir__, "stemmers/stopwords/#{code}.json")
    File.file?(path) ? JSON.load_file(path) : []
  end
  private_class_method :load_stop_words
end
|
data/sig/stemmers.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: stemmers
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nando Vieira
|
8
|
+
bindir: exe
|
9
|
+
cert_chain: []
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
11
|
+
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: rb_sys
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: 0.9.91
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - "~>"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: 0.9.91
|
26
|
+
description: Bindings some popular snowball stemming algorithms
|
27
|
+
email:
|
28
|
+
- me@fnando.com
|
29
|
+
executables: []
|
30
|
+
extensions:
|
31
|
+
- ext/stemmers/extconf.rb
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- ".rubocop.yml"
|
35
|
+
- CHANGELOG.md
|
36
|
+
- CODE_OF_CONDUCT.md
|
37
|
+
- Cargo.lock
|
38
|
+
- Cargo.toml
|
39
|
+
- LICENSE.txt
|
40
|
+
- README.md
|
41
|
+
- Rakefile
|
42
|
+
- ext/stemmers/Cargo.toml
|
43
|
+
- ext/stemmers/extconf.rb
|
44
|
+
- ext/stemmers/src/lib.rs
|
45
|
+
- lib/stemmers.rb
|
46
|
+
- lib/stemmers/stopwords/af.json
|
47
|
+
- lib/stemmers/stopwords/ar.json
|
48
|
+
- lib/stemmers/stopwords/bg.json
|
49
|
+
- lib/stemmers/stopwords/bn.json
|
50
|
+
- lib/stemmers/stopwords/br.json
|
51
|
+
- lib/stemmers/stopwords/ca.json
|
52
|
+
- lib/stemmers/stopwords/cs.json
|
53
|
+
- lib/stemmers/stopwords/da.json
|
54
|
+
- lib/stemmers/stopwords/de.json
|
55
|
+
- lib/stemmers/stopwords/el.json
|
56
|
+
- lib/stemmers/stopwords/en.json
|
57
|
+
- lib/stemmers/stopwords/eo.json
|
58
|
+
- lib/stemmers/stopwords/es.json
|
59
|
+
- lib/stemmers/stopwords/et.json
|
60
|
+
- lib/stemmers/stopwords/eu.json
|
61
|
+
- lib/stemmers/stopwords/fa.json
|
62
|
+
- lib/stemmers/stopwords/fi.json
|
63
|
+
- lib/stemmers/stopwords/fr.json
|
64
|
+
- lib/stemmers/stopwords/ga.json
|
65
|
+
- lib/stemmers/stopwords/gl.json
|
66
|
+
- lib/stemmers/stopwords/gu.json
|
67
|
+
- lib/stemmers/stopwords/ha.json
|
68
|
+
- lib/stemmers/stopwords/he.json
|
69
|
+
- lib/stemmers/stopwords/hi.json
|
70
|
+
- lib/stemmers/stopwords/hr.json
|
71
|
+
- lib/stemmers/stopwords/hu.json
|
72
|
+
- lib/stemmers/stopwords/hy.json
|
73
|
+
- lib/stemmers/stopwords/id.json
|
74
|
+
- lib/stemmers/stopwords/it.json
|
75
|
+
- lib/stemmers/stopwords/ja.json
|
76
|
+
- lib/stemmers/stopwords/ko.json
|
77
|
+
- lib/stemmers/stopwords/ku.json
|
78
|
+
- lib/stemmers/stopwords/la.json
|
79
|
+
- lib/stemmers/stopwords/lt.json
|
80
|
+
- lib/stemmers/stopwords/lv.json
|
81
|
+
- lib/stemmers/stopwords/mr.json
|
82
|
+
- lib/stemmers/stopwords/ms.json
|
83
|
+
- lib/stemmers/stopwords/nl.json
|
84
|
+
- lib/stemmers/stopwords/no.json
|
85
|
+
- lib/stemmers/stopwords/pl.json
|
86
|
+
- lib/stemmers/stopwords/pt.json
|
87
|
+
- lib/stemmers/stopwords/ro.json
|
88
|
+
- lib/stemmers/stopwords/ru.json
|
89
|
+
- lib/stemmers/stopwords/sk.json
|
90
|
+
- lib/stemmers/stopwords/sl.json
|
91
|
+
- lib/stemmers/stopwords/so.json
|
92
|
+
- lib/stemmers/stopwords/st.json
|
93
|
+
- lib/stemmers/stopwords/sv.json
|
94
|
+
- lib/stemmers/stopwords/sw.json
|
95
|
+
- lib/stemmers/stopwords/th.json
|
96
|
+
- lib/stemmers/stopwords/tl.json
|
97
|
+
- lib/stemmers/stopwords/tr.json
|
98
|
+
- lib/stemmers/stopwords/uk.json
|
99
|
+
- lib/stemmers/stopwords/ur.json
|
100
|
+
- lib/stemmers/stopwords/vi.json
|
101
|
+
- lib/stemmers/stopwords/yo.json
|
102
|
+
- lib/stemmers/stopwords/zh.json
|
103
|
+
- lib/stemmers/stopwords/zu.json
|
104
|
+
- lib/stemmers/version.rb
|
105
|
+
- sig/stemmers.rbs
|
106
|
+
homepage: https://github.com/fnando/stemmers
|
107
|
+
licenses:
|
108
|
+
- MIT
|
109
|
+
metadata:
|
110
|
+
homepage_uri: https://github.com/fnando/stemmers
|
111
|
+
source_code_uri: https://github.com/fnando/stemmers
|
112
|
+
changelog_uri: https://github.com/fnando/stemmers
|
113
|
+
rubygems_mfa_required: 'true'
|
114
|
+
rdoc_options: []
|
115
|
+
require_paths:
|
116
|
+
- lib
|
117
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
118
|
+
requirements:
|
119
|
+
- - ">="
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: 3.4.0
|
122
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
123
|
+
requirements:
|
124
|
+
- - ">="
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: 3.3.11
|
127
|
+
requirements: []
|
128
|
+
rubygems_version: 3.6.7
|
129
|
+
specification_version: 4
|
130
|
+
summary: Bindings some popular snowball stemming algorithms
|
131
|
+
test_files: []
|