ruby-stemmer 0.9.4 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +12 -0
- data/.travis.yml +6 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +23 -0
- data/MIT-LICENSE +1 -1
- data/README.rdoc +32 -33
- data/Rakefile +5 -6
- data/ext/lingua/extconf.rb +8 -10
- data/lib/lingua/stemmer.rb +13 -16
- data/lib/lingua/version.rb +5 -0
- data/libstemmer_c/Makefile +1 -1
- data/libstemmer_c/Makefile.windows +1 -1
- data/libstemmer_c/libstemmer/modules.h +22 -17
- data/libstemmer_c/libstemmer/modules_utf8.h +22 -17
- data/libstemmer_c/libstemmer/modules_utf8.txt +1 -0
- data/libstemmer_c/mkinc.mak +2 -0
- data/libstemmer_c/mkinc_utf8.mak +2 -0
- data/libstemmer_c/src_c/stem_UTF_8_lithuanian.c +909 -0
- data/libstemmer_c/src_c/stem_UTF_8_lithuanian.h +16 -0
- data/ruby-stemmer.gemspec +28 -0
- data/test/helper.rb +2 -3
- data/test/lingua/test_stemmer.rb +46 -46
- metadata +20 -27
@@ -0,0 +1,16 @@
|
|
1
|
+
|
2
|
+
/* This file was generated automatically by the Snowball to ANSI C compiler */
|
3
|
+
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
8
|
+
extern struct SN_env * lithuanian_UTF_8_create_env(void);
|
9
|
+
extern void lithuanian_UTF_8_close_env(struct SN_env * z);
|
10
|
+
|
11
|
+
extern int lithuanian_UTF_8_stem(struct SN_env * z);
|
12
|
+
|
13
|
+
#ifdef __cplusplus
|
14
|
+
}
|
15
|
+
#endif
|
16
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'lingua/version'
|
6
|
+
|
7
|
+
Gem::Specification.new do |s|
|
8
|
+
s.name = 'ruby-stemmer'
|
9
|
+
s.version = Lingua::Stemmer::VERSION
|
10
|
+
|
11
|
+
s.platform = Gem::Platform::RUBY
|
12
|
+
s.required_ruby_version = '>= 2.4.0'
|
13
|
+
|
14
|
+
s.require_paths = ['lib']
|
15
|
+
s.authors = ['Aurelian Oancea', 'Yury Korolev']
|
16
|
+
|
17
|
+
s.description = 'Expose the bundled libstemmer_c library to Ruby.'
|
18
|
+
s.email = 'oancea@gmail.com'
|
19
|
+
s.extensions = ['ext/lingua/extconf.rb']
|
20
|
+
s.extra_rdoc_files = ['README.rdoc']
|
21
|
+
s.files = `git ls-files`.split("\n")
|
22
|
+
s.homepage = 'http://github.com/aurelian/ruby-stemmer'
|
23
|
+
s.licenses = ['MIT']
|
24
|
+
s.summary = 'Expose libstemmer_c to Ruby.'
|
25
|
+
|
26
|
+
s.add_development_dependency 'minitest', '~> 5.14'
|
27
|
+
s.add_development_dependency 'rake-compiler', '~> 1.1'
|
28
|
+
end
|
data/test/helper.rb
CHANGED
data/test/lingua/test_stemmer.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'helper'
|
3
4
|
|
4
5
|
class TestStemmer < Minitest::Test
|
5
|
-
|
6
6
|
def test_stemmer_creation
|
7
7
|
assert_kind_of ::Lingua::Stemmer, ::Lingua::Stemmer.new
|
8
8
|
end
|
@@ -10,51 +10,52 @@ class TestStemmer < Minitest::Test
|
|
10
10
|
def test_exceptions
|
11
11
|
assert_raises ::Lingua::StemmerError do
|
12
12
|
# invalid encoding for language
|
13
|
-
::Lingua::Stemmer.new :
|
13
|
+
::Lingua::Stemmer.new language: 'ro', encoding: 'ISO_8859_1'
|
14
14
|
end
|
15
15
|
assert_raises ::Lingua::StemmerError do
|
16
16
|
# invalid language
|
17
|
-
::Lingua::Stemmer.new :
|
17
|
+
::Lingua::Stemmer.new language: 'cat'
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
21
|
def test_latin
|
22
|
-
::Lingua::Stemmer.new :
|
23
|
-
rescue StandardError =>
|
24
|
-
flunk "Expected latin to be loaded but failed with #{
|
22
|
+
::Lingua::Stemmer.new language: 'latin', encoding: 'ISO_8859_1'
|
23
|
+
rescue StandardError => e
|
24
|
+
flunk "Expected latin to be loaded but failed with #{e}"
|
25
25
|
end
|
26
26
|
|
27
27
|
def test_stem
|
28
|
-
|
29
|
-
assert_equal
|
30
|
-
assert_equal
|
28
|
+
stemmer = ::Lingua::Stemmer.new(language: 'en', encoding: 'UTF_8')
|
29
|
+
assert_equal stemmer.stem('obnoxious'), 'obnoxi'
|
30
|
+
assert_equal stemmer.stem('personalities'), 'person'
|
31
31
|
end
|
32
32
|
|
33
33
|
def test_string_stemmer
|
34
|
-
assert_equal ::Lingua.stemmer(
|
35
|
-
stemmer= ::Lingua.stemmer(
|
36
|
-
assert_equal word,
|
34
|
+
assert_equal ::Lingua.stemmer('installation', language: 'en'), 'instal'
|
35
|
+
stemmer = ::Lingua.stemmer('installation', language: 'fr') do |word|
|
36
|
+
assert_equal word, 'install'
|
37
37
|
end
|
38
38
|
assert_kind_of ::Lingua::Stemmer, stemmer
|
39
|
-
|
40
|
-
if RUBY_VERSION >= '1.9'
|
41
|
-
assert_equal stemmer.encoding, Encoding::UTF_8
|
42
|
-
else
|
43
|
-
assert_equal stemmer.encoding, "UTF_8"
|
44
|
-
end
|
39
|
+
assert_equal stemmer.encoding, Encoding::UTF_8
|
45
40
|
end
|
46
41
|
|
47
42
|
def test_array_stemmer
|
48
|
-
results= ::Lingua.stemmer([
|
43
|
+
results = ::Lingua.stemmer(%w[one two], language: 'de', encoding: 'ISO_8859_1')
|
49
44
|
assert_equal 2, results.size
|
50
45
|
assert_kind_of Array, results
|
51
46
|
end
|
52
47
|
|
48
|
+
def test_array_stemmer_issue_22
|
49
|
+
results = ::Lingua.stemmer(['one'], language: 'de', encoding: 'ISO_8859_1')
|
50
|
+
assert_equal 1, results.size
|
51
|
+
assert_kind_of Array, results
|
52
|
+
end
|
53
|
+
|
53
54
|
def test_stemmer_subclass
|
54
55
|
assert_raises(RuntimeError) do
|
55
|
-
Class.new(Lingua::Stemmer)
|
56
|
-
def native_init
|
57
|
-
|
56
|
+
Class.new(Lingua::Stemmer) do
|
57
|
+
def native_init(a, b); end
|
58
|
+
end.new.stem('cow')
|
58
59
|
end
|
59
60
|
end
|
60
61
|
|
@@ -62,38 +63,37 @@ class TestStemmer < Minitest::Test
|
|
62
63
|
if RUBY_VERSION >= '1.9'
|
63
64
|
assert_equal ::Lingua::Stemmer.new.encoding, Encoding::UTF_8
|
64
65
|
else
|
65
|
-
assert_equal ::Lingua::Stemmer.new.encoding,
|
66
|
+
assert_equal ::Lingua::Stemmer.new.encoding, 'UTF_8'
|
66
67
|
end
|
67
68
|
end
|
68
69
|
|
69
70
|
def test_different_encoding_options
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
assert_equal ::Lingua::Stemmer.new(:encoding => Encoding::UTF_8).encoding, Encoding::UTF_8
|
76
|
-
else
|
77
|
-
assert_equal ::Lingua::Stemmer.new(:encoding => "ISO_8859_1").encoding, "ISO_8859_1"
|
78
|
-
assert_equal ::Lingua::Stemmer.new(:encoding => "UTF-8").encoding, "UTF_8"
|
79
|
-
assert_equal ::Lingua::Stemmer.new(:encoding => "utf-8").encoding, "UTF_8"
|
80
|
-
assert_equal ::Lingua::Stemmer.new(:encoding => :ISO_8859_1).encoding, "ISO_8859_1"
|
81
|
-
end
|
71
|
+
assert_equal ::Lingua::Stemmer.new(encoding: 'ISO_8859_1').encoding, Encoding::ISO_8859_1
|
72
|
+
assert_equal ::Lingua::Stemmer.new(encoding: 'UTF-8').encoding, Encoding::UTF_8
|
73
|
+
assert_equal ::Lingua::Stemmer.new(encoding: 'utf-8').encoding, Encoding::UTF_8
|
74
|
+
assert_equal ::Lingua::Stemmer.new(encoding: :ISO_8859_1).encoding, Encoding::ISO_8859_1
|
75
|
+
assert_equal ::Lingua::Stemmer.new(encoding: Encoding::UTF_8).encoding, Encoding::UTF_8
|
82
76
|
end
|
83
77
|
|
84
|
-
|
85
|
-
|
86
|
-
word = "așezare"
|
78
|
+
def test_string_encoding
|
79
|
+
word = 'așezare'
|
87
80
|
|
88
|
-
|
89
|
-
|
81
|
+
stem = ::Lingua.stemmer(word, language: 'ro', encoding: 'UTF_8')
|
82
|
+
assert_equal word.encoding, stem.encoding
|
90
83
|
|
91
|
-
|
92
|
-
|
84
|
+
s = ::Lingua::Stemmer.new(language: 'ro', encoding: 'UTF_8')
|
85
|
+
assert_equal s.stem(word).encoding, word.encoding
|
93
86
|
|
94
|
-
|
95
|
-
|
96
|
-
end
|
87
|
+
stem = ::Lingua.stemmer('installation', language: 'fr', encoding: 'ISO-8859-1')
|
88
|
+
assert_equal stem.encoding, Encoding::ISO_8859_1
|
97
89
|
end
|
98
90
|
|
91
|
+
def test_lithuanian_stem
|
92
|
+
stemmer = ::Lingua::Stemmer.new(language: 'lt')
|
93
|
+
%w[
|
94
|
+
kompiuteris kompiuterio kompiuteriui kompiuteriu kompiuteri
|
95
|
+
].each do |word|
|
96
|
+
assert_equal stemmer.stem(word), 'kompiuter'
|
97
|
+
end
|
98
|
+
end
|
99
99
|
end
|
metadata
CHANGED
@@ -1,58 +1,44 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-stemmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aurelian Oancea
|
8
8
|
- Yury Korolev
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2020-10-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
-
- !ruby/object:Gem::Dependency
|
15
|
-
name: rake-compiler
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
17
|
-
requirements:
|
18
|
-
- - "~>"
|
19
|
-
- !ruby/object:Gem::Version
|
20
|
-
version: 0.9.2
|
21
|
-
type: :development
|
22
|
-
prerelease: false
|
23
|
-
version_requirements: !ruby/object:Gem::Requirement
|
24
|
-
requirements:
|
25
|
-
- - "~>"
|
26
|
-
- !ruby/object:Gem::Version
|
27
|
-
version: 0.9.2
|
28
14
|
- !ruby/object:Gem::Dependency
|
29
15
|
name: minitest
|
30
16
|
requirement: !ruby/object:Gem::Requirement
|
31
17
|
requirements:
|
32
18
|
- - "~>"
|
33
19
|
- !ruby/object:Gem::Version
|
34
|
-
version: 5.
|
20
|
+
version: '5.14'
|
35
21
|
type: :development
|
36
22
|
prerelease: false
|
37
23
|
version_requirements: !ruby/object:Gem::Requirement
|
38
24
|
requirements:
|
39
25
|
- - "~>"
|
40
26
|
- !ruby/object:Gem::Version
|
41
|
-
version: 5.
|
27
|
+
version: '5.14'
|
42
28
|
- !ruby/object:Gem::Dependency
|
43
|
-
name:
|
29
|
+
name: rake-compiler
|
44
30
|
requirement: !ruby/object:Gem::Requirement
|
45
31
|
requirements:
|
46
32
|
- - "~>"
|
47
33
|
- !ruby/object:Gem::Version
|
48
|
-
version:
|
34
|
+
version: '1.1'
|
49
35
|
type: :development
|
50
36
|
prerelease: false
|
51
37
|
version_requirements: !ruby/object:Gem::Requirement
|
52
38
|
requirements:
|
53
39
|
- - "~>"
|
54
40
|
- !ruby/object:Gem::Version
|
55
|
-
version:
|
41
|
+
version: '1.1'
|
56
42
|
description: Expose the bundled libstemmer_c library to Ruby.
|
57
43
|
email: oancea@gmail.com
|
58
44
|
executables: []
|
@@ -61,12 +47,17 @@ extensions:
|
|
61
47
|
extra_rdoc_files:
|
62
48
|
- README.rdoc
|
63
49
|
files:
|
50
|
+
- ".gitignore"
|
51
|
+
- ".travis.yml"
|
52
|
+
- Gemfile
|
53
|
+
- Gemfile.lock
|
64
54
|
- MIT-LICENSE
|
65
55
|
- README.rdoc
|
66
56
|
- Rakefile
|
67
57
|
- ext/lingua/extconf.rb
|
68
58
|
- ext/lingua/stemmer.c
|
69
59
|
- lib/lingua/stemmer.rb
|
60
|
+
- lib/lingua/version.rb
|
70
61
|
- libstemmer_c/MANIFEST
|
71
62
|
- libstemmer_c/Makefile
|
72
63
|
- libstemmer_c/Makefile.windows
|
@@ -135,6 +126,8 @@ files:
|
|
135
126
|
- libstemmer_c/src_c/stem_UTF_8_italian.h
|
136
127
|
- libstemmer_c/src_c/stem_UTF_8_latin.c
|
137
128
|
- libstemmer_c/src_c/stem_UTF_8_latin.h
|
129
|
+
- libstemmer_c/src_c/stem_UTF_8_lithuanian.c
|
130
|
+
- libstemmer_c/src_c/stem_UTF_8_lithuanian.h
|
138
131
|
- libstemmer_c/src_c/stem_UTF_8_norwegian.c
|
139
132
|
- libstemmer_c/src_c/stem_UTF_8_norwegian.h
|
140
133
|
- libstemmer_c/src_c/stem_UTF_8_porter.c
|
@@ -151,13 +144,14 @@ files:
|
|
151
144
|
- libstemmer_c/src_c/stem_UTF_8_swedish.h
|
152
145
|
- libstemmer_c/src_c/stem_UTF_8_turkish.c
|
153
146
|
- libstemmer_c/src_c/stem_UTF_8_turkish.h
|
147
|
+
- ruby-stemmer.gemspec
|
154
148
|
- test/helper.rb
|
155
149
|
- test/lingua/test_stemmer.rb
|
156
150
|
homepage: http://github.com/aurelian/ruby-stemmer
|
157
151
|
licenses:
|
158
152
|
- MIT
|
159
153
|
metadata: {}
|
160
|
-
post_install_message:
|
154
|
+
post_install_message:
|
161
155
|
rdoc_options: []
|
162
156
|
require_paths:
|
163
157
|
- lib
|
@@ -165,16 +159,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
165
159
|
requirements:
|
166
160
|
- - ">="
|
167
161
|
- !ruby/object:Gem::Version
|
168
|
-
version:
|
162
|
+
version: 2.4.0
|
169
163
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
170
164
|
requirements:
|
171
165
|
- - ">="
|
172
166
|
- !ruby/object:Gem::Version
|
173
167
|
version: '0'
|
174
168
|
requirements: []
|
175
|
-
|
176
|
-
|
177
|
-
signing_key:
|
169
|
+
rubygems_version: 3.2.0.rc.1
|
170
|
+
signing_key:
|
178
171
|
specification_version: 4
|
179
172
|
summary: Expose libstemmer_c to Ruby.
|
180
173
|
test_files: []
|