slovene_stemmer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 22259aadb0a9aaf285250af01d541904c0397712ca27f67b8ba48a7bcf5a02e4
4
+ data.tar.gz: ace3157849d8f3d26bc99f8e22f77c544fc81f233cad2ed771e377849a1c2bbf
5
+ SHA512:
6
+ metadata.gz: 63f2ddea98119888780145b6da3d5ef80aeb8c364d4fce6371aa94b9dcd34d0641cc70683fb325352aff93abd3f5989b020da475e0070bcaa87a9c1b26ca69d3
7
+ data.tar.gz: 0f2496d0ca5cbfff34c2a94acd09e5af4823ac71f266dc474592475b4ba37490f7ca1312de9cf28f74fa843b33b593b2574876cb05c841afa5c24d39c6e7e6fe
@@ -0,0 +1,43 @@
1
+ require 'yaml'
2
+ require_relative './slovene_stemmer/stem'
3
+
4
+ # Inspired by: http://snowball.tartarus.org/archives/snowball-discuss/0725.html
5
# Suffix-stripping stemmer for Slovene words.
#
# The module extends itself, so every method below is callable directly on
# the module, e.g. SloveneStemmer.stem('word').
module SloveneStemmer
  extend self

  # Reads the word-ending table from config/slovene_stemmer.yml (resolved
  # relative to this file) and groups the endings by their length.
  #
  # NOTE(review): this runs at require time (see WORD_ENDINGS below), so the
  # YAML file has to be shipped with the gem -- confirm it is listed in the
  # gemspec's files.
  #
  # @return [Hash{Integer => Array<String>}] endings keyed by length
  #   (assumes 'word_endings' is a list of strings -- verify against the config)
  # @raise [RuntimeError] when the file is missing, unparsable or lacks the
  #   'word_endings' key; the original error text is appended to the message
  def load_endings
    config_path = File.expand_path('../../config/slovene_stemmer.yml', __FILE__)

    YAML.load_file(config_path)['word_endings'].group_by(&:length)
  rescue => e
    # The message names the file that is actually loaded above.
    raise "Please provide a valid config/slovene_stemmer.yml file, #{e}"
  end

  ALPHABET = 'abcčdefghijklmnoprsštuvzž'.freeze
  VOWELS = 'aeiou'.freeze
  # Every letter of ALPHABET that is not listed in VOWELS.
  CONSONANTS = ALPHABET.delete(VOWELS).freeze
  # Loaded once, when the module is first required.
  WORD_ENDINGS = load_endings

  # Reduces +word+ to its stem by repeatedly trimming its last character.
  #
  # The word is stripped and cleaned of punctuation, then four passes are
  # made. Each pass drops one trailing character for every ending group that
  # matches (only while the word is more than three characters longer than
  # the ending), then one trailing consonant if the word is longer than six
  # characters, then one trailing vowel if it is longer than five.
  #
  # NOTE(review): a matching ending removes a single character, not the whole
  # ending -- confirm this is the intended reading of the reference algorithm.
  #
  # @param word [String] the word to stem
  # @return [String] the stemmed word
  def stem(word)
    candidate = Stem.new(word.strip)
    candidate.remove_symbols!

    # Built once per call instead of once per pass.
    consonants = CONSONANTS.chars
    vowels = VOWELS.chars

    4.times do
      WORD_ENDINGS.each do |ending_length, endings|
        # Keep at least three characters in front of any ending.
        next if candidate.length <= ending_length + 3

        candidate.remove_last_char! if candidate.ends_with?(endings)
      end

      candidate.remove_last_char! if candidate.length > 6 && candidate.ends_with?(consonants)
      candidate.remove_last_char! if candidate.length > 5 && candidate.ends_with?(vowels)
    end

    candidate.to_s
  end
end
@@ -0,0 +1,27 @@
1
module SloveneStemmer
  # Punctuation characters that Stem#remove_symbols! blanks out.
  #
  # Declared at module level on purpose: a constant assigned inside a
  # Struct.new block is scoped to the enclosing module anyway, so this keeps
  # SloveneStemmer::SYMBOLS resolving exactly as before.
  SYMBOLS = %w[, . ! : ; - _ ( ) ?].freeze

  # Matches any single character from SYMBOLS, each taken literally.
  SYMBOL_PATTERN = Regexp.union(SYMBOLS).freeze

  # Mutable wrapper around the word being stemmed.
  Stem = Struct.new(:stem) do
    # Tells whether the lower-cased stem ends with any of the given endings.
    #
    # @param ending_candidates [Enumerable<String>] endings to try
    # @return [Boolean] true when at least one ending matches
    def ends_with?(ending_candidates)
      lowered = stem.downcase # computed once rather than per candidate
      ending_candidates.any? { |ending| lowered.end_with?(ending) }
    end

    # Drops the final character of the stem.
    #
    # @return [String] the shortened stem
    def remove_last_char!
      self.stem = stem[0...-1]
    end

    # Replaces every character listed in SYMBOLS with a space and trims the
    # result, so punctuation at either edge of the word does not leave
    # whitespace behind that would prevent any suffix from matching.
    #
    # A new string is assigned instead of editing the current one in place:
    # the string handed to Stem.new is left untouched and may be frozen.
    #
    # @return [String] the cleaned stem
    def remove_symbols!
      self.stem = stem.gsub(SYMBOL_PATTERN, ' ').strip
    end

    # @return [Integer] number of characters in the stem
    def length
      stem.length
    end

    # @return [String] the current stem
    def to_s
      stem
    end
  end
end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: slovene_stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Tadej Hribar
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-01-02 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: tadej.996@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/slovene_stemmer.rb
20
+ - lib/slovene_stemmer/stem.rb
21
+ homepage: https://rubygems.org/gems/slovene_stemmer
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubygems_version: 3.1.4
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: A basic stemmer for the Slovene language.
44
+ test_files: []