slovene_stemmer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/slovene_stemmer.rb +43 -0
- data/lib/slovene_stemmer/stem.rb +27 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 22259aadb0a9aaf285250af01d541904c0397712ca27f67b8ba48a7bcf5a02e4
|
4
|
+
data.tar.gz: ace3157849d8f3d26bc99f8e22f77c544fc81f233cad2ed771e377849a1c2bbf
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 63f2ddea98119888780145b6da3d5ef80aeb8c364d4fce6371aa94b9dcd34d0641cc70683fb325352aff93abd3f5989b020da475e0070bcaa87a9c1b26ca69d3
|
7
|
+
data.tar.gz: 0f2496d0ca5cbfff34c2a94acd09e5af4823ac71f266dc474592475b4ba37490f7ca1312de9cf28f74fa843b33b593b2574876cb05c841afa5c24d39c6e7e6fe
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require_relative './slovene_stemmer/stem'
|
3
|
+
|
4
|
+
# Inspired by: http://snowball.tartarus.org/archives/snowball-discuss/0725.html
|
5
|
+
module SloveneStemmer
|
6
|
+
extend self
|
7
|
+
|
8
|
+
# Reads the stemmer's word endings from config/slovene_stemmer.yml, resolved
# two directory levels above this file.
#
# @return [Hash] the entries under the `word_endings` key, grouped by their
#   `length` (entries are presumably strings — confirm against the config file)
# @raise [RuntimeError] when the file cannot be loaded or its `word_endings`
#   entry cannot be grouped; the underlying error text is appended
def load_endings
  config_path = File.expand_path('../../config/slovene_stemmer.yml', __FILE__)

  YAML.load_file(config_path)['word_endings'].group_by(&:length)
rescue => e
  # Name the file that is actually read (and where it was looked for), so the
  # message points at the real problem instead of a non-existent stemmer.yml.
  raise "Please provide a valid config/slovene_stemmer.yml file (expected at #{config_path}), #{e}"
end
|
15
|
+
|
16
|
+
# Lowercase letters the stemmer works with.
ALPHABET = 'abcčdefghijklmnoprsštuvzž'.freeze
# Letters treated as vowels by the final trimming step.
VOWELS = 'aeiou'.freeze
# Every ALPHABET letter that is not listed in VOWELS.
CONSONANTS = ALPHABET.delete(VOWELS).freeze
|
19
|
+
# Word endings grouped by length, loaded once while the module body is evaluated.
WORD_ENDINGS = load_endings
|
20
|
+
|
21
|
+
# Reduces +word+ to a stem by repeatedly trimming its last character.
#
# Four passes are made. In each pass, for every group of configured endings,
# one trailing character is dropped when the word is longer than the ending
# length plus three and ends with one of that group's endings. Then a trailing
# consonant is dropped from words longer than six characters, and a trailing
# vowel from words longer than five.
#
# @param word [String] the word to stem; surrounding whitespace is ignored
# @return [String] the stemmed word
def stem(word)
  candidate = Stem.new(word.strip)
  candidate.remove_symbols!
  # Symbol removal leaves spaces where punctuation was; trim again so a
  # trailing "." or "," does not hide the real ending from the checks below
  # or leak into the result as padding.
  candidate.stem = candidate.to_s.strip

  # Loop-invariant candidate lists, built once instead of on every pass.
  consonants = CONSONANTS.chars
  vowels = VOWELS.chars

  4.times do
    WORD_ENDINGS.each do |ending_length, endings|
      next if candidate.length <= ending_length + 3

      candidate.remove_last_char! if candidate.ends_with?(endings)
    end

    if candidate.length > 6 && candidate.ends_with?(consonants)
      candidate.remove_last_char!
    end

    if candidate.length > 5 && candidate.ends_with?(vowels)
      candidate.remove_last_char!
    end
  end

  candidate.to_s
end
|
43
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module SloveneStemmer
  # Punctuation that Stem#remove_symbols! replaces with spaces. Declared at
  # module level on purpose: a constant assigned inside the Struct.new block
  # ends up on the enclosing module anyway, so this keeps the same location
  # while making it explicit.
  SYMBOLS = %w[, . ! : ; - _ ( ) ?].freeze

  # Mutable wrapper around the word being stemmed.
  Stem = Struct.new(:stem) do
    # Case-insensitive suffix check.
    #
    # @param ending_candidates [Array<String>] lowercase endings to test
    # @return [Boolean] true when the downcased word ends with any candidate
    def ends_with?(ending_candidates)
      # Downcase once and let end_with? try every candidate.
      stem.downcase.end_with?(*ending_candidates)
    end

    # Drops the final character of the wrapped word.
    def remove_last_char!
      self.stem = stem[0...-1]
    end

    # Replaces every SYMBOLS character with a space.
    #
    # Works on a copy, so the string handed to Stem.new is never mutated and
    # frozen strings are accepted.
    def remove_symbols!
      self.stem = stem.gsub(Regexp.union(SYMBOLS), ' ')
    end

    # @return [Integer] number of characters in the wrapped word
    def length
      stem.length
    end

    # @return [String] the wrapped word
    def to_s
      stem
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: slovene_stemmer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tadej Hribar
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-01-02 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email: tadej.996@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/slovene_stemmer.rb
|
20
|
+
- lib/slovene_stemmer/stem.rb
|
21
|
+
homepage: https://rubygems.org/gems/slovene_stemmer
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubygems_version: 3.1.4
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: A basic stemmer for Slovene language.
|
44
|
+
test_files: []
|