greeklish 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +66 -0
- data/Rakefile +2 -0
- data/greeklish.gemspec +24 -0
- data/lib/greeklish/greek_reverse_stemmer.rb +112 -0
- data/lib/greeklish/greeklish_converter.rb +83 -0
- data/lib/greeklish/greeklish_generator.rb +146 -0
- data/lib/greeklish/version.rb +3 -0
- data/lib/greeklish.rb +11 -0
- data/spec/greeklish_converter_spec.rb +98 -0
- data/spec/greeklish_generator_spec.rb +64 -0
- data/spec/greeklish_reverse_stemmer_spec.rb +46 -0
- data/spec/greeklish_spec.rb +14 -0
- data/spec/spec_helper.rb +91 -0
- metadata +109 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1ab370892f9c7e570ab8f9407d58534cdb3be66a
|
4
|
+
data.tar.gz: 36cfd3be6b235801e733adbccf7935917a12d929
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c5298e282e8a5b831086eda8a6156224a8c92a0ebb456a7b2980f7998ffac4b1faf53dc4736d5eb0f20e687bcd8b0b778d99f299ed4a6e5388940e55c332f74e
|
7
|
+
data.tar.gz: 83762172ec19a073afd9545dd097d81d5dece088ad0c5df28482fdf9c6be96237c9359dbf0808415a8f60178650433082f88246d609c86b2af5b8c88320f4c7c
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Petros Markou
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# Greeklish
|
2
|
+
|
3
|
+
Generate greeklish forms from Greek words.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'greeklish'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install greeklish
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
Obtain an instance of `GreeklishConverter` as follows:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
converter = Greeklish.converter(max_expansions: 2,
|
27
|
+
generate_greek_variants: false)
|
28
|
+
|
29
|
+
greeklish_words = converter.convert('ομπρελα') # => ["omprela", "obrela"]
|
30
|
+
|
31
|
+
```
|
32
|
+
|
33
|
+
The option `max_expansions` denotes the maximum greeklish expansions for
|
34
|
+
each Greek word, e.g.:
|
35
|
+
|
36
|
+
```ruby
|
37
|
+
converter = Greeklish.converter(max_expansions: 4,
|
38
|
+
generate_greek_variants: false)
|
39
|
+
|
40
|
+
converter.convert('αυτοκινητο') # =>
|
41
|
+
["autokinhto", "aftokinhto", "avtokinhto", "aytokinhto"]
|
42
|
+
```
|
43
|
+
|
44
|
+
The option `generate_greek_variants` denotes if greek variants should
|
45
|
+
be generated, e.g.:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
converter = Greeklish.converter(max_expansions: 2,
|
49
|
+
generate_greek_variants: true)
|
50
|
+
|
51
|
+
converter.convert('αμαξι') # =>
|
52
|
+
["amaksi", "amaxi", "amaksiou", "amaxiou", "amaksia", "amaxia",
|
53
|
+
"amaksiwn", "amaxiwn"]
|
54
|
+
```
|
55
|
+
|
56
|
+
## Credits
|
57
|
+
|
58
|
+
Based on: [elasticsearch-analysis-greeklish](https://github.com/skroutz/elasticsearch-analysis-greeklish)
|
59
|
+
|
60
|
+
## Contributing
|
61
|
+
|
62
|
+
1. Fork it ( https://github.com/skroutz/greeklish/fork )
|
63
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
64
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
65
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
66
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/greeklish.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'greeklish/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "greeklish"
|
8
|
+
spec.version = Greeklish::VERSION
|
9
|
+
spec.authors = ["Petros Markou"]
|
10
|
+
spec.email = ["markoupetr@skroutz.gr"]
|
11
|
+
spec.summary = %q{Generates greeklish forms}
|
12
|
+
spec.description = %q{Configurable generator of Greek words to greeklish forms.}
|
13
|
+
spec.homepage = "https://github.com/skroutz/greeklish"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files -z`.split("\x0")
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_development_dependency "bundler", "~> 1.7"
|
22
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
23
|
+
spec.add_development_dependency "rspec", "~> 3.1.0"
|
24
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
module Greeklish
  # Generates singular/plural variants of a greek word by swapping its
  # ending according to a fixed table of suffix rules.
  #
  # The suffix table is written with lowercase, unaccented letters and uses
  # "σ" (never "ς") at the end of a word, so tokens must be in that form
  # for a rule to match.
  class GreekReverseStemmer

    # Suffixes that trigger the generation of variants.
    SUFFIX_MATOS = "ματοσ"
    SUFFIX_MATA = "ματα"
    SUFFIX_MATWN = "ματων"
    SUFFIX_AS = "ασ"
    SUFFIX_EIA = "εια"
    SUFFIX_EIO = "ειο"
    SUFFIX_EIOY = "ειου"
    SUFFIX_EIWN = "ειων"
    SUFFIX_IOY = "ιου"
    SUFFIX_IA = "ια"
    SUFFIX_IWN = "ιων"
    SUFFIX_OS = "οσ"
    SUFFIX_OI = "οι"
    SUFFIX_EIS = "εισ"
    SUFFIX_ES = "εσ"
    SUFFIX_HS = "ησ"
    SUFFIX_WN = "ων"
    SUFFIX_OY = "ου"
    SUFFIX_O = "ο"
    SUFFIX_H = "η"
    SUFFIX_A = "α"
    SUFFIX_I = "ι"

    # The suffix rules. The first element of each rule is the suffix to
    # look for, the remaining elements are the endings that replace it.
    #
    # Order matters: only the FIRST rule whose suffix matches is applied,
    # so longer suffixes are listed before the shorter ones they contain.
    SUFFIX_STRINGS = [
      [SUFFIX_MATOS, "μα", "ματων", "ματα"],
      [SUFFIX_MATA, "μα", "ματων", "ματοσ"],
      [SUFFIX_MATWN, "μα", "ματα", "ματοσ"],
      [SUFFIX_AS, "α", "ων", "εσ"],
      [SUFFIX_EIA, "ειο", "ειων", "ειου", "ειασ"],
      [SUFFIX_EIO, "εια", "ειων", "ειου"],
      [SUFFIX_EIOY, "εια", "ειου", "ειο", "ειων"],
      [SUFFIX_EIWN, "εια", "ειου", "ειο", "ειασ"],
      [SUFFIX_IOY, "ι", "ια", "ιων", "ιο"],
      [SUFFIX_IA, "ιου", "ι", "ιων", "ιασ", "ιο"],
      [SUFFIX_IWN, "ιου", "ια", "ι", "ιο"],
      [SUFFIX_OS, "η", "ουσ", "ου", "οι", "ων"],
      [SUFFIX_OI, "οσ", "ου", "ων"],
      [SUFFIX_EIS, "η", "ησ", "εων"],
      [SUFFIX_ES, "η", "ασ", "ων", "ησ", "α"],
      [SUFFIX_HS, "ων", "εσ", "η", "εων"],
      # "α" used to be listed twice here, which emitted the same variant twice.
      [SUFFIX_WN, "οσ", "εσ", "α", "η", "ησ", "ου", "οι", "ο"],
      [SUFFIX_OY, "ων", "α", "ο", "οσ"],
      [SUFFIX_O, "α", "ου", "εων", "ων"],
      [SUFFIX_H, "οσ", "ουσ", "εων", "εισ", "ησ", "ων"],
      [SUFFIX_A, "ο" , "ου", "ων", "ασ", "εσ"],
      [SUFFIX_I, "ιου", "ια", "ιων"]
    ]

    # Hash from each handled suffix to the list of endings that replace it.
    attr_reader :suffixes

    # The variants produced by the most recent call to
    # #generate_greek_variants (the original word comes first).
    attr_reader :greek_words

    def initialize
      @suffixes = {}
      @greek_words = []

      # Index the rule table by suffix for direct lookup.
      SUFFIX_STRINGS.each do |rule|
        @suffixes[rule[0]] = rule[1..-1]
      end
    end

    # Generates the greek variants of the given greek token.
    #
    # @param token_string the greek word
    # @return a list holding the word itself followed by its generated
    #   variations; only the word itself when no suffix rule matches
    def generate_greek_variants(token_string)
      # Use a fresh list per call instead of clearing a shared one, so a
      # result handed out earlier is not rewritten by the next call.
      @greek_words = [token_string]

      # Only the first matching rule is applied.
      matching_rule = SUFFIX_STRINGS.find do |rule|
        token_string.end_with?(rule[0])
      end
      generate_more_greek_words(token_string, matching_rule[0]) if matching_rule

      greek_words
    end

    # Appends to the current variant list one word per replacement ending
    # registered for +input_suffix+.
    #
    # @param input_token the word whose ending is replaced
    # @param input_suffix the suffix that matched; must be a key of #suffixes
    def generate_more_greek_words(input_token, input_suffix)
      # \z anchors at the very end of the string ("$" would also match
      # before an embedded newline) and the suffix is escaped so that it is
      # always treated as literal text.
      ending = /#{Regexp.escape(input_suffix)}\z/

      suffixes[input_suffix].each do |suffix|
        @greek_words << input_token.sub(ending) { suffix }
      end
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
module Greeklish
  # Converts a greek token to its greeklish (latin character) spellings.
  #
  # Optionally the singular/plural variants of the token are generated
  # first, and every variant is then converted. A Greek character may have
  # one or more latin counterparts, so one Greek token can yield several
  # latin tokens.
  class GreeklishConverter

    # Tokens that contain only these characters will be converted.
    # Note that uppercase and accented letters and the final sigma ("ς")
    # are not part of this set.
    GREEK_CHARACTERS = "αβγδεζηθικλμνξοπρστυφχψω"

    # The greek words handed to the greeklish generator by the most recent
    # conversion.
    attr_reader :greek_words

    # Kept for compatibility; nothing in this class assigns it.
    attr_reader :token_string

    # Instance of the reverse stemmer that generates the word variants
    # of the greek token.
    attr_reader :reverse_stemmer

    # Instance of the greeklish generator that generates the greeklish
    # words from the greek words.
    attr_reader :greeklish_generator

    # Whether greek variants are generated before the conversion.
    attr_reader :generate_greek_variants

    # @param max_expansions the maximum greeklish expansions per greek
    #   word, passed to the greeklish generator
    # @param generate_greek_variants whether the variants of each token
    #   should be generated and converted as well
    def initialize(max_expansions, generate_greek_variants)
      @greek_words = []
      @reverse_stemmer = GreekReverseStemmer.new
      @greeklish_generator = GreeklishGenerator.new(max_expansions)
      @generate_greek_variants = generate_greek_variants
    end

    # The actual conversion is happening here.
    #
    # @param input_token the Greek token
    # @return a list of the generated strings, or nil when the token is
    #   not made up exclusively of the characters in GREEK_CHARACTERS
    def convert(input_token)
      return nil unless identify_greek_word(input_token)

      # Always start from this token alone. Appending to the list of the
      # previous call made every conversion also return the greeklish
      # forms of all tokens converted before it.
      @greek_words =
        if generate_greek_variants
          reverse_stemmer.generate_greek_variants(input_token)
        else
          [input_token]
        end

      return nil if greek_words.empty?

      greeklish_generator.generate_greeklish_words(greek_words)
    end

    # Identifies words with only Greek lowercase characters.
    #
    # @param input the string to examine
    # @return true if every character of the string is included in
    #   GREEK_CHARACTERS (which also holds for an empty string)
    def identify_greek_word(input)
      input.each_char.all? { |char| GREEK_CHARACTERS.include?(char) }
    end
  end
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
module Greeklish
  # Generates the greeklish (latin character) spellings of greek words.
  #
  # Digraphs are first collapsed to a single placeholder character; then
  # the word is expanded character by character, combining the latin
  # alternatives of each character up to +max_expansions+ tokens per word.
  class GreeklishGenerator

    # Placeholders, one per digraph. They are capital Greek letters, so a
    # capital letter in the input is indistinguishable from a placeholder.
    AI = "Α"
    EI = "Ε"
    OI = "Ο"
    OY = "Υ"
    EY = "Φ"
    AY = "Β"
    MP = "Μ"
    GG = "Γ"
    GK = "Κ"
    NT = "Ν"

    # Hash from each digraph to the placeholder that replaces it.
    attr_accessor :digraphs

    # Hash from each placeholder / Greek character to the latin strings
    # that can replace it. Unknown keys yield a new empty list.
    attr_accessor :conversions

    # The possible digraph cases.
    DIGRAPH_CASES = [
      ["αι", AI], ["ει", EI], ["οι", OI], ["ου", OY],
      ["ευ", EY], ["αυ", AY], ["μπ", MP], ["γγ", GG],
      ["γκ", GK], ["ντ", NT]
    ]

    # The possible string conversions for each case. Alternatives are
    # tried in the listed order, so the first ones survive when the
    # expansion limit is reached.
    CONVERT_STRINGS = [
      [AI, "ai", "e"], [EI, "ei", "i"], [OI, "oi", "i"],
      [OY, "ou", "oy", "u"], [EY, "eu", "ef", "ev", "ey"],
      [AY, "au", "af", "av", "ay"], [MP, "mp", "b"],
      [GG, "gg", "g"], [GK, "gk", "g"], [NT, "nt", "d"],
      ["α", "a"], ["β", "b", "v"], ["γ", "g"], ["δ", "d"],
      ["ε", "e"], ["ζ", "z"], ["η", "h", "i"], ["θ", "th"],
      ["ι", "i"], ["κ", "k"], ["λ", "l"], ["μ", "m"],
      ["ν", "n"], ["ξ", "ks", "x"], ["ο", "o"], ["π", "p"],
      ["ρ", "r"], ["σ", "s"], ["τ", "t"], ["υ", "y", "u", "i"],
      ["φ", "f", "ph"], ["χ", "x", "h", "ch"], ["ψ", "ps"],
      ["ω", "w", "o", "v"]
    ]

    # The maximum greeklish expansions per greek token.
    attr_reader :max_expansions

    # The greeklish tokens of the word currently being converted.
    attr_reader :per_word_greeklish

    # The greeklish tokens collected by the most recent call to
    # #generate_greeklish_words, grouped per greek word.
    attr_reader :greeklish_list

    # @param max_expansions the maximum number of greeklish tokens to
    #   generate for each greek word
    def initialize(max_expansions)
      @max_expansions = max_expansions
      @greeklish_list = []
      @per_word_greeklish = []
      @digraphs = {}
      # A default block hands out a new list on every miss. Hash.new([])
      # would share one array between all unknown keys, so a caller
      # appending to it would change the default for everybody.
      @conversions = Hash.new { [] }

      DIGRAPH_CASES.each do |digraph, placeholder|
        @digraphs[digraph] = placeholder
      end

      CONVERT_STRINGS.each do |entry|
        @conversions[entry[0]] = entry[1..-1]
      end
    end

    # Gets a list of greek words and generates the greeklish version of
    # each word.
    #
    # @param greek_words a list of greek words
    # @return a flat list of greeklish words, in the order of the input
    #   words
    def generate_greeklish_words(greek_words)
      @greeklish_list.clear

      greek_words.each do |greek_word|
        @per_word_greeklish.clear

        # Collapse every digraph to its placeholder, in table order.
        collapsed = digraphs.reduce(greek_word) do |word, (digraph, placeholder)|
          word.gsub(digraph, placeholder)
        end

        collapsed.each_char do |greek_char|
          candidates = conversions[greek_char]
          # A character without any conversion is copied through as is.
          # Passing the empty list on would throw away the tokens built
          # so far and silently restart the word at the next character.
          candidates = [greek_char] if candidates.nil? || candidates.empty?

          add_character(candidates)
        end

        @greeklish_list << per_word_greeklish.flatten
      end

      @greeklish_list.flatten
    end

    private

    # Add the matching latin characters to the generated greeklish tokens
    # for a specific Greek character. For each different combination of
    # latin characters a new token is generated, until +max_expansions+
    # tokens exist.
    #
    # @param convert_strings the latin strings that will be added to the tokens
    def add_character(convert_strings)
      if per_word_greeklish.empty?
        # First character: every alternative starts a token of its own.
        convert_strings.each do |convert_string|
          break if per_word_greeklish.size >= max_expansions

          @per_word_greeklish << convert_string
        end
      else
        new_tokens = []

        # Alternatives form the outer loop, so all existing tokens are
        # extended with the first alternative before the second is used.
        convert_strings.each do |convert_string|
          break if new_tokens.size >= max_expansions

          per_word_greeklish.each do |token|
            break if new_tokens.size >= max_expansions

            new_tokens << "#{token}#{convert_string}"
          end
        end

        @per_word_greeklish = new_tokens
      end
    end
  end
end
|
data/lib/greeklish.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require "greeklish/version"
|
2
|
+
require "greeklish/greeklish_generator"
|
3
|
+
require "greeklish/greek_reverse_stemmer"
|
4
|
+
require "greeklish/greeklish_converter"
|
5
|
+
|
6
|
+
module Greeklish
  # Builds a GreeklishConverter from an options hash.
  #
  # @param options a hash with the keys
  #   :max_expansions (required, a number) - the maximum greeklish
  #   expansions per greek word, and
  #   :generate_greek_variants (optional) - whether greek variants of each
  #   word should be generated as well; a missing value counts as false
  # @return the configured GreeklishConverter
  # @raise [ArgumentError] when :max_expansions is missing or not a number
  def self.converter(options={})
    max_expansions = options[:max_expansions]

    # The generator compares token counts against this value. A missing
    # option used to surface only on the first conversion, as
    # "comparison of Integer with nil failed"; report it here instead.
    unless max_expansions.is_a?(Numeric)
      raise ArgumentError,
            "max_expansions must be a number, got #{max_expansions.inspect}"
    end

    GreeklishConverter.new(max_expansions,
                           options[:generate_greek_variants])
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe 'GreeklishConverter' do
|
5
|
+
max_expansions = 10
|
6
|
+
generate_greek_variants = true
|
7
|
+
|
8
|
+
# a sample of greek words to generate their greeklish
|
9
|
+
# counterparts.
|
10
|
+
greek_words = ["αυτοκινητο", "ομπρελα", "ξεσκεπαστοσ"]
|
11
|
+
|
12
|
+
# the greeklish counterparts that should be generated from the greek words.
|
13
|
+
generated_greeklish_words = [
|
14
|
+
["autokinhto", "aftokinhto", "avtokinhto", "aytokinhto",
|
15
|
+
"autokinito", "aftokinito", "avtokinito", "aytokinito",
|
16
|
+
"autokinhtwn", "aftokinhta", "avtokinhta", "aytokinhtwn"],
|
17
|
+
["omprela", "obrela", "ompreles", "obrelwn", "obreles", "omprelas"],
|
18
|
+
["kseskepastos", "xeskepastos", "kseskepastou", "xeskepastwn", "kseskepastoi"]
|
19
|
+
]
|
20
|
+
|
21
|
+
# these words should not be processed by the converter.
|
22
|
+
invalid_words = ["mobile", "αυριο64", "καλάθι", "ΣΠιτι", "ομορφος" ]
|
23
|
+
|
24
|
+
before(:each) do
|
25
|
+
@greeklish_words = []
|
26
|
+
@converted_greeklish_strings = []
|
27
|
+
end
|
28
|
+
|
29
|
+
after(:each) do
|
30
|
+
@converted_greeklish_strings = []
|
31
|
+
@greeklish_words = []
|
32
|
+
end
|
33
|
+
|
34
|
+
it "does not convert invalid words" do
|
35
|
+
converter = Greeklish::GreeklishConverter.new(max_expansions, generate_greek_variants)
|
36
|
+
|
37
|
+
invalid_words.each do |invalid_word|
|
38
|
+
@greeklish_words = converter.convert(invalid_word)
|
39
|
+
expect(@greeklish_words.nil?).to eq(true)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
it "converts valid words" do
|
44
|
+
converter = Greeklish::GreeklishConverter.new(max_expansions, generate_greek_variants)
|
45
|
+
|
46
|
+
greek_words.each_with_index do |word, i|
|
47
|
+
@greeklish_words = converter.convert(greek_words[i])
|
48
|
+
populate_converted_strings_list
|
49
|
+
|
50
|
+
expect(@greeklish_words.empty?).to eq(false)
|
51
|
+
|
52
|
+
generated_greeklish_words[i].each do |greeklish_word|
|
53
|
+
expect(@converted_greeklish_strings.include?(greeklish_word)).to eq(true)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
it "respects max expansions" do
|
59
|
+
new_max_expansions = 2
|
60
|
+
generate_greek_variants = false
|
61
|
+
converter = Greeklish::GreeklishConverter.new(new_max_expansions, generate_greek_variants)
|
62
|
+
|
63
|
+
@greeklish_words = converter.convert(greek_words[0])
|
64
|
+
|
65
|
+
populate_converted_strings_list()
|
66
|
+
|
67
|
+
expect(@greeklish_words.size).to eq(new_max_expansions)
|
68
|
+
|
69
|
+
for i in 0..new_max_expansions-1 do
|
70
|
+
expect(@converted_greeklish_strings.include?(generated_greeklish_words[0][i])).to eq(true)
|
71
|
+
end
|
72
|
+
|
73
|
+
for j in new_max_expansions..generated_greeklish_words[0].length - 1 do
|
74
|
+
expect(@converted_greeklish_strings.include?(generated_greeklish_words[0][j])).to eq(false)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
it "respects variant generation" do
|
79
|
+
new_max_expansions = 1
|
80
|
+
generate_greek_variants = false
|
81
|
+
converter = Greeklish::GreeklishConverter.new(new_max_expansions, generate_greek_variants)
|
82
|
+
|
83
|
+
@greeklish_words = converter.convert(greek_words[0])
|
84
|
+
|
85
|
+
populate_converted_strings_list()
|
86
|
+
|
87
|
+
expect(@converted_greeklish_strings.include?(generated_greeklish_words[0][0])).to eq(true)
|
88
|
+
expect(@converted_greeklish_strings.include?(generated_greeklish_words[0][9])).to eq(false)
|
89
|
+
end
|
90
|
+
|
91
|
+
private
|
92
|
+
|
93
|
+
def populate_converted_strings_list
|
94
|
+
@greeklish_words.each do |word|
|
95
|
+
@converted_greeklish_strings << word
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe 'GreeklishGenerator' do
  max_expansions = 10

  # a sample of greek words to generate their greeklish
  # counterparts.
  greek_words = ["αυτοκινητο", "ομπρελα", "ξεσκεπαστοσ"]

  # the greeklish counterparts that should be generated
  # from the greek words.
  generated_greeklish_words = [
    "autokinhto", "aftokinhto", "avtokinhto", "aytokinhto",
    "autokinito", "aftokinito", "avtokinito", "aytokinito",
    "omprela", "obrela", "kseskepastos", "xeskepastos"
  ]

  it "converts valid words" do
    generator = Greeklish::GreeklishGenerator.new(max_expansions)

    # One call converts the whole list; there is no need to repeat it
    # once per word.
    greeklish_words = generator.generate_greeklish_words(greek_words)

    expect(greeklish_words).not_to be_empty
    expect(greeklish_words).to include(*generated_greeklish_words)
  end

  it "respects the max expansion setting" do
    new_max_expansions = 2
    generator = Greeklish::GreeklishGenerator.new(new_max_expansions)

    # Convert the real sample words. With an empty input list this
    # example only compared 0 with 0. Each sample word has at least two
    # possible spellings, so each must be capped at exactly the limit.
    greeklish_words = generator.generate_greeklish_words(greek_words)

    expect(greeklish_words.size).to eq(new_max_expansions * greek_words.size)
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe 'GreeklishReverseStemmer' do
|
5
|
+
# Some greek words whose variations we want to produce.
|
6
|
+
greek_words = ["κουρεματοσ", "ενδυματα", "γραφειου", "πεδιου",
|
7
|
+
"γραναζι", "ποδηλατα", "καλωδιων"]
|
8
|
+
|
9
|
+
# Words that should not match to any rule.
|
10
|
+
non_matching_words = ["σουτιεν", "κολλαν", "αμπαλαζ", "μακιγιαζ"]
|
11
|
+
|
12
|
+
# The output we expect for each of the above words.
|
13
|
+
greek_variants = [
|
14
|
+
["κουρεμα", "κουρεματων", "κουρεματα"],
|
15
|
+
["ενδυμα", "ενδυματων", "ενδυματα", "ενδυματοσ"],
|
16
|
+
["γραφειο", "γραφεια", "γραφειων"],
|
17
|
+
["πεδια", "πεδιο", "πεδιων"],
|
18
|
+
["γραναζια", "γραναζιου", "γραναζιων"],
|
19
|
+
["ποδηλατο", "ποδηλατου", "ποδηλατα", "ποδηλατων"],
|
20
|
+
["καλωδιου", "καλωδια", "καλωδιο"]
|
21
|
+
]
|
22
|
+
|
23
|
+
before(:all) do
|
24
|
+
@reverse_stemmer = Greeklish::GreekReverseStemmer.new
|
25
|
+
end
|
26
|
+
|
27
|
+
it "produces greek variants" do
|
28
|
+
greek_words.each_with_index do |word, index|
|
29
|
+
generated_greek_variants = @reverse_stemmer.generate_greek_variants(word)
|
30
|
+
|
31
|
+
expect(generated_greek_variants.size > 1).to eq(true)
|
32
|
+
|
33
|
+
greek_variants[index].each do |greek_variant|
|
34
|
+
expect(generated_greek_variants.include?(greek_variant)).to eq(true)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
it "does not produce variants for non matching words" do
|
40
|
+
non_matching_words.each do |non_matching_word|
|
41
|
+
generated_greek_variants = @reverse_stemmer.generate_greek_variants(non_matching_word)
|
42
|
+
|
43
|
+
expect(generated_greek_variants.size).to eq(1)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe 'Greeklish' do
|
5
|
+
it "correctly converts to greeklish" do
|
6
|
+
converter = Greeklish.converter(max_expansions: 2,
|
7
|
+
generate_greek_variants: false)
|
8
|
+
|
9
|
+
words = converter.convert("ομπρελα")
|
10
|
+
|
11
|
+
expect(words.length).to eq(2)
|
12
|
+
expect(words).to include("omprela", "obrela")
|
13
|
+
end
|
14
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'greeklish'
|
2
|
+
|
3
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
4
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
5
|
+
# The generated `.rspec` file contains `--require spec_helper` which will cause this
|
6
|
+
# file to always be loaded, without a need to explicitly require it in any files.
|
7
|
+
#
|
8
|
+
# Given that it is always loaded, you are encouraged to keep this file as
|
9
|
+
# light-weight as possible. Requiring heavyweight dependencies from this file
|
10
|
+
# will add to the boot time of your test suite on EVERY test run, even for an
|
11
|
+
# individual file that may not need all of that loaded. Instead, consider making
|
12
|
+
# a separate helper file that requires the additional dependencies and performs
|
13
|
+
# the additional setup, and require it from the spec files that actually need it.
|
14
|
+
#
|
15
|
+
# The `.rspec` file also contains a few flags that are not defaults but that
|
16
|
+
# users commonly want.
|
17
|
+
#
|
18
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
19
|
+
RSpec.configure do |config|
|
20
|
+
# rspec-expectations config goes here. You can use an alternate
|
21
|
+
# assertion/expectation library such as wrong or the stdlib/minitest
|
22
|
+
# assertions if you prefer.
|
23
|
+
config.expect_with :rspec do |expectations|
|
24
|
+
# This option will default to `true` in RSpec 4. It makes the `description`
|
25
|
+
# and `failure_message` of custom matchers include text for helper methods
|
26
|
+
# defined using `chain`, e.g.:
|
27
|
+
# be_bigger_than(2).and_smaller_than(4).description
|
28
|
+
# # => "be bigger than 2 and smaller than 4"
|
29
|
+
# ...rather than:
|
30
|
+
# # => "be bigger than 2"
|
31
|
+
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
32
|
+
end
|
33
|
+
|
34
|
+
# rspec-mocks config goes here. You can use an alternate test double
|
35
|
+
# library (such as bogus or mocha) by changing the `mock_with` option here.
|
36
|
+
config.mock_with :rspec do |mocks|
|
37
|
+
# Prevents you from mocking or stubbing a method that does not exist on
|
38
|
+
# a real object. This is generally recommended, and will default to
|
39
|
+
# `true` in RSpec 4.
|
40
|
+
mocks.verify_partial_doubles = true
|
41
|
+
end
|
42
|
+
|
43
|
+
# The settings below are suggested to provide a good initial experience
|
44
|
+
# with RSpec, but feel free to customize to your heart's content.
|
45
|
+
=begin
|
46
|
+
# These two settings work together to allow you to limit a spec run
|
47
|
+
# to individual examples or groups you care about by tagging them with
|
48
|
+
# `:focus` metadata. When nothing is tagged with `:focus`, all examples
|
49
|
+
# get run.
|
50
|
+
config.filter_run :focus
|
51
|
+
config.run_all_when_everything_filtered = true
|
52
|
+
|
53
|
+
# Limits the available syntax to the non-monkey patched syntax that is recommended.
|
54
|
+
# For more details, see:
|
55
|
+
# - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
|
56
|
+
# - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
|
57
|
+
# - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
|
58
|
+
config.disable_monkey_patching!
|
59
|
+
|
60
|
+
# This setting enables warnings. It's recommended, but in some cases may
|
61
|
+
# be too noisy due to issues in dependencies.
|
62
|
+
config.warnings = true
|
63
|
+
|
64
|
+
# Many RSpec users commonly either run the entire suite or an individual
|
65
|
+
# file, and it's useful to allow more verbose output when running an
|
66
|
+
# individual spec file.
|
67
|
+
if config.files_to_run.one?
|
68
|
+
# Use the documentation formatter for detailed output,
|
69
|
+
# unless a formatter has already been configured
|
70
|
+
# (e.g. via a command-line flag).
|
71
|
+
config.default_formatter = 'doc'
|
72
|
+
end
|
73
|
+
|
74
|
+
# Print the 10 slowest examples and example groups at the
|
75
|
+
# end of the spec run, to help surface which specs are running
|
76
|
+
# particularly slow.
|
77
|
+
config.profile_examples = 10
|
78
|
+
|
79
|
+
# Run specs in random order to surface order dependencies. If you find an
|
80
|
+
# order dependency and want to debug it, you can fix the order by providing
|
81
|
+
# the seed, which is printed after each run.
|
82
|
+
# --seed 1234
|
83
|
+
config.order = :random
|
84
|
+
|
85
|
+
# Seed global randomization in this process using the `--seed` CLI option.
|
86
|
+
# Setting this allows you to use `--seed` to deterministically reproduce
|
87
|
+
# test failures related to randomization by passing the same `--seed` value
|
88
|
+
# as the one that triggered the failure.
|
89
|
+
Kernel.srand config.seed
|
90
|
+
=end
|
91
|
+
end
|
metadata
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: greeklish
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Petros Markou
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 3.1.0
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 3.1.0
|
55
|
+
description: Configurable generator of Greek words to greeklish forms.
|
56
|
+
email:
|
57
|
+
- markoupetr@skroutz.gr
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- ".gitignore"
|
63
|
+
- ".rspec"
|
64
|
+
- Gemfile
|
65
|
+
- LICENSE.txt
|
66
|
+
- README.md
|
67
|
+
- Rakefile
|
68
|
+
- greeklish.gemspec
|
69
|
+
- lib/greeklish.rb
|
70
|
+
- lib/greeklish/greek_reverse_stemmer.rb
|
71
|
+
- lib/greeklish/greeklish_converter.rb
|
72
|
+
- lib/greeklish/greeklish_generator.rb
|
73
|
+
- lib/greeklish/version.rb
|
74
|
+
- spec/greeklish_converter_spec.rb
|
75
|
+
- spec/greeklish_generator_spec.rb
|
76
|
+
- spec/greeklish_reverse_stemmer_spec.rb
|
77
|
+
- spec/greeklish_spec.rb
|
78
|
+
- spec/spec_helper.rb
|
79
|
+
homepage: https://github.com/skroutz/greeklish
|
80
|
+
licenses:
|
81
|
+
- MIT
|
82
|
+
metadata: {}
|
83
|
+
post_install_message:
|
84
|
+
rdoc_options: []
|
85
|
+
require_paths:
|
86
|
+
- lib
|
87
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: '0'
|
92
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
requirements: []
|
98
|
+
rubyforge_project:
|
99
|
+
rubygems_version: 2.4.6
|
100
|
+
signing_key:
|
101
|
+
specification_version: 4
|
102
|
+
summary: Generates greeklish forms
|
103
|
+
test_files:
|
104
|
+
- spec/greeklish_converter_spec.rb
|
105
|
+
- spec/greeklish_generator_spec.rb
|
106
|
+
- spec/greeklish_reverse_stemmer_spec.rb
|
107
|
+
- spec/greeklish_spec.rb
|
108
|
+
- spec/spec_helper.rb
|
109
|
+
has_rdoc:
|