alphabets 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,184 +1,186 @@
1
-
2
- class Alphabet
3
-
4
- ## "simple" unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping
5
- UNACCENT = Reader.parse( <<TXT )
6
- Ä A ä a
7
- Á A á a
8
- À A à a
9
- Ã A ã a
10
- Â A â a
11
- Å A å a
12
- Æ AE æ ae # ae ligature
13
- ā a
14
- ă a
15
- Ą A ą a # ą - U+0105 (261) - LATIN SMALL LETTER A WITH OGONEK
16
-
17
- Ç C ç c # ç - U+00E7 (231) - LATIN SMALL LETTER C WITH CEDILLA
18
- Ć C ć c
19
- Č C č c
20
-
21
- Ď D ď d
22
- Ð D ð d # iceland - d
23
-
24
- É E é e
25
- È E è e
26
- Ê E ê e
27
- Ë E ë e
28
- ė e
29
- Ę E ę e
30
- Ě E ě e
31
-
32
- ğ g
33
-
34
- İ I
35
- Í I í i
36
- Ì I ì i
37
- Î I î i
38
- ī i
39
- ı i # ı - U+0131 (305) - LATIN SMALL LETTER DOTLESS I
40
- Ï I ï i
41
-
42
- Ł L ł l
43
-
44
- Ñ N ñ n
45
- Ń N ń n
46
- Ň N ň n
47
-
48
- Ö O ö o
49
- Ő O ő o # hungarian - use OE/oe - why? (it's not a ligature) why not?
50
- Ó O ó o
51
- Ò O ò o
52
- Õ O õ o
53
- Ô O ô o
54
- ø o
55
- Œ OE œ oe # oe ligature
56
-
57
- Ř R ř r
58
-
59
- Ś S ś s
60
- Ş S ş s # ş - U+015F (351) - LATIN SMALL LETTER S WITH CEDILLA
61
- Ș S ș s # ș - U+0219 (537) - LATIN SMALL LETTER S WITH COMMA BELOW
62
- Š S š s
63
- ß ss # ß - U+00DF (223) - LATIN SMALL LETTER SHARP S
64
-
65
- Ţ T ţ t # ţ - U+0163 (355) - LATIN SMALL LETTER T WITH CEDILLA
66
- Ț T ț t # ț - U+021B (539) - LATIN SMALL LETTER T WITH COMMA BELOW
67
- Ť T ť t
68
-
69
- Þ P þ p # þ - U+00FE (254) - LATIN SMALL LETTER THORN
70
- #### fix/check!!!! icelandic - use p is p or th - why? why not?
71
-
72
- Ü U ü u
73
- Ú U ú u
74
- Ù U ù u
75
- ū u
76
- Ů U ů u
77
- Û U û u
78
-
79
- Ý Y ý y
80
- Ÿ Y ÿ y
81
-
82
- Ź Z ź z
83
- Ż Z ż z
84
- Ž Z ž z
85
- TXT
86
-
87
- ##
88
- # Notes:
89
- # Romanian did NOT initially get its Ș/ș and Ț/ț (with comma) letters,
90
- # because these letters were initially unified with Ş/ş and Ţ/ţ (with cedilla)
91
- # by the Unicode Consortium, considering the shapes with comma beneath
92
- # to be glyph variants of the shapes with cedilla.
93
- # However, the letters with explicit comma below were later added to the Unicode standard and are also in ISO 8859-16.
94
-
95
-
96
- ## de,at,ch translation for umlauts
97
- UNACCENT_DE = Reader.parse( <<TXT )
98
- Ä AE ä ae ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
99
- Ö OE ö oe
100
- Ü UE ü ue
101
- ß ss
102
- TXT
103
-
104
-
105
- ## add UNACCENT_ES - why? why not? is Espanyol catalan spelling or spanish (castillian)?
106
- # 'ñ'=>'ny', ## e.g. Español => Espanyol
107
-
108
-
109
- DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
110
- h[ch] = ch.downcase
111
- h
112
- end.merge( Reader.parse( <<TXT ) )
113
- Ä ä
114
- Á á
115
- À à
116
- Â â
117
- Å å
118
- Æ æ # LATIN LETTER AE - ae ligature
119
- Ą ą
120
- Ã ã
121
-
122
- Ç ç # LATIN LETTER C WITH CEDILLA
123
- Č č
124
- Ć ć
125
-
126
- Ď ď
127
-
128
- Ð ð # iceland - d
129
-
130
- É é
131
- È è
132
- Ë ë
133
- Ê ê
134
- Ę ę
135
- Ě ě
136
-
137
- İ i
138
- Í í
139
- Ì ì
140
- Ï ï
141
- Î î
142
-
143
- Ł ł
144
-
145
- Ń ń
146
- Ň ň
147
- Ñ ñ
148
-
149
- Ö ö
150
- Ő ő
151
- Œ œ # LATIN LIGATURE OE
152
- Ó ó
153
- Ò ò
154
- Ô ô
155
- Õ õ
156
-
157
- Þ þ # iceland - p
158
-
159
- Ř ř
160
-
161
- Ś ś
162
- Ş ş # LATIN LETTER S WITH CEDILLA
163
- Ș ș # LATIN LETTER S WITH COMMA BELOW
164
- Š š
165
-
166
- Ţ ţ # LATIN LETTER T WITH CEDILLA
167
- Ț ț # LATIN LETTER T WITH COMMA BELOW
168
- Ť ť
169
-
170
- Ü ü
171
- Ú ú
172
- Ù ù
173
- Ů ů
174
- Û û
175
-
176
- Ý ý
177
- Ÿ ÿ
178
-
179
- Ž ž
180
- Ż ż
181
- Ź ź
182
- TXT
183
-
184
- end # class Alphabet
1
+
2
+ class Alphabet
3
+
4
+ ## "simple" unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping
5
+ UNACCENT = Reader.parse( <<TXT )
6
+ Ä A ä a
7
+ Á A á a
8
+ À A à a
9
+ Ã A ã a
10
+ Â A â a
11
+ Å A å a
12
+ Æ AE æ ae # ae ligature
13
+ ā a
14
+ ă a
15
+ Ą A ą a # ą - U+0105 (261) - LATIN SMALL LETTER A WITH OGONEK
16
+
17
+ Ç C ç c # ç - U+00E7 (231) - LATIN SMALL LETTER C WITH CEDILLA
18
+ Ć C ć c
19
+ Č C č c
20
+
21
+ Ď D ď d
22
+ Ð D ð d # iceland - d
23
+
24
+ É E é e
25
+ È E è e
26
+ Ê E ê e
27
+ Ë E ë e
28
+ ė e
29
+ Ę E ę e
30
+ Ě E ě e
31
+
32
+ ğ g
33
+
34
+ İ I
35
+ Í I í i
36
+ Ì I ì i
37
+ Î I î i
38
+ ī i
39
+ ı i # ı - U+0131 (305) - LATIN SMALL LETTER DOTLESS I
40
+ Ï I ï i
41
+
42
+ Ł L ł l
43
+
44
+ Ñ N ñ n
45
+ Ń N ń n
46
+ Ň N ň n
47
+ Ņ N ņ n # N/n with cedilla U+0145 / U+0146 (capital / small)
48
+
49
+ Ö O ö o
50
+ Ő O ő o # hungarian - use OE/oe - why? (it's not a ligature) why not?
51
+ Ó O ó o
52
+ Ò O ò o
53
+ Õ O õ o
54
+ Ô O ô o
55
+ ø o
56
+ Œ OE œ oe # oe ligature
57
+
58
+ Ř R ř r
59
+
60
+ Ś S ś s
61
+ Ş S ş s # ş - U+015F (351) - LATIN SMALL LETTER S WITH CEDILLA
62
+ Ș S ș s # ș - U+0219 (537) - LATIN SMALL LETTER S WITH COMMA BELOW
63
+ Š S š s
64
+ ß ss # ß - U+00DF (223) - LATIN SMALL LETTER SHARP S
65
+
66
+ Ţ T ţ t # ţ - U+0163 (355) - LATIN SMALL LETTER T WITH CEDILLA
67
+ Ț T ț t # ț - U+021B (539) - LATIN SMALL LETTER T WITH COMMA BELOW
68
+ Ť T ť t
69
+
70
+ Þ P þ p # þ - U+00FE (254) - LATIN SMALL LETTER THORN
71
+ #### fix/check!!!! icelandic - use p is p or th - why? why not?
72
+
73
+ Ü U ü u
74
+ Ú U ú u
75
+ Ù U ù u
76
+ ū u
77
+ Ů U ů u
78
+ Û U û u
79
+
80
+ Ý Y ý y
81
+ Ÿ Y ÿ y
82
+
83
+ Ź Z ź z
84
+ Ż Z ż z
85
+ Ž Z ž z
86
+ TXT
87
+
88
+ ##
89
+ # Notes:
90
+ # Romanian did NOT initially get its Ș/ș and Ț/ț (with comma) letters,
91
+ # because these letters were initially unified with Ş/ş and Ţ/ţ (with cedilla)
92
+ # by the Unicode Consortium, considering the shapes with comma beneath
93
+ # to be glyph variants of the shapes with cedilla.
94
+ # However, the letters with explicit comma below were later added to the Unicode standard and are also in ISO 8859-16.
95
+
96
+
97
+ ## de,at,ch translation for umlauts
98
+ UNACCENT_DE = Reader.parse( <<TXT )
99
+ Ä AE ä ae ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
100
+ Ö OE ö oe
101
+ Ü UE ü ue
102
+ ß ss
103
+ TXT
104
+
105
+
106
+ ## add UNACCENT_ES - why? why not? is Espanyol catalan spelling or spanish (castilian)?
107
+ # 'ñ'=>'ny', ## e.g. Español => Espanyol
108
+
109
+
110
+ DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
111
+ h[ch] = ch.downcase
112
+ h
113
+ end.merge( Reader.parse( <<TXT ) )
114
+ Ä ä
115
+ Á á
116
+ À à
117
+ Â â
118
+ Å å
119
+ Æ æ # LATIN LETTER AE - ae ligature
120
+ Ą ą
121
+ Ã ã
122
+
123
+ Ç ç # LATIN LETTER C WITH CEDILLA
124
+ Č č
125
+ Ć ć
126
+
127
+ Ď ď
128
+
129
+ Ð ð # iceland - d
130
+
131
+ É é
132
+ È è
133
+ Ë ë
134
+ Ê ê
135
+ Ę ę
136
+ Ě ě
137
+
138
+ İ i
139
+ Í í
140
+ Ì ì
141
+ Ï ï
142
+ Î î
143
+
144
+ Ł ł
145
+
146
+ Ń ń
147
+ Ň ň
148
+ Ñ ñ
149
+ Ņ ņ
150
+
151
+ Ö ö
152
+ Ő ő
153
+ Œ œ # LATIN LIGATURE OE
154
+ Ó ó
155
+ Ò ò
156
+ Ô ô
157
+ Õ õ
158
+
159
+ Þ þ # iceland - p
160
+
161
+ Ř ř
162
+
163
+ Ś ś
164
+ Ş ş # LATIN LETTER S WITH CEDILLA
165
+ Ș ș # LATIN LETTER S WITH COMMA BELOW
166
+ Š š
167
+
168
+ Ţ ţ # LATIN LETTER T WITH CEDILLA
169
+ Ț ț # LATIN LETTER T WITH COMMA BELOW
170
+ Ť ť
171
+
172
+ Ü ü
173
+ Ú ú
174
+ Ù ù
175
+ Ů ů
176
+ Û û
177
+
178
+ Ý ý
179
+ Ÿ ÿ
180
+
181
+ Ž ž
182
+ Ż ż
183
+ Ź ź
184
+ TXT
185
+
186
+ end # class Alphabet
@@ -1,72 +1,72 @@
1
- # encoding: utf-8
2
-
3
-
4
- class Variant ## (spelling) variant finder / builder for names
5
-
6
- EN_UNACCENTER = Alphabet.find_unaccenter( :en ) ## assume english (en) as default for know - change to universal/int'l/default or something - why? why not?
7
- DE_UNACCENTER = Alphabet.find_unaccenter( :de )
8
-
9
- def self.find( name )
10
- alt_names = []
11
-
12
- freq = Alphabet.frequency_table( name )
13
-
14
- en = EN_UNACCENTER
15
- if en.count( freq ) > 0 # check if includes äöü (that is, character with accents or diacritics) etc.
16
- alt_names << en.unaccent( name )
17
- end
18
-
19
- de = DE_UNACCENTER
20
- if de.count( freq ) > 0
21
- alt_names << de.unaccent( name )
22
- end
23
-
24
- ## todo - make uniq e.g. Preußen is Preussen, Preussen 2x
25
- alt_names = alt_names.uniq
26
- alt_names
27
- end
28
-
29
- end # class Variant
30
-
31
-
32
-
33
- ######################################
34
- # expiremental class - use (just) Name or NameQ or NameVariant or NameAnalyzer/Query or similar - why? why not?
35
- ## let's wait for now with usage - let's add more methods as we go along and find more - why? why not?
36
- class NameQuery
37
- def initialize( name )
38
- @name = name
39
- end
40
-
41
- def frequency_table
42
- @freq ||= Alphabet.frequency_table( @name )
43
- end
44
-
45
- def variants
46
- @variants ||= find_variants
47
- end
48
-
49
- private
50
- EN_UNACCENTER = Alphabet.find_unaccenter( :en ) ## assume english (en) as default for know - change to universal/int'l/default or something - why? why not?
51
- DE_UNACCENTER = Alphabet.find_unaccenter( :de )
52
-
53
- def find_variants
54
- alt_names = []
55
-
56
- freq = frequency_table
57
-
58
- en = EN_UNACCENTER
59
- if en.count( freq ) > 0 # check if includes äöü (that is, character with accents or diacritics) etc.
60
- alt_names << en.unaccent( @name )
61
- end
62
-
63
- de = DE_UNACCENTER
64
- if de.count( freq ) > 0
65
- alt_names << de.unaccent( @name )
66
- end
67
-
68
- ## todo - make uniq e.g. Preußen is Preussen, Preussen 2x
69
- alt_names = alt_names.uniq
70
- alt_names
71
- end
72
- end ## class VariantName
1
+ # encoding: utf-8
2
+
3
+
4
+ class Variant ## (spelling) variant finder / builder for names
5
+
6
+ EN_UNACCENTER = Alphabet.find_unaccenter( :en ) ## assume english (en) as default for now - change to universal/int'l/default or something - why? why not?
7
+ DE_UNACCENTER = Alphabet.find_unaccenter( :de )
8
+
9
+ def self.find( name )
10
+ alt_names = []
11
+
12
+ freq = Alphabet.frequency_table( name )
13
+
14
+ en = EN_UNACCENTER
15
+ if en.count( freq ) > 0 # check if includes äöü (that is, character with accents or diacritics) etc.
16
+ alt_names << en.unaccent( name )
17
+ end
18
+
19
+ de = DE_UNACCENTER
20
+ if de.count( freq ) > 0
21
+ alt_names << de.unaccent( name )
22
+ end
23
+
24
+ ## note - make uniq (remove duplicates) e.g. Preußen is Preussen, Preussen 2x
25
+ alt_names = alt_names.uniq
26
+ alt_names
27
+ end
28
+
29
+ end # class Variant
30
+
31
+
32
+
33
+ ######################################
34
+ # experimental class - use (just) Name or NameQ or NameVariant or NameAnalyzer/Query or similar - why? why not?
35
+ ## let's wait for now with usage - let's add more methods as we go along and find more - why? why not?
36
+ class NameQuery
37
+ def initialize( name )
38
+ @name = name
39
+ end
40
+
41
+ def frequency_table
42
+ @freq ||= Alphabet.frequency_table( @name )
43
+ end
44
+
45
+ def variants
46
+ @variants ||= find_variants
47
+ end
48
+
49
+ private
50
+ EN_UNACCENTER = Alphabet.find_unaccenter( :en ) ## assume english (en) as default for now - change to universal/int'l/default or something - why? why not?
51
+ DE_UNACCENTER = Alphabet.find_unaccenter( :de )
52
+
53
+ def find_variants
54
+ alt_names = []
55
+
56
+ freq = frequency_table
57
+
58
+ en = EN_UNACCENTER
59
+ if en.count( freq ) > 0 # check if includes äöü (that is, character with accents or diacritics) etc.
60
+ alt_names << en.unaccent( @name )
61
+ end
62
+
63
+ de = DE_UNACCENTER
64
+ if de.count( freq ) > 0
65
+ alt_names << de.unaccent( @name )
66
+ end
67
+
68
+ ## note - make uniq (remove duplicates) e.g. Preußen is Preussen, Preussen 2x
69
+ alt_names = alt_names.uniq
70
+ alt_names
71
+ end
72
+ end ## class NameQuery
@@ -1,23 +1,23 @@
1
- # encoding: utf-8
2
-
3
-
4
- ## todo/check: use a module Alphabets with s to keep version and banner separate - why? why not?
5
-
6
- class Alphabet
7
- MAJOR = 1 ## todo: namespace inside version or something - why? why not??
8
- MINOR = 0
9
- PATCH = 0
10
- VERSION = [MAJOR,MINOR,PATCH].join('.')
11
-
12
- def self.version
13
- VERSION
14
- end
15
-
16
- def self.banner
17
- "alphabets/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
18
- end
19
-
20
- def self.root
21
- File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
22
- end
23
- end # class Alphabet
1
+ # encoding: utf-8
2
+
3
+
4
+ ## todo/check: use a module Alphabets with s to keep version and banner separate - why? why not?
5
+
6
+ class Alphabet
7
+ MAJOR = 1 ## todo: namespace inside version or something - why? why not??
8
+ MINOR = 0
9
+ PATCH = 1
10
+ VERSION = [MAJOR,MINOR,PATCH].join('.')
11
+
12
+ def self.version
13
+ VERSION
14
+ end
15
+
16
+ def self.banner
17
+ "alphabets/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
18
+ end
19
+
20
+ def self.root
21
+ File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
22
+ end
23
+ end # class Alphabet
data/lib/alphabets.rb CHANGED
@@ -1,40 +1,46 @@
1
- # encoding: utf-8
2
-
3
- require 'pp'
4
-
5
-
6
- ###
7
- # our own code
8
- require 'alphabets/version' # let version always go first
9
- require 'alphabets/reader'
10
- require 'alphabets/alphabets'
11
- require 'alphabets/utils'
12
- require 'alphabets/variants'
13
-
14
-
15
-
16
- ## add "global" convenience helper
17
- def downcase_i18n( name )
18
- Alphabet.downcase_i18n( name )
19
- end
20
-
21
- def unaccent( name )
22
- Alphabet.unaccent( name ) ## using "default" language character mapping / table
23
- end
24
-
25
- def undiacritic( name ) unaccent( name ); end ## alias for unaccent
26
-
27
-
28
-
29
- def variants( name ) ## todo/check: rename to unaccent_variants or unaccent_names - why? why not?
30
- Variant.find( name )
31
- end
32
-
33
-
34
- ## add convenience aliases - also add Alpha - why? why not?
35
- Abc = Alphabet
36
- Alphabets = Alphabet
37
- Alpha = Alphabet
38
-
39
-
40
- puts Alphabet.banner # say hello
1
+
2
+ ## note - Kernel#pp loads the pp library on demand since ruby 2.5
3
+ ### todo/check: the explicit require is only needed for older rubies - remove?
4
+ require 'pp'
5
+
6
+
7
+ ###
8
+ # our own code
9
+ require_relative 'alphabets/version' # let version always go first
10
+ require_relative 'alphabets/reader'
11
+ require_relative 'alphabets/alphabets'
12
+ require_relative 'alphabets/utils'
13
+ require_relative 'alphabets/variants'
14
+
15
+
16
+ ##
17
+ ## add globals inside Kernel ??
18
+ ## lets us use alias_method
19
+ ## what else? why? why not?
20
+
21
+
22
+ ## add "global" convenience helper
23
+ def downcase_i18n( name )
24
+ Alphabet.downcase_i18n( name )
25
+ end
26
+
27
+ def unaccent( name )
28
+ Alphabet.unaccent( name ) ## using "default" language character mapping / table
29
+ end
30
+
31
+ def undiacritic( name ) unaccent( name ); end ## alias for unaccent
32
+
33
+
34
+
35
+ def variants( name ) ## todo/check: rename to unaccent_variants or unaccent_names - why? why not?
36
+ Variant.find( name )
37
+ end
38
+
39
+
40
+ ## add convenience aliases - also add Alpha - why? why not?
41
+ Abc = Alphabet
42
+ Alphabets = Alphabet
43
+ Alpha = Alphabet
44
+
45
+
46
+ puts Alphabet.banner # say hello
data/test/helper.rb CHANGED
@@ -1,10 +1,10 @@
1
- ## $:.unshift(File.dirname(__FILE__))
2
-
3
- ## minitest setup
4
-
5
- require 'minitest/autorun'
6
-
7
-
8
- ## our own code
9
-
10
- require 'alphabets'
1
+ ## $:.unshift(File.dirname(__FILE__))
2
+
3
+ ## minitest setup
4
+
5
+ require 'minitest/autorun'
6
+
7
+
8
+ ## our own code
9
+
10
+ require 'alphabets'