alphabets 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2679f375c1118915625e06811cf773a4d59707a6
4
- data.tar.gz: 5375336c3b4d0002923547f5a9d41498d629722f
3
+ metadata.gz: b5d826c435c38e5c8faf7963d7de6e6dcf0e2fb7
4
+ data.tar.gz: 5187392b8e6fbb12e249709526edf1bb2def2513
5
5
  SHA512:
6
- metadata.gz: 3ff7470fe10524e8b0080cc36c9eb7a2534f70f75a4f47ca1cf46698bb2cf68a8845b45a364c01a65469bfa3d42f7865172aff8217103766043675b30924bf7a
7
- data.tar.gz: '018796bf5c80bc970458716cf297f53616fb2ee1a031e91fa184d5351483564760a688e283368d760838832350b683d6951c4ce857eaa168177269ae451c10dd'
6
+ metadata.gz: 0e8aac5a5a65d137c710a9623d444d9f996171caee9869cb32af78cd09b26ac0404fa88a40499279527ba6d9276c029396a34241f507adf289551e9327a6ce05
7
+ data.tar.gz: 7cb3dda8f5804fc39f67c866c319b24af8ee6119609052a7eb7ff6c0927f1ede8b50cbcf89d98709d83ba72bc900d56efafb4ce038d4f80554c916f057b6b5ec
@@ -5,9 +5,12 @@ README.md
5
5
  Rakefile
6
6
  lib/alphabets.rb
7
7
  lib/alphabets/alphabets.rb
8
+ lib/alphabets/reader.rb
9
+ lib/alphabets/utils.rb
8
10
  lib/alphabets/variants.rb
9
11
  lib/alphabets/version.rb
10
12
  test/helper.rb
11
13
  test/test_downcase.rb
14
+ test/test_reader.rb
12
15
  test/test_unaccent.rb
13
16
  test/test_variants.rb
data/NOTES.md CHANGED
@@ -1,3 +1,43 @@
1
1
  # Notes
2
2
 
3
3
  ## Todos
4
+
5
+
6
+ ## Terminology
7
+
8
+ Use Upcase, Downcase AND Titlecase (!)
9
+
10
+ - Example: Ö -> Upcase: OE, Downcase: oe, Titlecase: Oe (!)
11
+ - Example: Æ -> Upcase: AE, Downcase: ae, Titlecase: Ae (!)
12
+
13
+
14
+
15
+ ## Libraries
16
+
17
+ - <https://github.com/SixArm/sixarm_ruby_unaccent> - Replace a string's accent characters with ASCII characters. Based on Perl Text::Unaccent from CPAN.
18
+
19
+
20
+
21
+ ## Links
22
+
23
+ **Unicode w/ Ruby - Ruby ♡ Unicode**
24
+
25
+ - <https://idiosyncratic-ruby.com/66-ruby-has-character>
26
+
27
+ Ruby has Character - Ruby comes with good support for Unicode-related features. Read on if you want to learn more about important Unicode fundamentals and how to use them in Ruby...
28
+
29
+ - <https://idiosyncratic-ruby.com/41-proper-unicoding>
30
+
31
+ Proper Unicoding - Ruby's Regexp engine has a powerful feature built in: It can match for Unicode character properties. But what exactly are properties you can match for?
32
+
33
+ - <https://idiosyncratic-ruby.com/30-regex-with-class>
34
+
35
+ Regex with Class - Ruby's regex engine defines a lot of shortcut character classes. Besides the common meta characters (\w, etc.), there are also the POSIX-style expressions and the Unicode property syntax. This is an overview of all character classes.
36
+
37
+
38
+ **W3C**
39
+
40
+ - <https://www.w3.org/TR/charmod-norm/>
41
+ - <https://www.w3.org/International/wiki/Case_folding>
42
+
43
+ In Western European languages, the letter 'i' (U+0069) upper cases to a dotless 'I' (U+0049). In Turkish, this letter upper cases to a dotted upper case letter 'İ' (U+0130). Similarly, 'I' (U+0049) lower cases to 'ı' (U+0131), which is a dotless lowercase letter i.
@@ -6,7 +6,9 @@ require 'pp'
6
6
  ###
7
7
  # our own code
8
8
  require 'alphabets/version' # let version always go first
9
+ require 'alphabets/reader'
9
10
  require 'alphabets/alphabets'
11
+ require 'alphabets/utils'
10
12
  require 'alphabets/variants'
11
13
 
12
14
 
@@ -20,6 +22,10 @@ def unaccent( name )
20
22
  Alphabet.unaccent( name ) ## using "default" language character mapping / table
21
23
  end
22
24
 
25
+ def undiacritic( name ) unaccent( name ); end ## alias for unaccent
26
+
27
+
28
+
23
29
  def variants( name ) ## todo/check: rename to unaccent_variants or unaccent_names - why? why not?
24
30
  Variant.find( name )
25
31
  end
@@ -1,186 +1,118 @@
1
- # encoding: utf-8
2
-
3
- class Alphabet ## todo/fix: add alias Abc and Alpha too? why? why not?
4
- def self.frequency_table( name ) ## todo/check: use/rename to char_frequency_table
5
- ## calculate the frequency table of letters, digits, etc.
6
- freq = Hash.new(0)
7
- name.each_char do |ch|
8
- freq[ch] += 1
9
- end
10
- freq
11
- end
12
-
13
-
14
- def self.count( freq, mapping_or_chars )
15
- chars = if mapping_or_chars.is_a?( Hash )
16
- mapping_or_chars.keys
17
- else ## todo/fix: check for is_a? Array and if is String split into Array (on char at a time?) - why? why not?
18
- mapping_or_chars ## assume it's an array/list of characters
19
- end
20
-
21
- chars.reduce(0) do |count,ch|
22
- count += freq[ch]
23
- count
24
- end
25
- end
26
-
27
-
28
- def self.tr( name, mapping )
29
- buf = String.new
30
- name.each_char do |ch|
31
- buf << if mapping[ch]
32
- mapping[ch]
33
- else
34
- ch
35
- end
36
- end
37
- buf
38
- end
39
-
40
-
41
- class Unaccenter #Worker ## todo/change - find a better name - why? why not?
42
- def initialize( mapping )
43
- @mapping = mapping
44
- end
45
-
46
- def count( name ) Alphabet.count( name, @mapping ); end
47
- def unaccent( name ) Alphabet.tr( name, @mapping ); end
48
- end # class Unaccent Worker
49
-
50
-
51
- def self.find_unaccenter( key )
52
- if key == :de
53
- @de ||= Unaccenter.new( UNACCENT_DE )
54
- @de
55
- else
56
- ## use uni(versal) or unicode or something - why? why not?
57
- ## use all or int'l (international) - why? why not?
58
- ## use en (english) - why? why not?
59
- @default ||= Unaccenter.new( UNACCENT )
60
- @default
61
- end
62
- end
63
-
64
- def self.unaccent( name )
65
- @default ||= Unaccenter.new( UNACCENT )
66
- @default.unaccent( name )
67
- end
68
-
69
-
70
- def self.downcase_i18n( name ) ## our very own downcase for int'l characters / letters
71
- tr( name, DOWNCASE )
72
- end
73
- ## add downcase_uni - univeral/unicode - why? why not?
74
-
75
-
76
- ## "simple" unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping
77
- UNACCENT = {
78
- 'Ä'=>'A', 'ä'=>'a',
79
- 'Á'=>'A', 'á'=>'a',
80
- 'à'=>'a',
81
- 'ã'=>'a',
82
- 'â'=>'a',
83
- 'Å'=>'A', 'å'=>'a',
84
- 'æ'=>'ae',
85
- 'ā'=>'a',
86
- 'ă'=>'a',
87
- 'ą'=>'a',
88
-
89
- 'Ç' =>'C', 'ç'=>'c',
90
- 'ć'=>'c',
91
- 'Č'=>'C', 'č'=>'c',
92
-
93
- 'É'=>'E', 'é'=>'e',
94
- 'è'=>'e',
95
- 'ê'=>'e',
96
- 'ë'=>'e',
97
- 'ė'=>'e',
98
- 'ę'=>'e',
99
-
100
- 'ğ'=>'g',
101
-
102
- 'İ'=>'I',
103
- 'Í'=>'I', 'í'=>'i',
104
- 'î'=>'i',
105
- 'ī'=>'i',
106
- 'ı'=>'i',
107
-
108
- 'Ł'=>'L', 'ł'=>'l',
109
-
110
- 'ñ'=>'n',
111
- 'ń'=>'n',
112
- 'ň'=>'n',
113
-
114
- 'Ö'=>'O', 'ö'=>'o',
115
- 'ó'=>'o',
116
- 'õ'=>'o',
117
- 'ô'=>'o',
118
- 'ø'=>'o',
119
- 'ő'=>'o',
120
-
121
- 'ř'=>'r',
122
-
123
- 'Ś'=>'S',
124
- 'Ş'=>'S', 'ş'=>'s',
125
- 'Š'=>'S', 'š'=>'s',
126
- 'ș'=>'s', ## U+0219
127
- 'ß'=>'ss',
128
-
129
- 'ţ'=>'t', ## U+0163
130
- 'ț'=>'t', ## U+021B
131
- 'þ'=>'th', #### fix!!!! use p - why? why not?
132
-
133
- 'Ü'=>'U', 'ü'=>'u',
134
- 'Ú'=>'U', 'ú'=>'u',
135
- 'ū'=>'u',
136
-
137
- 'ý'=>'y',
138
-
139
- 'ź'=>'z',
140
- 'ż'=>'z',
141
- 'Ž'=>'Z', 'ž'=>'z',
142
- }
143
-
144
-
145
- ## de,at,ch translation for umlauts
146
- UNACCENT_DE = {
147
- 'Ä'=>'Ae', 'ä'=>'ae', ### Use AE, OE, UE and NOT Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
148
- 'Ö'=>'Oe', 'ö'=>'oe',
149
- 'Ü'=>'Ue', 'ü'=>'ue',
150
- 'ß'=>'ss',
151
- }
1
+
2
+ class Alphabet
3
+
4
+ ## "simple" unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping
5
+ UNACCENT = Reader.parse( <<TXT )
6
+ Ä A ä a
7
+ Á A á a
8
+ à a
9
+ ã a
10
+ â a
11
+ Å A å a
12
+ Æ AE æ ae # ae ligature
13
+ ā a
14
+ ă a
15
+ ą a
16
+
17
+ Ç C ç c
18
+ ć c
19
+ Č C č c
20
+
21
+ É E é e
22
+ è e
23
+ ê e
24
+ ë e
25
+ ė e
26
+ ę e
27
+
28
+ ğ g
29
+
30
+ İ I
31
+ Í I í i
32
+ î i
33
+ ī i
34
+ ı i # small dotless i
35
+
36
+ Ł L ł l
37
+
38
+ ñ n
39
+ ń n
40
+ ň n
41
+
42
+ Ö O ö o
43
+ ó o
44
+ õ o
45
+ ô o
46
+ ø o
47
+ ő o
48
+ Œ OE œ oe # oe ligature
49
+
50
+ ř r
51
+
52
+ Ś S ś s
53
+ Ş S ş s
54
+ Š S š s
55
+ ș s # U+0219
56
+ ß ss
57
+
58
+ ţ t # U+0163
59
+ ț t # U+021B
60
+
61
+ þ p #### fix/check!!!! icelandic - use p is p or th - why? why not?
62
+
63
+ Ü U ü u
64
+ Ú U ú u
65
+ ū u
66
+
67
+ ý y
68
+
69
+ ź z
70
+ ż z
71
+ Ž Z ž z
72
+ TXT
73
+
74
+
75
+ ## de,at,ch translation for umlauts
76
+ UNACCENT_DE = Reader.parse( <<TXT )
77
+ Ä AE ä ae ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
78
+ Ö OE ö oe
79
+ Ü UE ü ue
80
+ ß ss
81
+ TXT
152
82
 
153
83
  ## add UNACCENT_ES - why? why not? is Espanyol the Catalan spelling or Spanish (Castilian)?
154
84
  # 'ñ'=>'ny', ## e.g. Español => Espanyol
155
85
 
156
- DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
86
+ DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
157
87
  h[ch] = ch.downcase
158
88
  h
159
- end.merge(
160
- 'Ä'=>'ä',
161
- 'Á'=>'á',
162
- 'Å'=>'å',
89
+ end.merge( Reader.parse( <<TXT ) )
90
+ Ä ä
91
+ Á á
92
+ Å å
93
+ Æ æ # ae ligature
163
94
 
164
- 'Ç'=>'ç',
165
- 'Č'=>'č',
95
+ Ç ç
96
+ Č č
166
97
 
167
- 'É'=>'é',
98
+ É é
168
99
 
169
- 'İ'=>'?', ## fix - add lowercase
170
- 'Í'=>'í',
100
+ İ i
101
+ Í í
171
102
 
172
- 'Ł'=>'ł',
103
+ Ł ł
173
104
 
174
- 'Ö'=>'ö',
105
+ Ö ö
106
+ Œ œ # oe ligature
175
107
 
176
- 'Ś'=>'?', ## fix - add lowercase
177
- 'Ş'=>'ş',
178
- 'Š'=>'š',
108
+ Ś ś
109
+ Ş ş
110
+ Š š
179
111
 
180
- 'Ü'=>'ü',
181
- 'Ú'=>'ú',
112
+ Ü ü
113
+ Ú ú
182
114
 
183
- 'Ž'=>'ž',
184
- )
115
+ Ž ž
116
+ TXT
185
117
 
186
118
  end # class Alphabet
@@ -0,0 +1,62 @@
1
+
2
+ class Alphabet
3
+ class Reader ## todo/check: rename to CharReader or something - why? why not?
4
+
5
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
6
+ txt = File.open( path, 'r:utf-8' ).read
7
+ parse( txt )
8
+ end
9
+
10
+ def self.parse( txt )
11
+ h = {} ## char(acter) table mappings
12
+
13
+ txt.each_line do |line|
14
+ line = line.strip
15
+
16
+ next if line.empty?
17
+ next if line.start_with?( '#' ) ## skip comments too
18
+
19
+ ## strip inline (until end-of-line) comments too
20
+ ## e.g ţ t ## U+0163
21
+ ## => ţ t
22
+ line = line.sub( /#.*/, '' ).strip
23
+ ## pp line
24
+
25
+ values = line.split( /[ \t]+/ )
26
+ ## pp values
27
+
28
+ ## check - must be even - a multiple of two
29
+ if values.size % 2 != 0
30
+ puts "** !!! ERROR !!! - missing mapping pair - mappings must be even (a multiple of two):"
31
+ pp values
32
+ exit 1
33
+ end
34
+
35
+ # add mappings in pairs
36
+ values.each_slice(2) do |slice|
37
+ ## pp slice
38
+ key = slice[0]
39
+ value = slice[1]
40
+
41
+ ## check - key must be a single-character/letter in unicode
42
+ if key.size != 1
43
+ puts "** !!! ERROR !!! - mapping character must be a single-character, size is #{key.size}"
44
+ pp slice
45
+ exit 1
46
+ end
47
+
48
+ ## check - check for duplicates
49
+ if h[ key ]
50
+ puts "** !!! ERROR !!! - duplicate mapping character; key already present"
51
+ pp slice
52
+ exit 1
53
+ else
54
+ h[ key ] = value
55
+ end
56
+ end
57
+ end
58
+ h
59
+ end # method parse
60
+
61
+ end # class Reader
62
+ end # class Alphabet
@@ -0,0 +1,75 @@
1
+
2
+ class Alphabet
3
+
4
+ def self.frequency_table( name ) ## todo/check: use/rename to char_frequency_table
5
+ ## calculate the frequency table of letters, digits, etc.
6
+ freq = Hash.new(0)
7
+ name.each_char do |ch|
8
+ freq[ch] += 1
9
+ end
10
+ freq
11
+ end
12
+
13
+
14
+ def self.count( freq, mapping_or_chars )
15
+ chars = if mapping_or_chars.is_a?( Hash )
16
+ mapping_or_chars.keys
17
+ else ## todo/fix: check for is_a? Array and if is String split into Array (on char at a time?) - why? why not?
18
+ mapping_or_chars ## assume it's an array/list of characters
19
+ end
20
+
21
+ chars.reduce(0) do |count,ch|
22
+ count += freq[ch]
23
+ count
24
+ end
25
+ end
26
+
27
+
28
+ def self.sub( name, mapping ) ## todo/check: use a different/better name - gsub/map/replace/fold/... - why? why not?
29
+ buf = String.new
30
+ name.each_char do |ch|
31
+ buf << if mapping[ch]
32
+ mapping[ch]
33
+ else
34
+ ch
35
+ end
36
+ end
37
+ buf
38
+ end
39
+
40
+
41
+ class Unaccenter #Worker ## todo/change - find a better name - why? why not?
42
+ def initialize( mapping )
43
+ @mapping = mapping
44
+ end
45
+
46
+ def count( freq ) Alphabet.count( freq, @mapping ); end
47
+ def unaccent( name ) Alphabet.sub( name, @mapping ); end
48
+ end # class Unaccent Worker
49
+
50
+
51
+ def self.find_unaccenter( key )
52
+ if key == :de
53
+ @de ||= Unaccenter.new( UNACCENT_DE )
54
+ @de
55
+ else
56
+ ## use uni(versal) or unicode or something - why? why not?
57
+ ## use all or int'l (international) - why? why not?
58
+ ## use en (english) - why? why not?
59
+ @default ||= Unaccenter.new( UNACCENT )
60
+ @default
61
+ end
62
+ end
63
+
64
+ def self.unaccent( name )
65
+ @default ||= Unaccenter.new( UNACCENT )
66
+ @default.unaccent( name )
67
+ end
68
+
69
+
70
+ def self.downcase_i18n( name ) ## our very own downcase for int'l characters / letters
71
+ sub( name, DOWNCASE )
72
+ end
73
+ ## add downcase_uni - universal/unicode - why? why not?
74
+
75
+ end # class Alphabet
@@ -5,8 +5,8 @@
5
5
 
6
6
  class Alphabet
7
7
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
8
- MINOR = 0
9
- PATCH = 1
8
+ MINOR = 1
9
+ PATCH = 0
10
10
  VERSION = [MAJOR,MINOR,PATCH].join('.')
11
11
 
12
12
  def self.version
@@ -0,0 +1,37 @@
1
+ ###
2
+ # to run use
3
+ # ruby -I ./lib -I ./test test/test_reader.rb
4
+
5
+
6
+ require 'helper'
7
+
8
+ class TestReader < MiniTest::Test
9
+
10
+ def test_parse
11
+ h = Alphabet::Reader.parse( <<TXT )
12
+ ## hello
13
+
14
+ Ä A ä a ## hello
15
+ Á A á a
16
+ à a
17
+ ã a
18
+ â a ### yada yada
19
+ Å A å a
20
+ æ ae
21
+
22
+ Ç C ç c
23
+ ć c
24
+
25
+ ß ss
26
+ TXT
27
+
28
+ pp h
29
+
30
+ assert_equal 'A', h['Ä']
31
+ assert_equal 'a', h['ä']
32
+ assert_equal 'ae', h['æ']
33
+
34
+ assert_equal 'ss', h['ß']
35
+ end
36
+
37
+ end # class TestReader
metadata CHANGED
@@ -1,43 +1,49 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: alphabets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-13 00:00:00.000000000 Z
11
+ date: 2019-08-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdoc
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '4.0'
20
+ - - "<"
21
+ - !ruby/object:Gem::Version
22
+ version: '7'
20
23
  type: :development
21
24
  prerelease: false
22
25
  version_requirements: !ruby/object:Gem::Requirement
23
26
  requirements:
24
- - - "~>"
27
+ - - ">="
25
28
  - !ruby/object:Gem::Version
26
29
  version: '4.0'
30
+ - - "<"
31
+ - !ruby/object:Gem::Version
32
+ version: '7'
27
33
  - !ruby/object:Gem::Dependency
28
34
  name: hoe
29
35
  requirement: !ruby/object:Gem::Requirement
30
36
  requirements:
31
37
  - - "~>"
32
38
  - !ruby/object:Gem::Version
33
- version: '3.16'
39
+ version: '3.18'
34
40
  type: :development
35
41
  prerelease: false
36
42
  version_requirements: !ruby/object:Gem::Requirement
37
43
  requirements:
38
44
  - - "~>"
39
45
  - !ruby/object:Gem::Version
40
- version: '3.16'
46
+ version: '3.18'
41
47
  description: 'alphabets - '
42
48
  email: opensport@googlegroups.com
43
49
  executables: []
@@ -55,10 +61,13 @@ files:
55
61
  - Rakefile
56
62
  - lib/alphabets.rb
57
63
  - lib/alphabets/alphabets.rb
64
+ - lib/alphabets/reader.rb
65
+ - lib/alphabets/utils.rb
58
66
  - lib/alphabets/variants.rb
59
67
  - lib/alphabets/version.rb
60
68
  - test/helper.rb
61
69
  - test/test_downcase.rb
70
+ - test/test_reader.rb
62
71
  - test/test_unaccent.rb
63
72
  - test/test_variants.rb
64
73
  homepage: https://github.com/sportdb/sport.db