sportdb-config 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e017d063fd47d049d93b49763c2549bf87412172
4
- data.tar.gz: 84d75edb2bb440805ae792c07ac732342f7f6828
3
+ metadata.gz: 43d84070a1a63efe77220f0be88991956c6696a7
4
+ data.tar.gz: 4b3c0a3a728065b648f52083f32eba1ebaeab43f
5
5
  SHA512:
6
- metadata.gz: c0c746998dd8bf6da6628174dbcd684a60d6813a85968eed939f056f2fc44df5f035a2c1680902886091d33f26143060d1518e48fbb1ba9a2af9661189ffbbbb
7
- data.tar.gz: 1a1ec84bf35b66b891fd7162529db514cbf2975769e7f62deb203263a5b272a25df83527128119086a8134e349e18c57a4c0c3f39a56a5266ddc1f87da205244
6
+ metadata.gz: 2461e383c70749bfc398de2561a057ab087a1d366644b4cf20d11530c19858d70974e0852b96db71041a6a1f7ebdabb34492a5874785227beea0140911f48f77
7
+ data.tar.gz: 72e12603145d651a9a2b6266755d1838d962513be478d3f1dbcfb2025b649a8bf4adcea875cdb9bc9424e95cdcb8511c433493125d90f1e5ce1c7e49dfec9993
@@ -47,7 +47,7 @@ class Club
47
47
  ## check for duplicates
48
48
  def duplicates?
49
49
  names = [name] + alt_names + alt_names_auto
50
- names = names.map { |name| normalize( name ) }
50
+ names = names.map { |name| normalize( sanitize(name) ) }
51
51
 
52
52
  names.size != names.uniq.size
53
53
  end
@@ -57,7 +57,7 @@ class Club
57
57
 
58
58
  ## calculate (count) frequency and select if greater than one
59
59
  names.reduce( Hash.new ) do |h,name|
60
- norm = normalize( name )
60
+ norm = normalize( sanitize(name) )
61
61
  h[norm] ||= []
62
62
  h[norm] << name; h
63
63
  end.select { |norm,names| names.size > 1 }
@@ -83,26 +83,14 @@ class Club
83
83
 
84
84
  def self.has_year?( name ) name =~ YEAR_REGEX; end
85
85
 
86
- LANG_REGEX = /\[[a-z]{2}\]/
86
+ LANG_REGEX = /\[[a-z]{1,2}\]/ ## note also allow [a] or [d] or [e] - why? why not?
87
87
  def self.strip_lang( name )
88
88
  name.gsub( LANG_REGEX, '' ).strip
89
89
  end
90
90
 
91
91
  def self.has_lang?( name ) name =~ LANG_REGEX; end
92
92
 
93
- NORM_REGEX = /[.'º\-\/]/
94
- ## note: remove all dots (.), dash (-), ', º, /, etc.
95
- ## for norm(alizing) names
96
- def self.strip_norm( name )
97
- name.gsub( NORM_REGEX, '' )
98
- end
99
-
100
- def strip_year( name ) self.class.strip_year( name ); end
101
- def strip_lang( name ) self.class.strip_lang( name ); end
102
- def strip_norm( name ) self.class.strip_norm( name ); end
103
-
104
- private
105
- def sanitize( name )
93
+ def self.sanitize( name )
106
94
  ## check for year(s) e.g. (1887-1911), (-2013),
107
95
  ## (1946-2001,2013-) etc.
108
96
  name = strip_year( name )
@@ -111,18 +99,47 @@ private
111
99
  name
112
100
  end
113
101
 
114
- def normalize( name )
115
- name = sanitize( name )
102
+
103
+ NORM_REGEX = /[.'º\-\/]/
104
+ ## note: remove all dots (.), dash (-), ', º, /, etc.
105
+ ## for norm(alizing) names
106
+ def self.strip_norm( name )
107
+ name.gsub( NORM_REGEX, '' )
108
+ end
109
+
110
+ def self.normalize( name )
111
+ # note: do NOT call sanitize here (keep normalize "atomic" for reuse)
116
112
 
117
113
  ## remove all dots (.), dash (-), º, /, etc.
118
114
  name = strip_norm( name )
119
115
  name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
120
116
 
121
117
  ## todo/fix: use our own downcase - why? why not?
122
- name = name.downcase ## do NOT care about upper and lowercase for now
118
+ name = downcase_i18n( name ) ## do NOT care about upper and lowercase for now
123
119
  name
124
120
  end
125
121
 
122
+
123
+ def self.strip_wiki( name ) # todo/check: rename to strip_wikipedia_en - why? why not?
124
+ ## note: strip disambiguation qualifier from wikipedia page name if present
125
+ ## note: only remove year and foot... for now
126
+ ## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
127
+ ## Willem II (football club) => Willem II
128
+ ##
129
+ ## e.g. do NOT strip others !! e.g.
130
+ ## América Futebol Clube (MG)
131
+ ## only add more "special" cases on demand (that is, if we find more)
132
+ name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
133
+ name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
134
+ name
135
+ end
136
+
137
+
138
+ private
139
+ ## private "shortcut" convenience helpers
140
+ def sanitize( name ) self.class.sanitize( name ); end
141
+ def normalize( name ) self.class.normalize( name ); end
142
+
126
143
  def variants( name ) Variant.find( name ); end
127
144
  end # class Club
128
145
 
@@ -48,7 +48,8 @@ class ClubIndex
48
48
  def strip_year( name ) Club.strip_year( name ); end
49
49
  def has_year?( name) Club.has_year?( name ); end
50
50
  def strip_lang( name ) Club.strip_lang( name ); end
51
- def strip_norm( name ) Club.strip_norm( name ); end
51
+ def strip_wiki( name ) Club.strip_wiki( name ); end
52
+ def normalize( name ) Club.normalize( name ); end
52
53
 
53
54
 
54
55
  def add_wiki( rec_or_recs ) ## add wiki(pedia club record / links
@@ -62,9 +63,7 @@ class ClubIndex
62
63
  ## e.g. do NOT strip others !! e.g.
63
64
  ## América Futebol Clube (MG)
64
65
  ## only add more "special" cases on demand (that is, if we find more)
65
- name = rec.name
66
- name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
67
- name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
66
+ name = strip_wiki( rec.name )
68
67
 
69
68
  m = match_by( name: name, country: rec.country )
70
69
  if m.nil?
@@ -136,6 +135,7 @@ class ClubIndex
136
135
 
137
136
  names.each_with_index do |name,i|
138
137
  ## check lang codes e.g. [en], [fr], etc.
138
+ ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
139
139
  name = strip_lang( name )
140
140
  norm = normalize( name )
141
141
  alt_recs = @clubs_by_name[ norm ]
@@ -205,18 +205,6 @@ class ClubIndex
205
205
  end
206
206
  end
207
207
  end
208
-
209
-
210
-
211
- private
212
- def normalize( name )
213
- name = strip_norm( name )
214
- name = name.gsub( ' ', '' ) # remove all spaces
215
-
216
- ## todo/fix: use our own downcase - why? why not?
217
- name = name.downcase ## do NOT care about upper and lowercase for now
218
- name
219
- end
220
208
  end # class ClubIndex
221
209
 
222
210
 
@@ -21,33 +21,72 @@ end
21
21
  ## "simple" translation
22
22
  ALPHA_SPECIALS = {
23
23
  'Ä'=>'A', 'ä'=>'a',
24
- 'á'=>'a',
24
+ 'Á'=>'A', 'á'=>'a',
25
25
  'à'=>'a',
26
26
  'ã'=>'a',
27
27
  'â'=>'a',
28
+ 'Å'=>'A', 'å'=>'a',
29
+ 'æ'=>'ae',
30
+ 'ā'=>'a',
31
+ 'ă'=>'a',
32
+ 'ą'=>'a',
28
33
 
29
- 'ç'=>'c',
34
+ 'Ç' =>'C', 'ç'=>'c',
35
+ 'ć'=>'c',
36
+ 'Č'=>'C', 'č'=>'c',
30
37
 
31
38
  'É'=>'E', 'é'=>'e',
32
39
  'è'=>'e',
33
40
  'ê'=>'e',
41
+ 'ë'=>'e',
42
+ 'ė'=>'e',
43
+ 'ę'=>'e',
34
44
 
35
- 'í'=>'i',
45
+ 'ğ'=>'g',
46
+
47
+ 'İ'=>'I',
48
+ 'Í'=>'I', 'í'=>'i',
36
49
  'î'=>'i',
50
+ 'ī'=>'i',
51
+ 'ı'=>'i',
52
+
53
+ 'Ł'=>'L', 'ł'=>'l',
37
54
 
38
55
  'ñ'=>'n',
56
+ 'ń'=>'n',
57
+ 'ň'=>'n',
39
58
 
40
59
  'Ö'=>'O', 'ö'=>'o',
41
60
  'ó'=>'o',
42
61
  'õ'=>'o',
43
62
  'ô'=>'o',
63
+ 'ø'=>'o',
64
+ 'ő'=>'o',
44
65
 
45
- 'Ü'=>'U', 'ü'=>'u',
46
- 'ú'=>'u',
66
+ 'ř'=>'r',
47
67
 
68
+ 'Ś'=>'S',
69
+ 'Ş'=>'S', 'ş'=>'s',
70
+ 'Š'=>'S', 'š'=>'s',
71
+ 'ș'=>'s', ## U+0219
48
72
  'ß'=>'ss',
73
+
74
+ 'ţ'=>'t', ## U+0163
75
+ 'ț'=>'t', ## U+021B
76
+ 'þ'=>'th',
77
+
78
+ 'Ü'=>'U', 'ü'=>'u',
79
+ 'Ú'=>'U', 'ú'=>'u',
80
+ 'ū'=>'u',
81
+
82
+ 'ý'=>'y',
83
+
84
+ 'ź'=>'z',
85
+ 'ż'=>'z',
86
+ 'Ž'=>'Z', 'ž'=>'z',
49
87
  }
50
88
 
89
+
51
90
  ## de,at,ch translation for umlauts
52
91
  ALPHA_SPECIALS_DE = {
53
92
  'Ä'=>'Ae', 'ä'=>'ae',
@@ -59,6 +98,35 @@ ALPHA_SPECIALS_DE = {
59
98
  ## add ALPHA_SPECIALS_ES - why? why not? is Espanyol the Catalan spelling or the Spanish (Castilian) one?
60
99
  # 'ñ'=>'ny', ## e.g. Español => Espanyol
61
100
 
101
+ ALPHA_DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
102
+ h[ch] = ch.downcase
103
+ h
104
+ end.merge(
105
+ 'Ä'=>'ä',
106
+ 'Á'=>'á',
107
+ 'Å'=>'å',
108
+
109
+ 'Ç'=>'ç',
110
+ 'Č'=>'č',
111
+
112
+ 'É'=>'é',
113
+
114
+ 'İ'=>'?', ## fix - add lowercase
115
+ 'Í'=>'í',
116
+
117
+ 'Ł'=>'ł',
118
+
119
+ 'Ö'=>'ö',
120
+
121
+ 'Ś'=>'?', ## fix - add lowercase
122
+ 'Ş'=>'ş',
123
+ 'Š'=>'š',
124
+
125
+ 'Ü'=>'ü',
126
+ 'Ú'=>'ú',
127
+
128
+ 'Ž'=>'ž',
129
+ )
62
130
 
63
131
 
64
132
  def self.alpha_specials_count( freq, mapping )
@@ -99,8 +167,19 @@ def self.find( name )
99
167
  alt_names = alt_names.uniq
100
168
  alt_names
101
169
  end
102
- end # Variant
103
170
 
171
+ def self.downcase_i18n( name ) ## our very own downcase for int'l characters / letters
172
+ tr( name, ALPHA_DOWNCASE )
173
+ end
174
+
175
+ end # class Variant
104
176
 
105
177
  end ## module Import
106
178
  end ## module SportDb
179
+
180
+
181
+
182
+ ## "global" convenience helper
183
+ def downcase_i18n( name )
184
+ SportDb::Import::Variant.downcase_i18n( name )
185
+ end # downcase_i18n
@@ -8,7 +8,7 @@ module Boot ## note: use a different module than Config to avoid confusion
8
8
 
9
9
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
10
10
  MINOR = 5
11
- PATCH = 2
11
+ PATCH = 3
12
12
  VERSION = [MAJOR,MINOR,PATCH].join('.')
13
13
 
14
14
  def self.version
@@ -17,6 +17,15 @@ class WikiIndex
17
17
  self.new( recs )
18
18
  end
19
19
 
20
+
21
+
22
+ ## helpers from club - use a helper module for includes - why? why not?
23
+ def strip_lang( name ) Club.strip_lang( name ); end
24
+ def strip_year( name ) Club.strip_year( name ); end
25
+ def normalize( name ) Club.normalize( name ); end
26
+ def strip_wiki( name) Club.strip_wiki( name ); end
27
+
28
+
20
29
  def initialize( recs )
21
30
  @pages_by_country = {}
22
31
 
@@ -24,21 +33,11 @@ class WikiIndex
24
33
  ## check for duplicate recs - report and exit on duplicate!!!!!!
25
34
  recs.each do |rec|
26
35
  h = @pages_by_country[ rec.country.key ] ||= {}
27
- h[ normalize(rec.name) ] = rec
36
+ h[ normalize( strip_wiki( rec.name )) ] = rec
28
37
  end
29
38
  end
30
39
 
31
40
 
32
- def normalize( name )
33
- ## todo/fix: (re)use normalize from Club!!!!
34
- name = name.gsub( /[\-\.]/, '' )
35
- name = name.gsub( ' ', '' ) ## remove spaces too
36
- name = name.downcase
37
- name
38
- end
39
-
40
-
41
-
42
41
  def find_by( club: ) ## todo/check: use find_by_club - why? why not?
43
42
  find_by_club( club )
44
43
  end
@@ -55,8 +54,8 @@ class WikiIndex
55
54
  ## todo/check: sort names ?
56
55
  ## sort by longest first (for best match)
57
56
  names.each do |name|
58
- ## todo/fix: name - remove/string year and lang e.g. (1946-2001), [en]!!!!
59
- rec = h[ normalize( name ) ]
57
+ ## note: normalize AND sanitize (e.g. remove/strip year and lang, e.g. (1946-2001), [en], too)
58
+ rec = h[ normalize( strip_year( strip_lang( name ))) ]
60
59
  break if rec ## bingo!! found - break on first match
61
60
  end
62
61
  end
@@ -12,9 +12,10 @@ class TestVariants < MiniTest::Test
12
12
  def variants( name ) SportDb::Import::Variant.find( name ); end
13
13
 
14
14
 
15
- def test_downcase
16
- ## todo/fix: use our own downcase for normalize - why? why not?
17
- ## assert_equal 'äöü', 'ÄÖÜ'.downcase
15
+ def test_downcase_i18n
16
+ assert_equal 'abcdefghijklmnopqrstuvwxyz', downcase_i18n( 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' )
17
+ assert_equal 'äöü', downcase_i18n( 'ÄÖÜ' )
18
+ assert_equal 'köln', downcase_i18n( 'KÖLN' )
18
19
  end
19
20
 
20
21
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-config
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-05 00:00:00.000000000 Z
11
+ date: 2019-08-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csvreader