sportdb-config 0.5.2 → 0.5.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e017d063fd47d049d93b49763c2549bf87412172
4
- data.tar.gz: 84d75edb2bb440805ae792c07ac732342f7f6828
3
+ metadata.gz: 43d84070a1a63efe77220f0be88991956c6696a7
4
+ data.tar.gz: 4b3c0a3a728065b648f52083f32eba1ebaeab43f
5
5
  SHA512:
6
- metadata.gz: c0c746998dd8bf6da6628174dbcd684a60d6813a85968eed939f056f2fc44df5f035a2c1680902886091d33f26143060d1518e48fbb1ba9a2af9661189ffbbbb
7
- data.tar.gz: 1a1ec84bf35b66b891fd7162529db514cbf2975769e7f62deb203263a5b272a25df83527128119086a8134e349e18c57a4c0c3f39a56a5266ddc1f87da205244
6
+ metadata.gz: 2461e383c70749bfc398de2561a057ab087a1d366644b4cf20d11530c19858d70974e0852b96db71041a6a1f7ebdabb34492a5874785227beea0140911f48f77
7
+ data.tar.gz: 72e12603145d651a9a2b6266755d1838d962513be478d3f1dbcfb2025b649a8bf4adcea875cdb9bc9424e95cdcb8511c433493125d90f1e5ce1c7e49dfec9993
@@ -47,7 +47,7 @@ class Club
47
47
  ## check for duplicates
48
48
  def duplicates?
49
49
  names = [name] + alt_names + alt_names_auto
50
- names = names.map { |name| normalize( name ) }
50
+ names = names.map { |name| normalize( sanitize(name) ) }
51
51
 
52
52
  names.size != names.uniq.size
53
53
  end
@@ -57,7 +57,7 @@ class Club
57
57
 
58
58
  ## calculate (count) frequency and select if greater than one
59
59
  names.reduce( Hash.new ) do |h,name|
60
- norm = normalize( name )
60
+ norm = normalize( sanitize(name) )
61
61
  h[norm] ||= []
62
62
  h[norm] << name; h
63
63
  end.select { |norm,names| names.size > 1 }
@@ -83,26 +83,14 @@ class Club
83
83
 
84
84
  def self.has_year?( name ) name =~ YEAR_REGEX; end
85
85
 
86
- LANG_REGEX = /\[[a-z]{2}\]/
86
+ LANG_REGEX = /\[[a-z]{1,2}\]/ ## note also allow [a] or [d] or [e] - why? why not?
87
87
  def self.strip_lang( name )
88
88
  name.gsub( LANG_REGEX, '' ).strip
89
89
  end
90
90
 
91
91
  def self.has_lang?( name ) name =~ LANG_REGEX; end
92
92
 
93
- NORM_REGEX = /[.'º\-\/]/
94
- ## note: remove all dots (.), dash (-), ', º, /, etc.
95
- ## for norm(alizing) names
96
- def self.strip_norm( name )
97
- name.gsub( NORM_REGEX, '' )
98
- end
99
-
100
- def strip_year( name ) self.class.strip_year( name ); end
101
- def strip_lang( name ) self.class.strip_lang( name ); end
102
- def strip_norm( name ) self.class.strip_norm( name ); end
103
-
104
- private
105
- def sanitize( name )
93
+ def self.sanitize( name )
106
94
  ## check for year(s) e.g. (1887-1911), (-2013),
107
95
  ## (1946-2001,2013-) etc.
108
96
  name = strip_year( name )
@@ -111,18 +99,47 @@ private
111
99
  name
112
100
  end
113
101
 
114
- def normalize( name )
115
- name = sanitize( name )
102
+
103
+ NORM_REGEX = /[.'º\-\/]/
104
+ ## note: remove all dots (.), dash (-), ', º, /, etc.
105
+ ## for norm(alizing) names
106
+ def self.strip_norm( name )
107
+ name.gsub( NORM_REGEX, '' )
108
+ end
109
+
110
+ def self.normalize( name )
111
+ # note: do NOT call sanitize here (keep normalize "atomic" for reuse)
116
112
 
117
113
  ## remove all dots (.), dash (-), º, /, etc.
118
114
  name = strip_norm( name )
119
115
  name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
120
116
 
121
117
  ## todo/fix: use our own downcase - why? why not?
122
- name = name.downcase ## do NOT care about upper and lowercase for now
118
+ name = downcase_i18n( name ) ## do NOT care about upper and lowercase for now
123
119
  name
124
120
  end
125
121
 
122
+
123
+ def self.strip_wiki( name ) # todo/check: rename to strip_wikipedia_en - why? why not?
124
+ ## note: strip disambiguation qualifier from wikipedia page name if present
125
+ ## note: only remove year and foot... for now
126
+ ## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
127
+ ## Willem II (football club) => Willem II
128
+ ##
129
+ ## e.g. do NOT strip others !! e.g.
130
+ ## América Futebol Clube (MG)
131
+ ## only add more "special" cases on demand (that is) if we find more
132
+ name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
133
+ name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
134
+ name
135
+ end
136
+
137
+
138
+ private
139
+ ## private "shortcut" convenience helpers
140
+ def sanitize( name ) self.class.sanitize( name ); end
141
+ def normalize( name ) self.class.normalize( name ); end
142
+
126
143
  def variants( name ) Variant.find( name ); end
127
144
  end # class Club
128
145
 
@@ -48,7 +48,8 @@ class ClubIndex
48
48
  def strip_year( name ) Club.strip_year( name ); end
49
49
  def has_year?( name) Club.has_year?( name ); end
50
50
  def strip_lang( name ) Club.strip_lang( name ); end
51
- def strip_norm( name ) Club.strip_norm( name ); end
51
+ def strip_wiki( name ) Club.strip_wiki( name ); end
52
+ def normalize( name ) Club.normalize( name ); end
52
53
 
53
54
 
54
55
  def add_wiki( rec_or_recs ) ## add wiki(pedia club record / links
@@ -62,9 +63,7 @@ class ClubIndex
62
63
  ## e.g. do NOT strip others !! e.g.
63
64
  ## América Futebol Clube (MG)
64
65
  ## only add more "special" cases on demand (that is) if we find more
65
- name = rec.name
66
- name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
67
- name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
66
+ name = strip_wiki( rec.name )
68
67
 
69
68
  m = match_by( name: name, country: rec.country )
70
69
  if m.nil?
@@ -136,6 +135,7 @@ class ClubIndex
136
135
 
137
136
  names.each_with_index do |name,i|
138
137
  ## check lang codes e.g. [en], [fr], etc.
138
+ ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
139
139
  name = strip_lang( name )
140
140
  norm = normalize( name )
141
141
  alt_recs = @clubs_by_name[ norm ]
@@ -205,18 +205,6 @@ class ClubIndex
205
205
  end
206
206
  end
207
207
  end
208
-
209
-
210
-
211
- private
212
- def normalize( name )
213
- name = strip_norm( name )
214
- name = name.gsub( ' ', '' ) # remove all spaces
215
-
216
- ## todo/fix: use our own downcase - why? why not?
217
- name = name.downcase ## do NOT care about upper and lowercase for now
218
- name
219
- end
220
208
  end # class ClubIndex
221
209
 
222
210
 
@@ -21,33 +21,72 @@ end
21
21
  ## "simple" translation
22
22
  ALPHA_SPECIALS = {
23
23
  'Ä'=>'A', 'ä'=>'a',
24
- 'á'=>'a',
24
+ 'Á'=>'A', 'á'=>'a',
25
25
  'à'=>'a',
26
26
  'ã'=>'a',
27
27
  'â'=>'a',
28
+ 'Å'=>'A', 'å'=>'a',
29
+ 'æ'=>'ae',
30
+ 'ā'=>'a',
31
+ 'ă'=>'a',
32
+ 'ą'=>'a',
28
33
 
29
- 'ç'=>'c',
34
+ 'Ç' =>'C', 'ç'=>'c',
35
+ 'ć'=>'c',
36
+ 'Č'=>'C', 'č'=>'c',
30
37
 
31
38
  'É'=>'E', 'é'=>'e',
32
39
  'è'=>'e',
33
40
  'ê'=>'e',
41
+ 'ë'=>'e',
42
+ 'ė'=>'e',
43
+ 'ę'=>'e',
34
44
 
35
- 'í'=>'i',
45
+ 'ğ'=>'g',
46
+
47
+ 'İ'=>'I',
48
+ 'Í'=>'I', 'í'=>'i',
36
49
  'î'=>'i',
50
+ 'ī'=>'i',
51
+ 'ı'=>'i',
52
+
53
+ 'Ł'=>'L', 'ł'=>'l',
37
54
 
38
55
  'ñ'=>'n',
56
+ 'ń'=>'n',
57
+ 'ň'=>'n',
39
58
 
40
59
  'Ö'=>'O', 'ö'=>'o',
41
60
  'ó'=>'o',
42
61
  'õ'=>'o',
43
62
  'ô'=>'o',
63
+ 'ø'=>'o',
64
+ 'ő'=>'o',
44
65
 
45
- 'Ü'=>'U', 'ü'=>'u',
46
- 'ú'=>'u',
66
+ 'ř'=>'r',
47
67
 
68
+ 'Ś'=>'S',
69
+ 'Ş'=>'S', 'ş'=>'s',
70
+ 'Š'=>'S', 'š'=>'s',
71
+ 'ș'=>'s', ## U+0219
48
72
  'ß'=>'ss',
73
+
74
+ 'ţ'=>'t', ## U+0163
75
+ 'ț'=>'t', ## U+021B
76
+ 'þ'=>'th',
77
+
78
+ 'Ü'=>'U', 'ü'=>'u',
79
+ 'Ú'=>'U', 'ú'=>'u',
80
+ 'ū'=>'u',
81
+
82
+ 'ý'=>'y',
83
+
84
+ 'ź'=>'z',
85
+ 'ż'=>'z',
86
+ 'Ž'=>'Z', 'ž'=>'z',
49
87
  }
50
88
 
89
+
51
90
  ## de,at,ch translation for umlauts
52
91
  ALPHA_SPECIALS_DE = {
53
92
  'Ä'=>'Ae', 'ä'=>'ae',
@@ -59,6 +98,35 @@ ALPHA_SPECIALS_DE = {
59
98
  ## add ALPHA_SPECIALS_ES - why? why not? is Espanyol catalan spelling or spanish (castillian)?
60
99
  # 'ñ'=>'ny', ## e.g. Español => Espanyol
61
100
 
101
+ ALPHA_DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
102
+ h[ch] = ch.downcase
103
+ h
104
+ end.merge(
105
+ 'Ä'=>'ä',
106
+ 'Á'=>'á',
107
+ 'Å'=>'å',
108
+
109
+ 'Ç'=>'ç',
110
+ 'Č'=>'č',
111
+
112
+ 'É'=>'é',
113
+
114
+ 'İ'=>'?', ## fix - add lowercase
115
+ 'Í'=>'í',
116
+
117
+ 'Ł'=>'ł',
118
+
119
+ 'Ö'=>'ö',
120
+
121
+ 'Ś'=>'?', ## fix - add lowercase
122
+ 'Ş'=>'ş',
123
+ 'Š'=>'š',
124
+
125
+ 'Ü'=>'ü',
126
+ 'Ú'=>'ú',
127
+
128
+ 'Ž'=>'ž',
129
+ )
62
130
 
63
131
 
64
132
  def self.alpha_specials_count( freq, mapping )
@@ -99,8 +167,19 @@ def self.find( name )
99
167
  alt_names = alt_names.uniq
100
168
  alt_names
101
169
  end
102
- end # Variant
103
170
 
171
+ def self.downcase_i18n( name ) ## our very own downcase for int'l characters / letters
172
+ tr( name, ALPHA_DOWNCASE )
173
+ end
174
+
175
+ end # class Variant
104
176
 
105
177
  end ## module Import
106
178
  end ## module SportDb
179
+
180
+
181
+
182
+ ## "global" convenience helper
183
+ def downcase_i18n( name )
184
+ SportDb::Import::Variant.downcase_i18n( name )
185
+ end # downcase_i18n ("global" helper)
@@ -8,7 +8,7 @@ module Boot ## note: use a different module than Config to avoid confusion
8
8
 
9
9
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
10
10
  MINOR = 5
11
- PATCH = 2
11
+ PATCH = 3
12
12
  VERSION = [MAJOR,MINOR,PATCH].join('.')
13
13
 
14
14
  def self.version
@@ -17,6 +17,15 @@ class WikiIndex
17
17
  self.new( recs )
18
18
  end
19
19
 
20
+
21
+
22
+ ## helpers from club - use a helper module for includes - why? why not?
23
+ def strip_lang( name ) Club.strip_lang( name ); end
24
+ def strip_year( name ) Club.strip_year( name ); end
25
+ def normalize( name ) Club.normalize( name ); end
26
+ def strip_wiki( name) Club.strip_wiki( name ); end
27
+
28
+
20
29
  def initialize( recs )
21
30
  @pages_by_country = {}
22
31
 
@@ -24,21 +33,11 @@ class WikiIndex
24
33
  ## check for duplicate recs - report and exit on duplicate!!!!!!
25
34
  recs.each do |rec|
26
35
  h = @pages_by_country[ rec.country.key ] ||= {}
27
- h[ normalize(rec.name) ] = rec
36
+ h[ normalize( strip_wiki( rec.name )) ] = rec
28
37
  end
29
38
  end
30
39
 
31
40
 
32
- def normalize( name )
33
- ## todo/fix: (re)use normalize from Club!!!!
34
- name = name.gsub( /[\-\.]/, '' )
35
- name = name.gsub( ' ', '' ) ## remove spaces too
36
- name = name.downcase
37
- name
38
- end
39
-
40
-
41
-
42
41
  def find_by( club: ) ## todo/check: use find_by_club - why? why not?
43
42
  find_by_club( club )
44
43
  end
@@ -55,8 +54,8 @@ class WikiIndex
55
54
  ## todo/check: sort names ?
56
55
  ## sort by longest first (for best match)
57
56
  names.each do |name|
58
- ## todo/fix: name - remove/string year and lang e.g. (1946-2001), [en]!!!!
59
- rec = h[ normalize( name ) ]
57
+ ## note: normalize AND sanitize (e.g. remove/strip year and lang e.g. (1946-2001), [en] too)
58
+ rec = h[ normalize( strip_year( strip_lang( name ))) ]
60
59
  break if rec ## bingo!! found - break on first match
61
60
  end
62
61
  end
@@ -12,9 +12,10 @@ class TestVariants < MiniTest::Test
12
12
  def variants( name ) SportDb::Import::Variant.find( name ); end
13
13
 
14
14
 
15
- def test_downcase
16
- ## todo/fix: use our own downcase for normalize - why? why not?
17
- ## assert_equal 'äöü', 'ÄÖÜ'.downcase
15
+ def test_downcase_i18n
16
+ assert_equal 'abcdefghijklmnopqrstuvwxyz', downcase_i18n( 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' )
17
+ assert_equal 'äöü', downcase_i18n( 'ÄÖÜ' )
18
+ assert_equal 'köln', downcase_i18n( 'KÖLN' )
18
19
  end
19
20
 
20
21
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-config
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-05 00:00:00.000000000 Z
11
+ date: 2019-08-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csvreader