sportdb-config 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,278 +1,278 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class ClubReader
9
-
10
-
11
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
12
- txt = File.open( path, 'r:utf-8' ).read
13
- parse( txt )
14
- end
15
-
16
-
17
- def self.parse( txt )
18
- recs = []
19
- last_rec = nil
20
- headings = [] ## headings stack
21
-
22
- txt.each_line do |line|
23
- line = line.strip
24
-
25
- next if line.empty?
26
- next if line.start_with?( '#' ) ## skip comments too
27
-
28
- ## strip inline (until end-of-line) comments too
29
- ## e.g Eupen => KAS Eupen, ## [de]
30
- ## => Eupen => KAS Eupen,
31
- line = line.sub( /#.*/, '' ).strip
32
- pp line
33
-
34
-
35
- next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
36
-
37
- ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
38
- ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
39
- if line =~ /^(={1,}) ## leading ======
40
- ([^=]+?) ## text (note: for now no "inline" = allowed)
41
- =* ## (optional) trailing ====
42
- $/x
43
- heading_marker = $1
44
- heading_level = $1.length ## count number of = for heading level
45
- heading = $2.strip
46
-
47
- puts "heading #{heading_level} >#{heading}<"
48
-
49
- ## 1) first pop headings if present
50
- while headings.size+1 > heading_level
51
- headings.pop
52
- end
53
-
54
- ## 2) add missing (hierarchy) level if
55
- while headings.size+1 < heading_level
56
- ## todo/fix: issue warning about "skipping" hierarchy level
57
- puts "!!! warn [team reader] - skipping hierarchy level in headings "
58
- headings.push( nil )
59
- end
60
-
61
- if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
62
- ## keep level empty
63
- else
64
-
65
- ## quick hack: if level is 1 assume country for now
66
- ## and extract country code e.g.
67
- ## Austria (at) => at
68
- ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
69
- if heading_level == 1
70
- if heading =~ /\(([a-z]{2,3})\)/i ## note allow (at) or (AUT) too
71
- country_code = $1
72
-
73
- ## check country code - MUST exist for now!!!!
74
- country = SportDb::Import.config.countries[ country_code ]
75
- if country.nil?
76
- puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
77
- exit 1
78
- end
79
-
80
- headings.push( country_code )
81
- else
82
- puts "!!! error - heading level 1 - missing country code - >#{heading}<"
83
- exit 1
84
- end
85
- else
86
- ## quick hack:
87
- ## remove known fill/dummy words incl:
88
- ## Provincia San Juan => San Juan (see argentina, for example)
89
- ##
90
- ## use geo tree long term with alternative names - why? why not?
91
- words = ['Provincia']
92
- words.each { |word| heading = heading.gsub( word, '' ) }
93
- heading = heading.strip
94
-
95
- headings.push( heading )
96
- end
97
-
98
- ## assert that hierarchy level is ok
99
- if headings.size != heading_level
100
- puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
101
- exit 1
102
- end
103
- end
104
-
105
- pp headings
106
-
107
- elsif line.start_with?( '|' )
108
- ## assume continuation with line of alternative names
109
- ## note: skip leading pipe
110
- values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
111
- ## strip and squish (white)spaces
112
- # e.g. New York FC (2011-) => New York FC (2011-)
113
- values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
114
- last_rec.alt_names += values
115
- last_rec.add_variants( values ) # auto-add (possible) auto-generated variant names
116
-
117
- ## check for duplicates
118
- if last_rec.duplicates?
119
- duplicates = last_rec.duplicates
120
- puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
121
- pp duplicates
122
- pp last_rec
123
- ##
124
- ## todo/fix: make it only an error with exit 1
125
- ## if (not normalized) names are the same (not unique/uniq)
126
- ## e.g. don't exit on A.F.C. == AFC etc.
127
- ## exit 1
128
- end
129
- else
130
- values = line.split( ',' )
131
-
132
- rec = Club.new
133
- value = values.shift ## get first item
134
- ## strip and squish (white)spaces
135
- # e.g. New York FC (2011-) => New York FC (2011-)
136
- value = value.strip.gsub( /[ \t]+/, ' ' )
137
- rec.name = value # canoncial name (global unique "beautiful/long" name)
138
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
139
-
140
- ## note:
141
- ## check/todo!!!!!!!!!!!!!!!!!-
142
- ## strip year if to present e.g. (2011-)
143
- ##
144
- ## do NOT strip for defunct / historic clubs e.g.
145
- ## (1899-1910)
146
- ## or (-1914) or (-2011) etc.
147
-
148
- ###
149
- ## todo: move year out of canonical team name - why? why not?
150
-
151
- ## check if canonical name include (2011-) or similar in name
152
- ## if yes, remove (2011-) and add to (alt) names
153
- ## e.g. New York FC (2011) => New York FC
154
- if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
155
- name = rec.name.gsub( /\(.+?\)/, '' ).strip
156
-
157
- if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
158
- rec.year = $1.to_i
159
- elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
160
- rec.year_end = $1.to_i
161
- elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
162
- rec.year = $1.to_i
163
- rec.year_end = $2.to_i
164
- else
165
- ## todo/check: warn about unknown year format
166
- end
167
- end
168
-
169
- ## todo/check - check for unknown format values
170
- ## e.g. too many values, duplicate years, etc.
171
- ## check for overwritting, etc.
172
- while values.size > 0
173
- value = values.shift
174
- ## strip and squish (white)spaces
175
- # e.g. León › Guanajuato => León › Guanajuato
176
- value = value.strip.gsub( /[ \t]+/, ' ' )
177
- if value =~/^\d{4}$/ # e.g 1904
178
- ## todo/check: issue warning if year is already set!!!!!!!
179
- if rec.year
180
- puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
181
- pp rec
182
- exit 1
183
- end
184
- rec.year = value.to_i
185
- elsif value.start_with?( '@' ) # e.g. @ Anfield
186
- ## cut-off leading @ and spaces
187
- rec.ground = value[1..-1].strip
188
- else
189
- ## assume city / geo tree
190
- ## split into geo tree
191
- geos = split_geo( value )
192
- city = geos[0]
193
- ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
194
- if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
195
- rec.district = $1.strip
196
- city = city.gsub( /\(.+?\)/, '' ).strip
197
- end
198
- rec.city = city
199
-
200
- if geos.size > 1
201
- ## cut-off city and keep the rest (of geo tree)
202
- rec.geos = geos[1..-1]
203
- end
204
- end
205
- end ## while values
206
-
207
-
208
- ###############
209
- ## use headings text for geo tree
210
-
211
- ## 1) add country if present
212
- if headings.size > 0 && headings[0]
213
- country = SportDb::Import.config.countries[ headings[0] ]
214
- rec.country = country
215
- else
216
- ## make it an error - why? why not?
217
- puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
218
- exit 1
219
- end
220
-
221
- ## 2) check geo tree with headings hierarchy
222
- if headings.size > 1 && headings[1]
223
- geos = split_geo( headings[1] )
224
- if rec.geos
225
- if rec.geos[0] != geos[0]
226
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
227
- exit 1
228
- end
229
- if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
230
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
231
- exit 1
232
- end
233
- else
234
- ## add missing region (state/province) from headings hierarchy
235
- rec.geos = geos
236
- end
237
- end
238
-
239
- last_rec = rec
240
-
241
-
242
- ### todo/fix:
243
- ## auto-add alt name with dots stripped - why? why not?
244
- ## e.g. D.C. United => DC United
245
- ## e.g. Liverpool F.C. => Liverpool FC
246
- ## e.g. St. Albin => St Albin etc.
247
- ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
248
-
249
- ##
250
- ## todo/fix: unify mapping entries
251
- ## always lowercase !!!! (case insensitive)
252
- ## always strip (2011-) !!!
253
- ## always strip dots (e.g. St., F.C, etc.)
254
-
255
- recs << rec
256
- end
257
- end # each_line
258
- recs
259
- end # method read
260
-
261
- ### helpers
262
- def self.split_geo( str )
263
- ## assume city / geo tree
264
- ## strip and squish (white)spaces
265
- # e.g. León › Guanajuato => León › Guanajuato
266
- str = str.strip.gsub( /[ \t]+/, ' ' )
267
-
268
- ## split into geo tree
269
- geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
270
- geos = geos.map { |geo| geo.strip } ## remove all whitespaces
271
- geos
272
- end
273
-
274
- end # class ClubReader
275
-
276
-
277
- end ## module Import
278
- end ## module SportDb
1
+ # encoding: utf-8
2
+
3
+
4
module SportDb
  module Import

    ##
    # Reader for club (team) lists in a line-oriented plain-text format.
    #
    # Line kinds handled by the parser (see ClubReader.parse):
    #
    #   # comment                        - skipped; inline "# ..." gets cut off too
    #   ==========                       - "decorative" heading line, skipped
    #   = Austria (at) =                 - level 1 heading; MUST carry a country code in ()
    #   == Wien ==                       - level 2+ heading; text is kept for the geo tree
    #   == ? ==                          - heading made of ? only; level is left empty
    #   Club Name (2011-), 1904, @ Ground, City (District) › Region
    #                                    - club record (comma-separated values)
    #   | Alt Name | Other Alt Name      - alternative names for the preceding club
    #
    # Fatal input errors are reported with puts and terminate via exit 1.
    # Note: the parser also prints every processed line and heading (debug output).
    class ClubReader

      ## heading line: one or more leading =, text without =, optional trailing =
      HEADING_RE = /^(={1,})    ## leading ======
                    ([^=]+?)    ## text (note: for now no "inline" = allowed)
                    =*          ## (optional) trailing ====
                    $/x

      ##
      # Read the file at path (opened as utf-8) and parse its content.
      #
      # The block form of File.open is used so that the file handle
      # gets closed right after reading (and not leaked).
      #
      # Returns the array of records built by ClubReader.parse.
      def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
        txt = File.open( path, 'r:utf-8' ) { |f| f.read }
        parse( txt )
      end


      ##
      # Parse txt line-by-line and return an array of Club records
      # (in the order of the club lines in txt).
      #
      # Keeps a stack of headings (country code first, then geo text per level)
      # that gets applied to every club record following the headings.
      # Exits with status 1 on fatal input errors (e.g. unknown country code,
      # club without country heading, geo tree / headings mismatch).
      def self.parse( txt )
        recs     = []
        last_rec = nil
        headings = []   ## headings stack

        txt.each_line do |line|
          line = line.strip

          next if line.empty?
          next if line.start_with?( '#' )   ## skip comments too

          ## strip inline (until end-of-line) comments too
          ##   e.g  Eupen => KAS Eupen,    ## [de]
          ##     => Eupen => KAS Eupen,
          line = line.sub( /#.*/, '' ).strip
          pp line

          next if line =~ /^={1,}$/    ## skip "decorative" only heading e.g. ========

          ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
          if line =~ HEADING_RE
            ## number of leading = is the heading level
            update_headings( headings, $1.length, $2.strip )
          elsif line.start_with?( '|' )
            ## assume continuation with line of alternative names
            add_alt_names( last_rec, line )
          else
            rec = build_club( line )
            apply_headings( rec, headings )
            last_rec = rec
            recs << rec
          end
        end # each_line

        recs
      end # method parse


      ### helpers

      ##
      # Split a geo tree string e.g. "León › Guanajuato" into its
      # (stripped) parts e.g. ["León", "Guanajuato"].
      #
      # Note: allows > < or › ‹ as separators.
      def self.split_geo( str )
        ## strip and squish (white)spaces first
        squish( str ).split( /[<>‹›]/ ).map { |geo| geo.strip }
      end


      ## strip and squish (white)spaces
      ##   e.g. New York FC      (2011-)  => New York FC (2011-)
      def self.squish( str )
        str.strip.gsub( /[ \t]+/, ' ' )
      end


      ## update the headings stack (in-place) for a heading line
      ##   with the passed-in level (number of leading =) and text
      def self.update_headings( headings, heading_level, heading )
        puts "heading #{heading_level} >#{heading}<"

        ## 1) first pop headings if present
        headings.pop   while headings.size+1 > heading_level

        ## 2) add missing (hierarchy) level if
        while headings.size+1 < heading_level
          puts "!!! warn [team reader] - skipping hierarchy level in headings "
          headings.push( nil )
        end

        ## note: use ? or ?? or ??? to keep the level empty (nothing gets pushed)
        unless heading =~ /^\?+$/
          if heading_level == 1
            ## quick hack: if level is 1 assume country for now
            ##  and extract country code e.g.
            ##   Austria (at)  => at
            if heading =~ /\(([a-z]{2,3})\)/i   ## note allow (at) or (AUT) too
              country_code = $1

              ## check country code - MUST exist for now!!!!
              country = SportDb::Import.config.countries[ country_code ]
              if country.nil?
                puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
                exit 1
              end

              headings.push( country_code )
            else
              puts "!!! error - heading level 1 - missing country code - >#{heading}<"
              exit 1
            end
          else
            ## quick hack:
            ##   remove known fill/dummy words incl:
            ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
            words = ['Provincia']
            words.each { |word| heading = heading.gsub( word, '' ) }
            heading = heading.strip

            headings.push( heading )
          end

          ## assert that hierarchy level is ok
          ##   note: report the size of the headings stack (not of the heading text)
          if headings.size != heading_level
            puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
            exit 1
          end
        end

        pp headings
      end


      ## add the alternative names of a "| Name | Name" line to the preceding club
      def self.add_alt_names( last_rec, line )
        ## an alternative names line is only valid following a club line
        if last_rec.nil?
          puts "!!! error - alternative names without preceding club - >#{line}<"
          exit 1
        end

        ## note: skip leading pipe
        values = line[1..-1].split( '|' )   # team names - allow/use pipe(|)
        values = values.map { |value| squish( value ) }

        last_rec.alt_names += values
        last_rec.add_variants( values )     # auto-add (possible) auto-generated variant names

        ## check for duplicates
        if last_rec.duplicates?
          duplicates = last_rec.duplicates
          puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
          pp duplicates
          pp last_rec
          ## todo/fix: make it only an error with exit 1
          ##   if (not normalized) names are the same (not unique/uniq)
          ##   e.g. don't exit on A.F.C. == AFC etc.
        end
      end


      ## build a new club record from a comma-separated club line
      ##   first value is the name; all others get sorted out by format
      ##   (four digits => year, leading @ => ground, anything else => city / geo tree)
      def self.build_club( line )
        values = line.split( ',' )

        rec   = Club.new
        value = squish( values.shift )   ## get first item
        rec.name = value                 # canonical name (global unique "beautiful/long" name)
        rec.add_variants( value )        # auto-add (possible) auto-generated variant names

        ## check if canonical name includes (2011-) or similar
        ##   note: the year marker stays in the name for now
        ##   todo: move year out of canonical team name - why? why not?
        if rec.name =~ /\(([0-9]{4})-\)/                  ## e.g. (2014-)
          rec.year     = $1.to_i
        elsif rec.name =~ /\(-([0-9]{4})\)/               ## e.g. (-2014)
          rec.year_end = $1.to_i
        elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/     ## e.g. (2011-2014)
          rec.year     = $1.to_i
          rec.year_end = $2.to_i
        else
          ## todo/check: warn about unknown year format
        end

        ## todo/check - check for unknown format values
        ##   e.g. too many values, duplicate years, etc.
        values.each do |val|
          val = squish( val )
          if val =~ /^\d{4}$/        # e.g 1904
            ## note: year might be set already via name e.g. (2011-)
            if rec.year
              puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{val}:"
              pp rec
              exit 1
            end
            rec.year = val.to_i
          elsif val.start_with?( '@' )    # e.g. @ Anfield
            ## cut-off leading @ and spaces
            rec.ground = val[1..-1].strip
          else
            ## assume city / geo tree
            geos = split_geo( val )
            city = geos[0]
            ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
            if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
              rec.district = $1.strip
              city = city.gsub( /\(.+?\)/, '' ).strip
            end
            rec.city = city

            ## cut-off city and keep the rest (of geo tree)
            rec.geos = geos[1..-1]   if geos.size > 1
          end
        end

        rec
      end


      ## use headings (country code and geo text) for the club record
      def self.apply_headings( rec, headings )
        ## 1) add country - MUST be present
        if headings[0]
          rec.country = SportDb::Import.config.countries[ headings[0] ]
        else
          puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
          exit 1
        end

        ## 2) check geo tree with headings hierarchy
        if headings[1]
          geos = split_geo( headings[1] )
          if rec.geos
            if rec.geos[0] != geos[0]
              puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
              exit 1
            end
            if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
              puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
              exit 1
            end
          else
            ## add missing region (state/province) from headings hierarchy
            rec.geos = geos
          end
        end
      end

      private_class_method :squish, :update_headings, :add_alt_names,
                           :build_club, :apply_headings

    end # class ClubReader

  end ## module Import
end ## module SportDb