sportdb-structs 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,490 +0,0 @@
1
-
2
module SportDb
  # Parses match records out of CSV text and returns them as
  # Sports::Match structs.
  #
  # Two column layouts are auto-detected when no explicit headers mapping
  # is passed in:
  # - the football.csv layout (has "Team 1" and "Team 2" columns, with
  #   all-in-one score columns such as "FT" / "HT"), see github.com/footballcsv
  # - the footballdata.uk-style layout ("HomeTeam" / "AwayTeam" with
  #   separate goal columns such as "FTHG" / "FTAG")
  #
  # Note: malformed time / date / score values are reported on stdout and
  # abort the process with `exit 1`.
  class CsvMatchParser
    # Placeholder cell values that mark an unknown time or date.
    DASHES_RE    = /^-{1,}$/    # e.g. - or ---
    QUESTIONS_RE = /^\?{1,}$/   # e.g. ? or ???

    # Accepted time notations paired with their strptime format.
    TIME_FORMATS = [
      [/^\d{1,2}:\d{2}$/,  '%H:%M'],   # e.g. 17:00 or 3:00
      [/^\d{1,2}\.\d{2}$/, '%H.%M']    # e.g. 17.00 or 3.00
    ].freeze

    # Accepted date notations paired with their strptime format.
    DATE_FORMATS = [
      [/^\d{2}\/\d{2}\/\d{4}$/,          '%d/%m/%Y'],      # e.g. 17/08/2002
      [/^\d{2}\/\d{2}\/\d{2}$/,          '%d/%m/%y'],      # e.g. 17/08/02
      [/^\d{4}-\d{2}-\d{2}$/,            '%Y-%m-%d'],      # e.g. 1995-08-04
      [/^\d{1,2} \w{3} \d{4}$/,          '%d %b %Y'],      # e.g. 8 Jul 2017
      [/^\w{3} \w{3} \d{1,2} \d{4}$/,    '%a %b %d %Y']    # e.g. Sat Aug 7 1993
    ].freeze

    # Optional football.csv columns: mapping key => candidate header names.
    # Only added to the headers mapping if one of the candidates is present.
    OPTIONAL_HEADERS = {
      stage:    ['Stage'],
      group:    ['Group'],
      score_et: ['ET', 'AET'],          # (after) extra time
      score_p:  ['P', 'PEN'],           # penalties
      notes:    ['Notes', 'Comments'],
      league:   ['League']
    }.freeze

    # Single (optional) note / footnote marker attached to a score,
    # e.g. (*) or (a), (b), or [*], [A], [1], etc.
    NOTE_MARKER_RE = /\( [a-z*] \)
                       |
                      \[ [1-9a-z*] \]
                     /ix

    #############
    # helpers

    # Collects the distinct values of the season column of a CSV file.
    #
    # path    - file path handed to read_csv
    # col     - name of the season column (defaults to 'Season')
    # sep     - separator handed through to read_csv
    # headers - optional headers mapping; headers[:season] (if present)
    #           takes priority over col
    #
    # Returns the season values in order of first appearance
    # (the usage counters are printed, but not returned).
    def self.find_seasons( path, col: 'Season', sep: nil, headers: nil )
      col = headers[:season] if headers && headers[:season]

      seasons = Hash.new( 0 )   ## usage counter per season; default value is 0

      read_csv( path, sep: sep ).each_with_index do |row, i|
        puts "[#{i}] " + row.inspect if i < 2   ## show the first two rows only
        seasons[ row[ col ] ] += 1
      end

      pp seasons

      seasons.keys
    end

    ##########
    # main machinery

    # Reads the file at path (as utf-8) and parses it, see #parse.
    def self.read( path, headers: nil, filters: nil, converters: nil, sep: nil )
      txt = File.open( path, 'r:utf-8' ) { |f| f.read }
      parse( txt, headers:    headers,
                  filters:    filters,
                  converters: converters,
                  sep:        sep )
    end

    # Parses the CSV text txt, see #parse.
    def self.parse( txt, headers: nil, filters: nil, converters: nil, sep: nil )
      new( txt ).parse( headers:    headers,
                        filters:    filters,
                        converters: converters,
                        sep:        sep )
    end

    # txt - the CSV text to parse
    def initialize( txt )
      @txt = txt
    end

    # Parses the CSV text into an array of Sports::Match structs.
    #
    # headers    - optional mapping of attribute keys (:team1, :team2, :date,
    #              :time, :score, ...) to column names; if missing, the
    #              mapping is auto-detected from the columns of the first row
    # filters    - optional hash of column name => required value; rows where
    #              any value differs are skipped
    # converters - optional proc (or array of procs); each is called with the
    #              row and must return the (converted) row; applied after
    #              the filters
    # sep        - separator handed through to parse_csv
    #
    # Returns an empty array if there are no rows.
    def parse( headers: nil, filters: nil, converters: nil, sep: nil )
      rows = parse_csv( @txt, sep: sep )
      return [] if rows.empty?

      ## note: user supplied headers (if present) win over auto-detection
      mapping = headers ? {}.merge( headers ) : detect_headers( rows[0].keys )
      pp mapping

      ## convert single proc shortcut to array with single converter
      converters = [converters] if converters.is_a?( Proc )

      matches = []

      rows.each_with_index do |row, i|
        ## filters MUST all match if present e.g. row['Season'] == '2017/2018'
        next if filters && !filters.all? { |header, value| row[ header ] == value }

        if converters
          converters.each { |converter| row = converter.call( row ) }
        end

        team1 = row[ mapping[ :team1 ]]
        team2 = row[ mapping[ :team2 ]]

        ## note: depending on the csv reader empty fields are nil or empty strings
        if empty_value?( team1 ) && empty_value?( team2 )
          puts "*** WARN: skipping empty? row[#{i}] - no teams found:"
          pp row
          next
        end

        ## remove possible match played counters e.g. (4) (11) etc.
        team1 = team1.sub( /\(\d+\)/, '' ).strip
        team2 = team2.sub( /\(\d+\)/, '' ).strip

        time = parse_time( row[ mapping[ :time ]] )
        date = parse_date( row[ mapping[ :date ]] )

        ## note: round is always kept as a string e.g. "1", "2", "Final"
        round = mapping[ :round ] ? optional_value( row[ mapping[ :round ]] ) : nil

        score1  = score2  = nil
        score1i = score2i = nil

        ## separate full time goal columns
        if mapping[ :score1 ] && mapping[ :score2 ]
          score1 = to_goals( row[ mapping[ :score1 ]] )
          score2 = to_goals( row[ mapping[ :score2 ]] )
        end

        ## separate half time goal columns
        if mapping[ :score1i ] && mapping[ :score2i ]
          score1i = to_goals( row[ mapping[ :score1i ]] )
          score2i = to_goals( row[ mapping[ :score2i ]] )
        end

        ## all-in-one score columns (take priority over the separate columns)
        if mapping[ :score ]
          score1, score2 = read_score( row, mapping[ :score ], 'ft' )
        end

        if mapping[ :scorei ]
          score1i, score2i = read_score( row, mapping[ :scorei ], 'ht' )
        end

        ## optional scores - extra time (et) and penalties (p/pen)
        score1et = score2et = nil
        score1p  = score2p  = nil

        if mapping[ :score_et ]
          score1et, score2et = read_score( row, mapping[ :score_et ], 'et' )
        end

        if mapping[ :score_p ]
          score1p, score2p = read_score( row, mapping[ :score_p ], 'p' )
        end

        ## note: an explicit '?' stage is kept as is (unknown, not missing)
        stage = mapping[ :stage ] ? optional_value( row[ mapping[ :stage ]] ) : nil
        group = mapping[ :group ] ? optional_value( row[ mapping[ :group ]] ) : nil

        status = nil   ## e.g. AWARDED, CANCELLED, POSTPONED, etc.
        if mapping[ :notes ]
          notes  = optional_value( row[ mapping[ :notes ]] )
          ## note: StatusParser.parse returns nil if no (match) status found
          status = StatusParser.parse( notes ) if notes
        end

        league = mapping[ :league ] ? row[ mapping[ :league ]] : nil

        attributes = {
          date:     date,
          time:     time,
          team1:    team1,    team2:    team2,
          score1:   score1,   score2:   score2,
          score1i:  score1i,  score2i:  score2i,
          score1et: score1et, score2et: score2et,
          score1p:  score1p,  score2p:  score2p,
          round:    round,
          stage:    stage,
          group:    group,
          status:   status,
          league:   league
        }

        matches << Sports::Match.new( **attributes )
      end

      matches
    end

    private

    # Returns the first candidate that is included in headers
    # or nil if no candidate is present.
    def find_header( headers, candidates )
      candidates.find { |candidate| headers.include?( candidate ) }
    end

    # Builds the headers mapping (attribute key => column name) from the
    # column names of the first row; assumes all rows have the same columns.
    def detect_headers( headers )
      pp headers

      mapping = {}

      if find_header( headers, ['Team 1'] ) && find_header( headers, ['Team 2'] )
        ## assume our own football.csv format, see github.com/footballcsv
        mapping[:team1]  = find_header( headers, ['Team 1'] )
        mapping[:team2]  = find_header( headers, ['Team 2'] )
        mapping[:date]   = find_header( headers, ['Date'] )
        mapping[:time]   = find_header( headers, ['Time'] )

        ## all-in-one full time (ft) and half time (ht) scores
        mapping[:score]  = find_header( headers, ['FT'] )
        mapping[:scorei] = find_header( headers, ['HT'] )

        mapping[:round]  = find_header( headers, ['Round', 'Matchday'] )

        OPTIONAL_HEADERS.each do |key, candidates|
          header = find_header( headers, candidates )
          mapping[ key ] = header if header
        end
      else
        ## else try footballdata.uk and others
        mapping[:team1]   = find_header( headers, ['HomeTeam', 'HT', 'Home'] )
        mapping[:team2]   = find_header( headers, ['AwayTeam', 'AT', 'Away'] )
        mapping[:date]    = find_header( headers, ['Date'] )
        mapping[:time]    = find_header( headers, ['Time'] )

        ## note: FT = Full Time, HG = Home Goal, AG = Away Goal
        mapping[:score1]  = find_header( headers, ['FTHG', 'HG'] )
        mapping[:score2]  = find_header( headers, ['FTAG', 'AG'] )

        ## note: HT = Half Time
        mapping[:score1i] = find_header( headers, ['HTHG'] )
        mapping[:score2i] = find_header( headers, ['HTAG'] )
      end

      mapping
    end

    # True if a cell has no data (nil or empty string).
    def empty_value?( value )
      value.nil? || value.empty?
    end

    # Returns nil for a missing optional cell (nil, empty, '-' or 'n/a')
    # or the cell value as is otherwise.
    def optional_value( col )
      return nil if col.nil? || col.empty? || col == '-' || col == 'n/a'
      col
    end

    # Converts a single goals cell to an integer; returns nil for anything
    # that is not a one- or two-digit number (e.g. ?, - or empty).
    def to_goals( value )
      value.to_i if value =~ /^\d{1,2}$/
    end

    # Reads and parses the all-in-one score column header of row.
    # label (ft, ht, et or p) is only used for the error message.
    # Exits if the score cannot be parsed.
    def read_score( row, header, label )
      col   = row[ header ]
      score = parse_score( col )
      return score if score

      puts "!! ERROR - invalid score (#{label}) format >#{col}<:"
      pp row
      exit 1
    end

    # Normalizes a time cell to 'HH:MM'.
    # Returns nil if the cell is missing, empty or a placeholder (-, ---, ?, ???).
    # Exits on an unknown time format.
    def parse_time( col )
      return nil if col.nil?

      col = col.strip   # make sure no leading or trailing spaces left over
      return nil if col.empty? || col =~ DASHES_RE || col =~ QUESTIONS_RE

      _, time_fmt = TIME_FORMATS.find { |pattern, _fmt| col =~ pattern }
      unless time_fmt
        puts "*** !!! wrong (unknown) time format >>#{col}<<; cannot continue; fix it; sorry"
        exit 1
      end

      Time.strptime( col, time_fmt ).strftime( '%H:%M' )
    end

    # Normalizes a date cell to 'YYYY-MM-DD'.
    # Returns nil if the cell is missing, empty or a placeholder (-, ---, ?, ???).
    # Exits on an unknown date format.
    def parse_date( col )
      return nil if col.nil?

      col = col.strip   # make sure no leading or trailing spaces left over
      return nil if col.empty? || col =~ DASHES_RE || col =~ QUESTIONS_RE

      ## remove possible weekday or weeknumber e.g. (Fri) (4) etc.
      col = col.sub( /\(W?\d{1,2}\)/, '' )   ## e.g. (W11), (4), (21) etc.
      col = col.sub( /\(\w+\)/, '' )         ## e.g. (Fri), (Fr) etc.
      col = col.strip

      _, date_fmt = DATE_FORMATS.find { |pattern, _fmt| col =~ pattern }
      unless date_fmt
        puts "*** !!! wrong (unknown) date format >>#{col}<<; cannot continue; fix it; sorry"
        exit 1
      end

      Date.strptime( col, date_fmt ).strftime( '%Y-%m-%d' )
    end

    ########
    # more helpers
    #

    # Parses an all-in-one score string e.g. 2-1 or 2:1.
    #
    # Returns [score1, score2] as integers,
    #         [nil, nil] if the score is missing / unknown (nil, empty, ?, - or n/a),
    #         nil if the format is invalid / unparseable.
    def parse_score( str )
      return [nil, nil] if str.nil?

      str = str.sub( NOTE_MARKER_RE, '' ).strip

      if str.empty? || str == '?' || str == '-' || str == 'n/a'
        [nil, nil]
      elsif str =~ /^\d{1,2}[:-]\d{1,2}$/   ## sanity check scores format
        score = str.split( /[:-]/ )
        [score[0].to_i, score[1].to_i]
      else
        nil
      end
    end
  end # class CsvMatchParser
end # module SportDb
490
-
@@ -1,90 +0,0 @@
1
- #####################
2
- # helpers for parsing & finding match status e.g.
3
- # - cancelled / canceled
4
- # - awarded
5
- # - abandoned
6
- # - replay
7
- # etc.
8
-
9
-
10
module SportDb
  # Enum-like namespace holding the known match status values.
  class Status
    CANCELLED = 'CANCELLED'   # canceled (US spelling), cancelled (UK spelling)
    AWARDED   = 'AWARDED'
    POSTPONED = 'POSTPONED'
    ABANDONED = 'ABANDONED'
    REPLAY    = 'REPLAY'
  end # class Status

  # Finds a match status (see Status) in free-form notes text.
  class StatusParser
    # Status value paired with the pattern the text must start with;
    # checked top to bottom, first match wins.
    # note: english usage - cancelled (in UK), canceled (in US)
    STATUS_PATTERNS = [
      [Status::CANCELLED, /^(cancelled|
                              canceled|
                              can\.
                           )/xi],
      [Status::AWARDED,   /^(awarded|
                              awd\.
                           )/xi],
      [Status::POSTPONED, /^(postponed
                           )/xi],
      [Status::ABANDONED, /^(abandoned|
                              abd\.
                           )/xi],
      [Status::REPLAY,    /^(replay
                           )/xi]
    ].freeze

    # Returns the status value whose pattern matches the start of str
    # (ignoring case) or nil if none matches.
    def self.parse( str )
      status, _pattern = STATUS_PATTERNS.find { |_status, pattern| str =~ pattern }
      status
    end

    # A "protected" text run, that is, any text enclosed in square brackets.
    RUN_RE = /\[
                (?<text>[^\]]+)
              \]
             /x

    # Checks the bracketed text runs of line left to right for a status.
    #
    # On the first run that parses to a status the run is replaced IN PLACE
    # (line gets mutated) with [STATUS.<status>] and the search stops.
    #
    # Returns the status found or nil.
    def self.find!( line )
      status = nil
      offset = 0

      while (run = line.match( RUN_RE, offset ))
        offset = run.end( 0 )   ## continue after this run on the next pass

        status = parse( run[:text].strip )
        next unless status

        line.sub!( run[0], "[STATUS.#{status}]" )
        break
      end

      status
    end # method find!
  end # class StatusParser
end # module SportDb
90
-
@@ -1,87 +0,0 @@
1
-
2
module SportDb
  # Helpers for cleaning up and normalizing names.
  module NameHelper
    ## year(s) in parentheses e.g. (1887-1911), (-2013), (1946-2001, 2013-) etc.
    ##  note: also allows placeholder years e.g. (-___) or (-????)
    ##        for marking missing (to be filled in) years
    YEAR_RE = %r{\(
                  [0-9, ?_-]+?      # note: non-greedy (minimum/first) match
                 \)}x

    # Removes all year markers from name and strips
    # leading / trailing whitespace.
    def strip_year( name )
      without_years = name.gsub( YEAR_RE, '' )
      without_years.strip
    end

    # Returns the index of the first year marker in name or nil if none.
    def has_year?( name )
      name =~ YEAR_RE
    end

    ## lang code in square brackets e.g. [en], [fr]
    ##  note: single-letter codes e.g. [a] or [d] or [e] are allowed too
    LANG_RE = %r{\[
                 [a-z]{1,2}
                \]}x

    # Removes all lang code markers from name and strips
    # leading / trailing whitespace.
    def strip_lang( name )
      without_langs = name.gsub( LANG_RE, '' )
      without_langs.strip
    end

    # Returns the index of the first lang code marker in name or nil if none.
    def has_lang?( name )
      name =~ LANG_RE
    end

    # Removes year markers first, then lang code markers.
    def sanitize( name )
      strip_lang( strip_year( name ) )
    end

    ## characters removed for norm(alizing) names e.g.
    ##   Estudiantes (LP)             => Estudiantes LP
    ##   Saint Patrick’s Athletic FC  => Saint Patricks Athletic FC
    ##   Myllykosken Pallo −47        => Myllykosken Pallo 47
    ##   Brighton & Hove Albion       => Brighton  Hove Albion
    ##
    ##   .  U+002E (46)   - FULL STOP
    ##   '  U+0027 (39)   - APOSTROPHE
    ##   ’  U+2019 (8217) - RIGHT SINGLE QUOTATION MARK
    ##   º  U+00BA (186)  - MASCULINE ORDINAL INDICATOR
    ##   /  U+002F (47)   - SOLIDUS
    ##   (  U+0028 (40)   - LEFT PARENTHESIS
    ##   )  U+0029 (41)   - RIGHT PARENTHESIS
    ##   &  U+0026 (38)   - AMPERSAND
    ##   _  U+005F (95)   - LOW LINE
    ##   −  U+2212 (8722) - MINUS SIGN
    ##   -  U+002D (45)   - HYPHEN-MINUS
    NORM_RE = %r{
                  [.'’º/()&_−-]
              }x   # note: in [] dash (-) if last doesn't need to get escaped

    # Removes all NORM_RE characters from name (spaces are kept).
    def strip_norm( name )
      name.gsub( NORM_RE, '' )
    end

    # Returns the normalized form of name: NORM_RE characters and all spaces
    # removed, then passed through downcase_i18n.
    # note: does NOT call sanitize (keeps normalize "atomic" for reuse)
    def normalize( name )
      compact = strip_norm( name ).gsub( ' ', '' )
      downcase_i18n( compact )
    end

    # Delegates to Variant.find.
    def variants( name )
      Variant.find( name )
    end
  end # module NameHelper
end # module SportDb
87
-