sportdb-formats 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,490 @@
1
+
2
+ module SportDb
3
+ class CsvMatchParser
4
+
5
+ #############
6
+ # helpers
7
+ def self.find_seasons( path, col: 'Season', sep: nil, headers: nil )
8
+
9
+ ## check if headers incl. season if yes,has priority over col mapping
10
+ ## e.g. no need to specify twice (if using headers)
11
+ col = headers[:season] if headers && headers[:season]
12
+
13
+ seasons = Hash.new( 0 ) ## default value is 0
14
+
15
+ ## todo/fix: yes, use CsvHash.foreach - why? why not?
16
+ ## use read_csv with block to switch to foreach!!!!
17
+ rows = read_csv( path, sep: sep )
18
+
19
+ rows.each_with_index do |row,i|
20
+ puts "[#{i}] " + row.inspect if i < 2
21
+
22
+ season = row[ col ] ## column name defaults to 'Season'
23
+ seasons[ season ] += 1
24
+ end
25
+
26
+ pp seasons
27
+
28
+ ## note: only return season keys/names (not hash with usage counter)
29
+ seasons.keys
30
+ end
31
+
32
+
33
+ ##########
34
+ # main machinery
35
+
36
+ ## todo/fix: use a generic "global" parse_csv method - why? why not?
37
+ ## def self.parse_csv( text, sep: ',' ) ## helper -lets you change the csv library in one place if needed/desired
38
+ ## ## note: do NOT symbolize keys - keep them as is!!!!!!
39
+ ## ## todo/fix: move "upstream" and remove symbolize keys too!!! - why? why not?
40
+ ## CsvHash.parse( text, sep: sep )
41
+ ## end
42
+
43
+ def self.read( path, headers: nil, filters: nil, converters: nil, sep: nil )
44
+ txt = File.open( path, 'r:utf-8' ) {|f| f.read } ## note: make sure to use (assume) utf-8
45
+ parse( txt, headers: headers,
46
+ filters: filters,
47
+ converters: converters,
48
+ sep: sep )
49
+ end
50
+
51
+ def self.parse( txt, headers: nil, filters: nil, converters: nil, sep: nil )
52
+ new( txt ).parse( headers: headers,
53
+ filters: filters,
54
+ converters: converters,
55
+ sep: sep )
56
+ end
57
+
58
+
59
+ def initialize( txt )
60
+ @txt = txt
61
+ end
62
+
63
+ def parse( headers: nil, filters: nil, converters: nil, sep: nil )
64
+
65
+ headers_mapping = {}
66
+
67
+ rows = parse_csv( @txt, sep: sep )
68
+
69
+ return [] if rows.empty? ## no rows / empty?
70
+
71
+
72
+ ## fix/todo: use logger!!!!
73
+ ## pp csv
74
+
75
+ if headers ## use user supplied headers if present
76
+ headers_mapping = headers_mapping.merge( headers )
77
+ else
78
+
79
+ ## note: returns an array of strings (header names) - assume all rows have the same columns/fields!!!
80
+ headers = rows[0].keys
81
+ pp headers
82
+
83
+ # note: greece 2001-02 etc. use HT - check CVS reader row['HomeTeam'] may not be nil but an empty string?
84
+ # e.g. row['HomeTeam'] || row['HT'] will NOT work for now
85
+
86
+ if find_header( headers, ['Team 1']) && find_header( headers, ['Team 2'])
87
+ ## assume our own football.csv format, see github.com/footballcsv
88
+ headers_mapping[:team1] = find_header( headers, ['Team 1'] )
89
+ headers_mapping[:team2] = find_header( headers, ['Team 2'] )
90
+ headers_mapping[:date] = find_header( headers, ['Date'] )
91
+ headers_mapping[:time] = find_header( headers, ['Time'] )
92
+
93
+ ## check for all-in-one full time (ft) and half time (ht9 scores?
94
+ headers_mapping[:score] = find_header( headers, ['FT'] )
95
+ headers_mapping[:scorei] = find_header( headers, ['HT'] )
96
+
97
+ headers_mapping[:round] = find_header( headers, ['Round', 'Matchday'] )
98
+
99
+ ## optional headers - note: find_header returns nil if header NOT found
100
+ header_stage = find_header( headers, ['Stage'] )
101
+ headers_mapping[:stage] = header_stage if header_stage
102
+
103
+ header_group = find_header( headers, ['Group'] )
104
+ headers_mapping[:group] = header_group if header_group
105
+
106
+
107
+ header_et = find_header( headers, ['ET', 'AET'] ) ## (after) extra time
108
+ headers_mapping[:score_et] = header_et if header_et
109
+
110
+ header_p = find_header( headers, ['P', 'PEN'] ) ## penalties
111
+ headers_mapping[:score_p] = header_p if header_p
112
+
113
+ header_notes = find_header( headers, ['Notes', 'Comments'] )
114
+ headers_mapping[:notes] = header_notes if header_notes
115
+
116
+
117
+ header_league = find_header( headers, ['League'] )
118
+ headers_mapping[:league] = header_league if header_league
119
+ else
120
+ ## else try footballdata.uk and others
121
+ headers_mapping[:team1] = find_header( headers, ['HomeTeam', 'HT', 'Home'] )
122
+ headers_mapping[:team2] = find_header( headers, ['AwayTeam', 'AT', 'Away'] )
123
+ headers_mapping[:date] = find_header( headers, ['Date'] )
124
+ headers_mapping[:time] = find_header( headers, ['Time'] )
125
+
126
+ ## note: FT = Full Time, HG = Home Goal, AG = Away Goal
127
+ headers_mapping[:score1] = find_header( headers, ['FTHG', 'HG'] )
128
+ headers_mapping[:score2] = find_header( headers, ['FTAG', 'AG'] )
129
+
130
+ ## check for half time scores ?
131
+ ## note: HT = Half Time
132
+ headers_mapping[:score1i] = find_header( headers, ['HTHG'] )
133
+ headers_mapping[:score2i] = find_header( headers, ['HTAG'] )
134
+ end
135
+ end
136
+
137
+ pp headers_mapping
138
+
139
+ ### todo/fix: check headers - how?
140
+ ## if present HomeTeam or HT required etc.
141
+ ## issue error/warn is not present
142
+ ##
143
+ ## puts "*** !!! wrong (unknown) headers format; cannot continue; fix it; sorry"
144
+ ## exit 1
145
+ ##
146
+
147
+ matches = []
148
+
149
+ rows.each_with_index do |row,i|
150
+
151
+ ## fix/todo: use logger!!!!
152
+ ## puts "[#{i}] " + row.inspect if i < 2
153
+
154
+
155
+ ## todo/fix: move to its own (helper) method - filter or such!!!!
156
+ if filters ## filter MUST match if present e.g. row['Season'] == '2017/2018'
157
+ skip = false
158
+ filters.each do |header, value|
159
+ if row[ header ] != value ## e.g. row['Season']
160
+ skip = true
161
+ break
162
+ end
163
+ end
164
+ next if skip ## if header values NOT matching
165
+ end
166
+
167
+
168
+ ## note:
169
+ ## add converters after filters for now (why not before filters?)
170
+ if converters ## any converters defined?
171
+ ## convert single proc shortcut to array with single converter
172
+ converters = [converters] if converters.is_a?( Proc )
173
+
174
+ ## assumes array of procs
175
+ converters.each do |converter|
176
+ row = converter.call( row )
177
+ end
178
+ end
179
+
180
+
181
+
182
+ team1 = row[ headers_mapping[ :team1 ]]
183
+ team2 = row[ headers_mapping[ :team2 ]]
184
+
185
+
186
+ ## check if data present - if not skip (might be empty row)
187
+ ## note: (old classic) csv reader returns nil for empty fields
188
+ ## new modern csv reader ALWAYS returns strings (and empty strings for data not available (n/a))
189
+ if (team1.nil? || team1.empty?) &&
190
+ (team2.nil? || team2.empty?)
191
+ puts "*** WARN: skipping empty? row[#{i}] - no teams found:"
192
+ pp row
193
+ next
194
+ end
195
+
196
+ ## remove possible match played counters e.g. (4) (11) etc.
197
+ team1 = team1.sub( /\(\d+\)/, '' ).strip
198
+ team2 = team2.sub( /\(\d+\)/, '' ).strip
199
+
200
+
201
+
202
+ col = row[ headers_mapping[ :time ]]
203
+
204
+ if col.nil?
205
+ time = nil
206
+ else
207
+ col = col.strip # make sure not leading or trailing spaces left over
208
+
209
+ if col.empty?
210
+ col =~ /^-{1,}$/ || # e.g. - or ---
211
+ col =~ /^\?{1,}$/ # e.g. ? or ???
212
+ ## note: allow missing / unknown date for match
213
+ time = nil
214
+ else
215
+ if col =~ /^\d{1,2}:\d{2}$/
216
+ time_fmt = '%H:%M' # e.g. 17:00 or 3:00
217
+ elsif col =~ /^\d{1,2}.\d{2}$/
218
+ time_fmt = '%H.%M' # e.g. 17:00 or 3:00
219
+ else
220
+ puts "*** !!! wrong (unknown) time format >>#{col}<<; cannot continue; fix it; sorry"
221
+ ## todo/fix: add to errors/warns list - why? why not?
222
+ exit 1
223
+ end
224
+
225
+ ## todo/check: use date object (keep string?) - why? why not?
226
+ ## todo/fix: yes!! use date object!!!! do NOT use string
227
+ time = Time.strptime( col, time_fmt ).strftime( '%H:%M' )
228
+ end
229
+ end
230
+
231
+
232
+
233
+ col = row[ headers_mapping[ :date ]]
234
+ col = col.strip # make sure not leading or trailing spaces left over
235
+
236
+ if col.empty? ||
237
+ col =~ /^-{1,}$/ || # e.g. - or ---
238
+ col =~ /^\?{1,}$/ # e.g. ? or ???
239
+ ## note: allow missing / unknown date for match
240
+ date = nil
241
+ else
242
+ ## remove possible weekday or weeknumber e.g. (Fri) (4) etc.
243
+ col = col.sub( /\(W?\d{1,2}\)/, '' ) ## e.g. (W11), (4), (21) etc.
244
+ col = col.sub( /\(\w+\)/, '' ) ## e.g. (Fri), (Fr) etc.
245
+ col = col.strip # make sure not leading or trailing spaces left over
246
+
247
+ if col =~ /^\d{2}\/\d{2}\/\d{4}$/
248
+ date_fmt = '%d/%m/%Y' # e.g. 17/08/2002
249
+ elsif col =~ /^\d{2}\/\d{2}\/\d{2}$/
250
+ date_fmt = '%d/%m/%y' # e.g. 17/08/02
251
+ elsif col =~ /^\d{4}-\d{2}-\d{2}$/ ## "standard" / default date format
252
+ date_fmt = '%Y-%m-%d' # e.g. 1995-08-04
253
+ elsif col =~ /^\d{1,2} \w{3} \d{4}$/
254
+ date_fmt = '%d %b %Y' # e.g. 8 Jul 2017
255
+ elsif col =~ /^\w{3} \w{3} \d{1,2} \d{4}$/
256
+ date_fmt = '%a %b %d %Y' # e.g. Sat Aug 7 1993
257
+ else
258
+ puts "*** !!! wrong (unknown) date format >>#{col}<<; cannot continue; fix it; sorry"
259
+ ## todo/fix: add to errors/warns list - why? why not?
260
+ exit 1
261
+ end
262
+
263
+ ## todo/check: use date object (keep string?) - why? why not?
264
+ ## todo/fix: yes!! use date object!!!! do NOT use string
265
+ date = Date.strptime( col, date_fmt ).strftime( '%Y-%m-%d' )
266
+ end
267
+
268
+
269
+ ##
270
+ ## todo/fix: round might not always be just a simple integer number!!!
271
+ ## might be text such as Final | Leg 1 or such!!!!
272
+ round = nil
273
+ ## check for (optional) round / matchday
274
+ if headers_mapping[ :round ]
275
+ col = row[ headers_mapping[ :round ]]
276
+ ## todo: issue warning if not ? or - (and just empty string) why? why not
277
+ ## (old attic) was: round = col.to_i if col =~ /^\d{1,2}$/ # check format - e.g. ignore ? or - or such non-numbers for now
278
+
279
+ ## note: make round always a string for now!!!! e.g. "1", "2" too!!
280
+ round = if col.nil? || col.empty? || col == '-' || col == 'n/a'
281
+ ## note: allow missing round for match / defaults to nil
282
+ nil
283
+ else
284
+ col
285
+ end
286
+ end
287
+
288
+
289
+ score1 = nil
290
+ score2 = nil
291
+ score1i = nil
292
+ score2i = nil
293
+
294
+ ## check for full time scores ?
295
+ if headers_mapping[ :score1 ] && headers_mapping[ :score2 ]
296
+ ft = [ row[ headers_mapping[ :score1 ]],
297
+ row[ headers_mapping[ :score2 ]] ]
298
+
299
+ ## todo/fix: issue warning if not ? or - (and just empty string) why? why not
300
+ score1 = ft[0].to_i if ft[0] =~ /^\d{1,2}$/
301
+ score2 = ft[1].to_i if ft[1] =~ /^\d{1,2}$/
302
+ end
303
+
304
+ ## check for half time scores ?
305
+ if headers_mapping[ :score1i ] && headers_mapping[ :score2i ]
306
+ ht = [ row[ headers_mapping[ :score1i ]],
307
+ row[ headers_mapping[ :score2i ]] ]
308
+
309
+ ## todo/fix: issue warning if not ? or - (and just empty string) why? why not
310
+ score1i = ht[0].to_i if ht[0] =~ /^\d{1,2}$/
311
+ score2i = ht[1].to_i if ht[1] =~ /^\d{1,2}$/
312
+ end
313
+
314
+
315
+ ## check for all-in-one full time scores?
316
+ if headers_mapping[ :score ]
317
+ col = row[ headers_mapping[ :score ]]
318
+ score = parse_score( col )
319
+ if score
320
+ score1 = score[0]
321
+ score2 = score[1]
322
+ else
323
+ puts "!! ERROR - invalid score (ft) format >#{col}<:"
324
+ pp row
325
+ exit 1
326
+ end
327
+ end
328
+
329
+ if headers_mapping[ :scorei ]
330
+ col = row[ headers_mapping[ :scorei ]]
331
+ score = parse_score( col )
332
+ if score
333
+ score1i = score[0]
334
+ score2i = score[1]
335
+ else
336
+ puts "!! ERROR - invalid score (ht) format >#{col}<:"
337
+ pp row
338
+ exit 1
339
+ end
340
+ end
341
+
342
+ ####
343
+ ## try optional score - extra time (et) and penalities (p/pen)
344
+ score1et = nil
345
+ score2et = nil
346
+ score1p = nil
347
+ score2p = nil
348
+
349
+ if headers_mapping[ :score_et ]
350
+ col = row[ headers_mapping[ :score_et ]]
351
+ score = parse_score( col )
352
+ if score
353
+ score1et = score[0]
354
+ score2et = score[1]
355
+ else
356
+ puts "!! ERROR - invalid score (et) format >#{col}<:"
357
+ pp row
358
+ exit 1
359
+ end
360
+ end
361
+
362
+ if headers_mapping[ :score_p ]
363
+ col = row[ headers_mapping[ :score_p ]]
364
+ score = parse_score( col )
365
+ if score
366
+ score1p = score[0]
367
+ score2p = score[1]
368
+ else
369
+ puts "!! ERROR - invalid score (p) format >#{col}<:"
370
+ pp row
371
+ exit 1
372
+ end
373
+ end
374
+
375
+
376
+ ## try some optional headings / columns
377
+ stage = nil
378
+ if headers_mapping[ :stage ]
379
+ col = row[ headers_mapping[ :stage ]]
380
+ ## todo/fix: check can col be nil e.g. col.nil? possible?
381
+ stage = if col.nil? || col.empty? || col == '-' || col == 'n/a'
382
+ ## note: allow missing stage for match / defaults to "regular"
383
+ nil
384
+ elsif col == '?'
385
+ ## note: default explicit unknown to unknown for now AND not regular - why? why not?
386
+ '?' ## todo/check: use unkown and NOT ? - why? why not?
387
+ else
388
+ col
389
+ end
390
+ end
391
+
392
+ group = nil
393
+ if headers_mapping[ :group ]
394
+ col = row[ headers_mapping[ :group ]]
395
+ ## todo/fix: check can col be nil e.g. col.nil? possible?
396
+ group = if col.nil? || col.empty? || col == '-' || col == 'n/a'
397
+ ## note: allow missing stage for match / defaults to "regular"
398
+ nil
399
+ else
400
+ col
401
+ end
402
+ end
403
+
404
+ status = nil ## e.g. AWARDED, CANCELLED, POSTPONED, etc.
405
+ if headers_mapping[ :notes ]
406
+ col = row[ headers_mapping[ :notes ]]
407
+ ## check for optional (match) status in notes / comments
408
+ status = if col.nil? || col.empty? || col == '-' || col == 'n/a'
409
+ nil
410
+ else
411
+ StatusParser.parse( col ) # note: returns nil if no (match) status found
412
+ end
413
+ end
414
+
415
+
416
+ league = nil
417
+ league = row[ headers_mapping[ :league ]] if headers_mapping[ :league ]
418
+
419
+
420
+ ## puts 'match attributes:'
421
+ attributes = {
422
+ date: date,
423
+ time: time,
424
+ team1: team1, team2: team2,
425
+ score1: score1, score2: score2,
426
+ score1i: score1i, score2i: score2i,
427
+ score1et: score1et, score2et: score2et,
428
+ score1p: score1p, score2p: score2p,
429
+ round: round,
430
+ stage: stage,
431
+ group: group,
432
+ status: status,
433
+ league: league
434
+ }
435
+ ## pp attributes
436
+
437
+ match = Sports::Match.new( **attributes )
438
+ matches << match
439
+ end
440
+
441
+ ## pp matches
442
+ matches
443
+ end
444
+
445
+
446
+ private
447
+
448
+ def find_header( headers, candidates )
449
+ ## todo/fix: use find_first from enumare of similar ?! - why? more idiomatic code?
450
+
451
+ candidates.each do |candidate|
452
+ return candidate if headers.include?( candidate ) ## bingo!!!
453
+ end
454
+ nil ## no matching header found!!!
455
+ end
456
+
457
+ ########
458
+ # more helpers
459
+ #
460
+
461
+ def parse_score( str )
462
+ if str.nil? ## todo/check: remove nil case - possible? - why? why not?
463
+ [nil,nil]
464
+ else
465
+ ## remove (optional single) note/footnote/endnote markers
466
+ ## e.g. (*) or (a), (b),
467
+ ## or [*], [A], [1], etc.
468
+ ## - allow (1) or maybe (*1) in the future - why? why not?
469
+ str = str.sub( /\( [a-z*] \)
470
+ |
471
+ \[ [1-9a-z*] \]
472
+ /ix, '' ).strip
473
+
474
+ if str.empty? || str == '?' || str == '-' || str == 'n/a'
475
+ [nil,nil]
476
+ ### todo/check: use regex with named capture groups here - why? why not?
477
+ elsif str =~ /^\d{1,2}[:-]\d{1,2}$/ ## sanity check scores format
478
+ score = str.split( /[:-]/ )
479
+ [score[0].to_i, score[1].to_i]
480
+ else
481
+ nil ## note: returns nil if invalid / unparseable format!!!
482
+ end
483
+ end
484
+ end # method parse_score
485
+
486
+
487
+
488
+ end # class CsvMatchParser
489
+ end # module Sports
490
+
@@ -0,0 +1,90 @@
1
+ #####################
2
+ # helpers for parsing & finding match status e.g.
3
+ # - cancelled / canceled
4
+ # - awarded
5
+ # - abandoned
6
+ # - replay
7
+ # etc.
8
+
9
+
10
+ module SportDb
11
+
12
+
13
+ ### todo/fix: move Status inside Match struct - why? why not?
14
+
15
+ class Status
16
+ # note: use a class as an "enum"-like namespace for now - why? why not?
17
+ # move class into Match e.g. Match::Status - why? why not?
18
+ CANCELLED = 'CANCELLED' # canceled (US spelling), cancelled (UK spelling) - what to use?
19
+ AWARDED = 'AWARDED'
20
+ POSTPONED = 'POSTPONED'
21
+ ABANDONED = 'ABANDONED'
22
+ REPLAY = 'REPLAY'
23
+ end # class Status
24
+
25
+
26
+
27
+ class StatusParser
28
+
29
+ def self.parse( str )
30
+ ## note: returns nil if no match found
31
+ ## note: english usage - cancelled (in UK), canceled (in US)
32
+ if str =~ /^(cancelled|
33
+ canceled|
34
+ can\.
35
+ )/xi
36
+ Status::CANCELLED
37
+ elsif str =~ /^(awarded|
38
+ awd\.
39
+ )/xi
40
+ Status::AWARDED
41
+ elsif str =~ /^(postponed
42
+ )/xi
43
+ Status::POSTPONED
44
+ elsif str =~ /^(abandoned|
45
+ abd\.
46
+ )/xi
47
+ Status::ABANDONED
48
+ elsif str =~ /^(replay
49
+ )/xi
50
+ Status::REPLAY
51
+ else
52
+ # no match
53
+ nil
54
+ end
55
+ end
56
+
57
+
58
+ RUN_RE = /\[
59
+ (?<text>[^\]]+)
60
+ \]
61
+ /x
62
+ def self.find!( line )
63
+ ## for now check all "protected" text run blocks e.g. []
64
+ ## puts "line: >#{line}<"
65
+
66
+ status = nil
67
+
68
+ str = line
69
+ while m = str.match( RUN_RE )
70
+ str = m.post_match ## keep on processing rest of line/str (a.k.a. post match string)
71
+
72
+ ## check for status match
73
+ match_str = m[0] ## keep a copy of the match string (for later sub)
74
+ text = m[:text].strip
75
+ ## puts " text: >#{text}<"
76
+
77
+ status = parse( text )
78
+
79
+ if status
80
+ line.sub!( match_str, "[STATUS.#{status}]" )
81
+ break
82
+ end
83
+ end # while match
84
+
85
+ status
86
+ end # method find!
87
+ end # class StatusParser
88
+
89
+ end # module SportDb
90
+
@@ -8,14 +8,26 @@ module SportDb
8
8
  end
9
9
 
10
10
  include Logging ## e.g. logger#debug, logger#info, etc.
11
- include ParserHelper ## e.g. read_lines, etc.
11
+
12
+ def _read_lines( txt ) ## todo/check: add alias preproc_lines or build_lines or prep_lines etc. - why? why not?
13
+ ## returns an array of lines with comments and empty lines striped / removed
14
+ lines = []
15
+ txt.each_line do |line| ## preprocess
16
+ line = line.strip
17
+
18
+ next if line.empty? || line.start_with?('#') ### skip empty lines and comments
19
+ line = line.sub( /#.*/, '' ).strip ### cut-off end-of line comments too
20
+ lines << line
21
+ end
22
+ lines
23
+ end
12
24
 
13
25
 
14
26
  def initialize( lines )
15
27
  # for convenience split string into lines
16
28
  ## note: removes/strips empty lines
17
29
  ## todo/check: change to text instead of array of lines - why? why not?
18
- @lines = lines.is_a?( String ) ? read_lines( lines ) : lines
30
+ @lines = lines.is_a?( String ) ? _read_lines( lines ) : lines
19
31
  end
20
32
 
21
33