sportdb-formats 0.4.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/Manifest.txt +24 -4
  3. data/Rakefile +3 -3
  4. data/lib/sportdb/formats.rb +25 -2
  5. data/lib/sportdb/formats/config.rb +40 -0
  6. data/lib/sportdb/formats/datafile.rb +42 -62
  7. data/lib/sportdb/formats/datafile_package.rb +160 -0
  8. data/lib/sportdb/formats/match/conf_parser.rb +120 -0
  9. data/lib/sportdb/formats/match/mapper.rb +319 -0
  10. data/lib/sportdb/formats/match/mapper_teams.rb +23 -0
  11. data/lib/sportdb/formats/match/match_parser.rb +659 -0
  12. data/lib/sportdb/formats/match/match_parser_auto_conf.rb +202 -0
  13. data/lib/sportdb/formats/name_helper.rb +84 -0
  14. data/lib/sportdb/formats/outline_reader.rb +53 -15
  15. data/lib/sportdb/formats/package.rb +172 -160
  16. data/lib/sportdb/formats/parser_helper.rb +81 -0
  17. data/lib/sportdb/formats/score/score_formats.rb +180 -0
  18. data/lib/sportdb/formats/score/score_parser.rb +196 -0
  19. data/lib/sportdb/formats/structs/country.rb +1 -43
  20. data/lib/sportdb/formats/structs/group.rb +25 -0
  21. data/lib/sportdb/formats/structs/league.rb +7 -26
  22. data/lib/sportdb/formats/structs/match.rb +72 -51
  23. data/lib/sportdb/formats/structs/round.rb +14 -4
  24. data/lib/sportdb/formats/structs/season.rb +3 -0
  25. data/lib/sportdb/formats/structs/team.rb +144 -0
  26. data/lib/sportdb/formats/version.rb +2 -2
  27. data/test/helper.rb +83 -1
  28. data/test/test_clubs.rb +3 -3
  29. data/test/test_conf.rb +65 -0
  30. data/test/test_datafile.rb +21 -30
  31. data/test/test_match.rb +0 -6
  32. data/test/test_match_auto.rb +72 -0
  33. data/test/test_match_auto_champs.rb +45 -0
  34. data/test/test_match_auto_euro.rb +37 -0
  35. data/test/test_match_auto_worldcup.rb +61 -0
  36. data/test/test_match_champs.rb +27 -0
  37. data/test/test_match_eng.rb +26 -0
  38. data/test/test_match_euro.rb +27 -0
  39. data/test/test_match_worldcup.rb +27 -0
  40. data/test/test_name_helper.rb +67 -0
  41. data/test/test_outline_reader.rb +3 -3
  42. data/test/test_package.rb +21 -2
  43. data/test/test_package_match.rb +78 -0
  44. data/test/test_scores.rb +67 -51
  45. metadata +32 -12
  46. data/lib/sportdb/formats/scores.rb +0 -253
  47. data/lib/sportdb/formats/structs/club.rb +0 -213
  48. data/test/test_club_helpers.rb +0 -63
  49. data/test/test_datafile_match.rb +0 -65
@@ -0,0 +1,120 @@
1
+ module SportDb
2
+
3
+ class ConfParser
4
+
5
## Convenience entry point: build a parser for the passed-in lines
##  and run it right away.
##
## lines - a text block (String) or an array of lines
##
## Returns whatever the instance-level #parse returns.
def self.parse( lines )
  new( lines ).parse
end
9
+
10
+ include Logging ## e.g. logger#debug, logger#info, etc.
11
+ include ParserHelper ## e.g. read_lines, etc.
12
+
13
+
14
## lines - a text block (String) or an array of lines
##
## For convenience a String gets split into lines via read_lines
##  (from ParserHelper; presumably drops empty lines - see helper to confirm);
##  anything else is stored as is.
##
## todo/check: change to text instead of array of lines - why? why not?
def initialize( lines )
  @lines = if lines.is_a?( String )
             read_lines( lines )
           else
             lines
           end
end
20
+
21
+
22
+
23
## (optional) country code for a team, introduced by one of the
##  separators <>‹›, e.g.
##   Real Madrid, ESP   or   Liverpool › ENG
## note: the match is case-insensitive (i flag)
COUNTRY_RE = %r{ [<>‹›,]
                 [ ]*
                 (?<country>[A-Z]{2,4})   ## todo/check: allow one-letter (motor vehicle plates) or 5 letter possible?
                 \b}xi


## standings table row regex matcher e.g.
##   1  Manchester City  38  32  4  2  106-27  100
## or 1. Manchester City  38  32  4  2  106:27  100
TABLE_RE = %r{ ^
               (?:
                  (?<rank>\d+)\.?
                  |
                  [-]
               )
               [ ]+
               (?<team>.+?)   ## note: let's use non-greedy (MINIMUM length) match for now
               [ ]+
               (?<pld>\d+)    ## (pl)aye(d)
               [ ]+
               (?<w>\d+)      ## (w)ins
               [ ]+
               (?<d>\d+)      ## (d)raws
               [ ]+
               (?<l>\d+)      ## (l)ost
               [ ]+
               (?<gf>\d+)     ## (g)oal (f)or
               [ ]*
               [:-]           ## note: allow 10-10 or 10:10 or 10 - 10 or 10 : 10 etc.
               [ ]*
               (?<ga>\d+)     ## (g)oal (a)gainst
               (?:            ## allow optional (g)oal (d)ifference
                  [ ]+
                  (?<gd>[±+-]?\d+)   ## (g)oal (d)ifference
               )?
               [ ]+
               (?<pts>\d+)    ## (p)oin(ts)
               (?:            ## allow optional deductions e.g. [-7]
                  [ ]+
                  \[(?<deduction>-\d+)\]
               )?
             $}x

## Convert the lines into a team lookup table.
##
## Every line is either
##  - a decorative dash line (e.g. ---- or - - - -)   => skipped
##  - a standings table row (see TABLE_RE)             => team with rank & standing
##  - anything else                                    => the whole line is the team name
## An optional country code (see COUNTRY_RE) gets cut out of the line first.
##
## Returns a Hash keyed by team name; the values are hashes with the
##  (all optional) keys :country, :rank and :standing where
##  :standing holds :pld, :w, :d, :l, :gf, :ga, :pts plus
##  :gd and :deduction if present in the row.
##  Lines repeating a team name update the same entry.
def parse
  teams = {}     ## convert lines to teams

  @lines.each do |line|
    next if line =~ /^[ -]+$/   ## skip decorative lines with dash only (e.g. ---- or - - - -) etc.

    ## quick hack - check for/extract (optional) country code (for teams) first
    ##   allow as separators <>‹›,  NOTE: includes (,) comma for now too
    country = nil
    if (m = COUNTRY_RE.match( line ))
      country = m[:country]
      line    = line.sub( m[0], '' )   ## replace match with nothing for now
    end

    if (m = TABLE_RE.match( line ))
      ## note: report via logger (was a plain puts writing to stdout)
      logger.debug " matching table entry >#{line}<"

      name = m[:team]
      rank = m[:rank] ? Integer(m[:rank]) : nil   ## note: rank is nil for rows starting with a dash (-)

      standing = {
        pld: Integer(m[:pld]),
        w:   Integer(m[:w]),
        d:   Integer(m[:d]),
        l:   Integer(m[:l]),
        gf:  Integer(m[:gf]),
        ga:  Integer(m[:ga]),
      }
      ## note: Integer() chokes on ± and we drop the (redundant) plus sign too
      standing[ :gd ]        = Integer(m[:gd].gsub(/[±+]/,''))  if m[:gd]
      standing[ :pts ]       = Integer(m[:pts])
      standing[ :deduction ] = Integer(m[:deduction])           if m[:deduction]

      ## todo/fix: track double usage - why? why not? report/raise error/exception on duplicates?
      team = teams[ name ] ||= {}
      team[ :country ]  = country   if country
      team[ :rank ]     = rank      if rank
      team[ :standing ] = standing
    else
      ## assume team is full line
      name = line.strip   # note: strip leading and trailing spaces

      team = teams[ name ] ||= {}
      team[ :country ] = country   if country
    end
  end

  teams
end # method parse
118
+
119
+ end # class ConfParser
120
+ end # module SportDb
@@ -0,0 +1,319 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+
5
+ ##
6
+ ## note: this was/is a cut-n-paste (inline) copy of TextUtils::TitleMapper2
7
+ ## see https://github.com/textkit/textutils/blob/master/textutils/lib/textutils/title_mapper2.rb
8
+
9
+
10
+ class MapperV2 ## todo/check: rename to NameMapper/TitleMapper ? why? why not??
11
+
12
+ include Logging
13
+
14
+ attr_reader :known_titles ## rename to mapping or mappings or just titles - why? why not?
15
+
16
+ ########
17
+ ## key: e.g. augsburg
18
+ ## title: e.g. FC Augsburg
19
+ ## length (of title(!!) - not regex pattern): e.g. 11 -- do not count dots (e.g. U.S.A. => 3 or 6) why? why not?
20
########
## key:     e.g. augsburg
## title:   e.g. FC Augsburg
## length (of title(!!) - not regex pattern):  e.g. 11  -- do not count dots (e.g. U.S.A. => 3 or 6) why? why not?
## pattern: regex source (string) used for matching the title
MappingStruct = Struct.new( :key, :title, :length, :pattern )   ## todo/check: use (rename to) TitleStruct - why? why not??

######
## convenience helper - (auto)build ActiveRecord-like team records/structs
Record = Struct.new( :key, :title, :synonyms )

## Build records from text lines in the format
##    title | synonym | synonym ...
##
## txt_or_lines - a text block (String) or an array of lines;
##                  in a text block empty lines and comment lines
##                  (starting with #) get skipped
##
## Returns an array of Record structs (key, title, synonyms) where
##  key is auto-generated from the title and synonyms is
##  a |-joined string (or nil if there are none).
##
## note: lines without a title (e.g. blank lines or a lone |) are skipped -
##         before they raised NoMethodError (on nil) or
##         resulted in a record with an empty title and key.
def build_records( txt_or_lines )
  lines = if txt_or_lines.is_a?( String )
            ## todo/fix: use ParserHelper read_lines !!! ????
            txt_or_lines.each_line.map { |line| line.strip }
                        .reject { |line| line.empty? || line.start_with?( '#' ) }
          else
            txt_or_lines
          end

  lines.each_with_object( [] ) do |line, recs|
    values = line.split( '|' ).map { |value| value.strip }

    title = values[0]
    next if title.nil? || title.empty?

    ## note: quick hack - auto-generate key, that is,
    ##   downcase and remove everything but the letters a to z
    ##   (incl. digits, spaces, dots, accented chars, etc.)
    key      = title.downcase.gsub( /[^a-z]/, '' )
    synonyms = values.size > 1 ? values[1..-1].join( '|' ) : nil

    recs << Record.new( key, title, synonyms )
  end
end
56
+
57
+
58
## records_or_mapping - one of
##    1) text block/string       => auto-converted to records (see build_records)
##    2) array of lines/strings  => auto-converted to records (see build_records)
##    3) array of records
##    4) "custom" mapping hash table (title/name => record)
## tag - tag name used for the placeholders e.g. team, brewery, etc.
def initialize( records_or_mapping, tag )
  ## for convenience allow easy (auto-)convert text (lines) to records
  text_input  = records_or_mapping.is_a?( String )
  lines_input = records_or_mapping.is_a?( Array ) && records_or_mapping[0].is_a?( String )
  records_or_mapping = build_records( records_or_mapping )   if text_input || lines_input

  ## build mapping lookup table
  if records_or_mapping.is_a?( Hash )
    @known_titles = build_title_table_for_mapping( records_or_mapping )
  else   ## assume array of records
    @known_titles = build_title_table_for_records( records_or_mapping )
  end

  ## build lookup hash by record (e.g. team/club/etc.) key
  ##   note: for a hash use the values (assumed to be all records - might include duplicates)
  records = records_or_mapping.is_a?( Array ) ? records_or_mapping : records_or_mapping.values

  @records = records.each_with_object( {} ) { |rec, by_key| by_key[ rec.key ] = rec }

  ## todo: rename tag to attrib or attrib_name - why ?? why not ???
  @tag = tag     # e.g. tag name use for @@brewery@@ @@team@@ etc.
end
85
+
86
+
87
+
88
## Replace all known titles in line (in-place) with key markers;
##  keeps calling map_title_for! until it reports no more matches.
##  Always returns nil.
def map_titles!( line )   ## rename to just map! - why?? why not???
  while map_title_for!( @tag, line, @known_titles )
    ## nothing to do here - map_title_for! changes the line as a side effect
  end
end
93
+
94
## Look up the record for the first key marker in line
##  (the marker gets replaced in-place by the upcased tag e.g. [TEAM]).
##  Returns the record or nil.
def find_rec!( line )
  find_rec_for!( @tag, line, @records )
end

## Look up the records for all key markers in line
##  (the markers get replaced in-place by numbered tags e.g. [TEAM1], [TEAM2], etc.)
##  note: plural! - always returns an array (might be empty);
##         stops at the first lookup returning nil
def find_recs!( line )
  recs    = []
  counter = 1

  while (rec = find_rec_for!( "#{@tag}#{counter}", line, @records ))
    recs << rec
    counter += 1
  end

  recs
end
110
+
111
+
112
+ private
113
## Build the known titles table for a "custom" mapping hash table.
##
## mapping - hash of title/name => record (record must respond to key)
##
## Returns an array of MappingStruct entries
##  sorted by title length (largest goes first - best match).
def build_title_table_for_mapping( mapping )
  entries = mapping.map do |title, rec|
    ## note: just use "standard" regex escape (e.g. no extras for umlauts,accents,etc.)
    MappingStruct.new( rec.key, title, title.length, Regexp.escape( title ) )
  end

  entries.sort { |l,r| r.length <=> l.length }
end
130
+
131
## Build the known titles table (incl. synonyms) for an array of records.
##
## records - array of records; every record must respond to
##             key, title and synonyms (|-separated string or nil);
##             code is optional and only used if defined
##
## Returns an array of MappingStruct entries - one for every title,
##  synonym, title/synonym w/o subtitle and code -
##  sorted by title length (largest goes first - best match) e.g.
##
##   [[ 'wolfsburg', 'VfL Wolfsburg'],
##    [ 'augsburg',  'FC Augsburg'],
##    [ 'augsburg',  'Augi2'],
##    [ 'augsburg',  'Augi3' ],
##    [ 'stuttgart', 'VfB Stuttgart']]
def build_title_table_for_records( records )
  known_titles = []

  records.each_with_index do |rec, index|
    title_candidates = [rec.title]
    title_candidates += rec.synonyms.split( '|' )   if rec.synonyms && !rec.synonyms.empty?

    ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
    #    make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
    titles = []
    title_candidates.each do |t|
      titles << t
      next unless t =~ /\(.+\)/

      # remove/delete subtitles
      #   note: strip leading n trailing whitespaces too!
      #     -- todo: add squish or something if () is inline e.g. leaves two spaces?
      extra_title = t.gsub( /\(.+\)/, '' ).strip

      ## note: skip if nothing is left e.g. for (JPN);
      ##   an empty pattern matches everywhere and
      ##   the title mapping would never come to an end
      titles << extra_title   unless extra_title.empty?
    end

    titles.each do |t|
      ## note: escape for regex plus allow subs for special chars/accents
      known_titles << MappingStruct.new( rec.key, t, t.length, title_esc_regex( t ) )
    end

    logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"

    ## note: only include code field - if defined
    if rec.respond_to?( :code ) && rec.code && !rec.code.empty?
      ## note: no variants allowed for now - match the code literally
      ##   (that is, escape regex special chars e.g. the dot in A.C)
      known_titles << MappingStruct.new( rec.key, rec.code, rec.code.length, Regexp.escape( rec.code ) )
    end
  end

  ## note: sort here by length (largest goes first - best match)
  known_titles.sort { |l,r| r.length <=> l.length }
end
196
+
197
+
198
+
199
## Replace the first match of the first matching mapping in line (in-place)
##  with the key marker @@oo{key}oo@@ (plus one trailing space).
##
## tag      - tag name (used for the debug message only)
## line     - text to search; gets changed on match
## mappings - array of mappings (must respond to key and pattern);
##              get tried in the order passed in
##
## Returns true if a title got replaced, false otherwise.
def map_title_for!( tag, line, mappings )
  mappings.each do |mapping|
    ## nb: \b does NOT include space or newline for word boundary (only alphanums e.g. a-z0-9)
    ##   (thus add it, allows match for Benfica Lis. for example - note . at the end)
    ##
    ## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
    ##
    ## wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
    re = /\b#{mapping.pattern}(\b| |\t|$)/
    next unless line =~ re

    logger.debug " match for #{tag.downcase} >#{mapping.key}< >#{mapping.pattern}<"
    # make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
    line.sub!( re, "@@oo#{mapping.key}oo@@ " )   # NB: add one space char at end
    return true   # break out after first match (do NOT continue)
  end

  false
end
218
+
219
+
220
## Look up the record for the first key marker (@@oo{key}oo@@) in line;
##  the marker gets replaced in-place by the upcased tag e.g. [TEAM].
##
## tag     - tag name e.g. team, team1, etc.
## line    - text to search; gets changed on match
## records - lookup hash table (key => record)
##
## Returns the record for the key (nil if the key is unknown)
##  or nil if there is no marker in line.
def find_rec_for!( tag, line, records )
  # e.g. everything in @@ .... @@ (use non-greedy +? plus all chars but not @, that is [^@])
  marker = /@@oo([^@]+?)oo@@/

  m = marker.match( line )
  return nil unless m

  key = m[1]
  logger.debug " #{tag.downcase}: >#{key}<"

  line.sub!( marker, "[#{tag.upcase}]" )

  records[ key ]   ## note: map key to record (using records hash table mapping)
end # method find_rec_for!
234
+
235
+
236
+ ####
237
+ # title helper cut-n-paste copy from TextUtils
238
+ ## see https://github.com/textkit/textutils/blob/master/textutils/lib/textutils/helper/title_helper.rb
239
## Convert a title into a regex pattern (source string) that
##  1) matches the title literally (regex special chars get escaped) and
##  2) allows alternative spellings for dashes, umlauts and accented chars
##       e.g. Blau-Weiß Linz  =>  Blau(-| )Wei(ß|ss) Linz
##
## title_unescaped - title as is e.g.
##                     Benfica Lis.
##                     Club Atlético Colón (Santa Fe)
##                     Bauer Anton (????)
##
## Returns the pattern as a (new) string.
def title_esc_regex( title_unescaped )

  ## escape regex special chars e.g.
  #  . to \. and
  #  ( to \(
  #  ) to \)
  #  ? to \?  -- zero or one
  #  * to \*  -- zero or more
  #  + to \+  -- one or more
  #  $ to \$  -- end of line
  #  ^ to \^  -- start of line etc.
  #
  #  plus [ ] { } | and \ - otherwise titles with brackets and friends
  #    get (mis)read as regex syntax or raise a RegexpError
  #
  ## NB: cannot use Regexp.escape! will escape space '' to '\ '
  ##  title = Regexp.escape( title_unescaped )
  ##
  ## note: all chars get escaped in a single pass, thus,
  ##   the backslashes added for escaping never get escaped again
  title = title_unescaped.gsub( /[.()?*+$\^\[\]{}|\\]/ ) { |char| "\\#{char}" }

  ## match accented char with or without accents
  ##  add (ü|ue) etc.
  ## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss

  ## todo: add some more
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references  for more
  ##
  ##  reuse for all readers!

  alternatives = [
    ['-', '(-| )'],    ## e.g. Blau-Weiß Linz
    ['æ', '(æ|ae)'],   ## e.g.
    ['Ä', '(Ä|Ae)'],   ## e.g. Ärzte
    ['ä', '(ä|ae)'],   ## e.g.
    ['Ö', '(Ö|Oe)'],   ## e.g. Österreich
    ['ö', '(ö|oe)'],   ## e.g. Mönchengladbach
    ['ß', '(ß|ss)'],   ## e.g. Blau-Weiß Linz
    ['Ü', '(Ü|Ue)'],   ## e.g. Überlingen
    ['ü', '(ü|ue)'],   ## e.g.

    ['á', '(á|a)'],    ## e.g. Bogotá, Sársfield
    ['ã', '(ã|a)'],    ## e.g. São Paulo
    ['ç', '(ç|c)'],    ## e.g. Fenerbahçe
    ['é', '(é|e)'],    ## e.g. Vélez
    ['ê', '(ê|e)'],    ## e.g. Grêmio
    ['ï', '(ï|i)'],    ## e.g. El Djazaïr
    ['ñ', '(ñ|n)'],    ## e.g. Porteño
    ['ň', '(ň|n)'],    ## e.g. Plzeň
    ['ó', '(ó|o)'],    ## e.g. Colón
    ['ō', '(ō|o)'],    ## e.g. Tōkyō
    ['ș', '(ș|s)'],    ## e.g. Bucarești
    ['ú', '(ú|u)']     ## e.g. Fútbol
  ]

  ### fix/todo: check for dot+space e.g. . and make dot optional
  ##
  # e.g. make dot (.) optional plus allow alternative optional space e.g.
  #  -- for U.S.A. => allow USA or U S A
  #
  ## e.g. U. de G. or U de G or U.de G. ??
  ##   collect some more (real-world) examples first!!!!!

  ## note: must run after the escaping - the alternatives use ( | ) as regex syntax
  alternatives.each do |char, pattern|
    title = title.gsub( char, pattern )
  end

  title
end
317
+
318
+ end # class MapperV2
319
+ end # module SportDb
@@ -0,0 +1,23 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+
5
## Thin convenience wrapper around MapperV2 using the fixed tag 'team'.
class TeamMapper
  ## records_or_mapping - passed on to MapperV2 as is
  def initialize( records_or_mapping )
    @mapper = MapperV2.new( records_or_mapping, 'team' )
  end

  ## Replace team names in line with key markers (delegates to MapperV2#map_titles!).
  def map_teams!( line )
    @mapper.map_titles!( line )
  end

  ## Look up a single team (delegates to MapperV2#find_rec!).
  def find_team!( line )
    @mapper.find_rec!( line )
  end

  ## Look up all teams (delegates to MapperV2#find_recs!).
  ##  note: plural! - returns an array
  def find_teams!( line )
    @mapper.find_recs!( line )
  end
end # class TeamMapper
22
+
23
+ end # module SportDb