sportdb-formats 2.0.2 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,119 +0,0 @@
1
-
2
module SportDb
module Import

##
# Reads event (league + season) summary records from CSV text.
#
# Columns read per row (all looked up by header name):
#   League           - passed to catalog.leagues.find!
#   Season or Year   - passed to Season.parse
#   Dates            - optional; one date or a "start - end" range
#   Clubs or Teams   - optional; number of teams (non-digits are dropped)
#   Matches          - optional; number or sum e.g. "132 + 63 Play-off-Spiele"
#   Goals            - optional; number of goals (non-digits are dropped)
#
# Note: the reader prints progress / debug output to stdout and
#  terminates the process (exit 1) on a malformed dates column.
class EventInfoReader
  ## shortcut convenience helper
  def catalog() Import.catalog; end


  ## read the (utf-8) file in path and parse it; returns an array of EventInfo records
  def self.read( path )
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## parse the csv text in txt; returns an array of EventInfo records
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## returns an array of EventInfo records - one record for every csv row
  def parse
    recs = []

    parse_csv( @txt ).each do |row|
      season = Season.parse( row['Season'] || row['Year'] )
      league = catalog.leagues.find!( row['League'] )

      ## note: dates is empty, [start_date] or [start_date, end_date]
      dates  = parse_dates( row['Dates'], league: league, season: season )

      recs << EventInfo.new( league:     league,
                             season:     season,
                             start_date: dates[0],
                             end_date:   dates[1],
                             teams:      parse_count( row['Clubs'] || row['Teams'] ),
                             matches:    parse_matches( row['Matches'] ),
                             goals:      parse_count( row['Goals'] )
                           )
    end # each row

    recs
  end # method parse


  private

  ## parse the dates column into an array with zero, one or two dates
  ##   note: a missing, empty or whitespace-only column results in no dates
  ##
  ##  the first date is resolved relative to Jan 1 of the season start year,
  ##  the second relative to Jan 1 of the season end year (or start year if no end year)
  def parse_dates( dates_col, league:, season: )
    return []  if dates_col.nil? || dates_col.strip.empty?

    dates_col = dates_col.gsub( /[ ]{2,}/, ' ' )   ## squish/fold spaces

    puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"

    ### todo/check: check what parts "Aug 15" return ???
    ###   short form for "Aug 15 -" - works?

    ## todo/fix!!! - check EventInfo.include?
    ##   now allow dates with only start_date too!! (WITHOUT end_date)
    parts = dates_col.split( /[ ]*[–-][ ]*/ )   ## note: allow en dash (–) or hyphen (-)

    unless parts.size == 1 || parts.size == 2
      puts "!! ERROR - expected date range / period - one or two dates; got #{parts.size}:"
      pp dates_col
      pp parts
      exit 1
    end

    pp parts
    start_years = [season.start_year, season.end_year || season.start_year]
    dates = parts.each_with_index.map do |part, i|
              DateFormats.parse( part, start: Date.new( start_years[i], 1, 1 ), lang: 'en' )
            end
    pp dates

    if dates.size == 2
      ## assert/check if period is less than 365 days for now
      diff = dates[1].to_date.jd - dates[0].to_date.jd
      puts "#{diff}d"
      if diff > 365
        puts "!! ERROR - date range / period assertion failed; expected diff < 365 days"
        exit 1
      end
    end

    dates
  end

  ## parse a count column; returns an integer or nil if no digits present
  ##   note: remove (and allow) all non-digits e.g. 370 goals, 20 clubs, etc.
  def parse_count( col )
    digits = col.to_s.gsub( /[^0-9]/, '' )
    digits.empty? ? nil : digits.to_i
  end

  ## parse the matches (played) column; returns an integer or nil
  ##   note: support additions e.g. 132 + 63 Play-off-Spiele => 195
  ##         (for now only additions are supported)
  def parse_matches( col )
    expr = col.to_s.gsub( /[^0-9+]/, '' )
    return nil  if expr.empty?

    expr.split( '+' ).sum { |str| str.to_i }
  end
end # class EventInfoReader


end   ## module Import
end   ## module SportDb
119
-
@@ -1,289 +0,0 @@
1
- ###
2
- # todo - based on ClubReader
3
- # share GeoReader or BaseReader or such for both
4
- # plus maybe for PlayerReader too!!!
5
- #
6
- # fix/todo/cleanup - move alt_names_auto from reader to indexer!!!!
7
- # indexer now handles unaccent (variants) etc.
8
-
9
module SportDb
module Import


##
# Reads ground (stadium) records from an outline text.
#
# The text gets parsed by OutlineReader into nodes:
#   - headings (:h1 to :h6)  - kept in a headings stack;
#       level 1 is the country, level 2 gets used as the geo tree (region)
#   - paragraphs (:p)        - holding one or more lines;
#       a line starting with a pipe (|) adds alternative names to the last record,
#       a line with an address marker (see ADDR_MARKER_RE) sets the address of the last record,
#       any other line starts a new Ground record
#
# Note: the reader prints progress / debug output to stdout and
#  terminates the process (exit 1) on format errors.
class GroundReader

  def world() Import.world; end


  def self.read( path )  ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end


  ## pattern for checking for address line e.g.
  ##   use just one style / syntax - why? why not?
  ##  Fischhofgasse 12 ~ 1100 Wien or
  ##  Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
  ##  Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
  ADDR_MARKER_RE = %r{ (?: ^|[ ] )               # space or beginning of line
                       (?: ~ | /{2,} | \+{2,} )
                       (?: [ ]|$)                # space or end of line
                     }x


  ## returns an array of Ground records (in order of appearance)
  def parse
    recs     = []
    last_rec = nil
    headings = []   ## headings stack

    OutlineReader.parse( @txt ).each do |node|
      type = node[0]

      if [:h1,:h2,:h3,:h4,:h5,:h6].include?( type )
        _update_headings( headings, type[1].to_i, node[1] )
      elsif type == :p   ## paragraph with (text) lines
        node[1].each do |line|
          if line.start_with?( '|' )
            ## assume continuation with line of alternative names
            _assert_last_rec( last_rec, line )

            ## note: skip leading pipe
            values = line[1..-1].split( '|' )    # names - allow/use pipe(|)
            last_rec.alt_names += values.map { |value| _norm( value ) }   ## squish/strip etc.
          elsif line =~ ADDR_MARKER_RE
            _assert_last_rec( last_rec, line )

            ## squish line here - why? why not?
            last_rec.address = _squish( line )
          else
            last_rec = _parse_rec( line, headings )
            recs << last_rec
          end
        end # each line (in paragraph)
      else
        puts "** !!! ERROR !!! [ground reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end # method parse


  #######################
  ### helpers

  ## split a geo tree string into its (stripped) parts
  ##   e.g. León › Guanajuato  =>  ['León', 'Guanajuato']
  def split_geo( str )
    ## strip and squish (white)spaces first
    str = _squish( str )

    ## split into geo tree
    geos = str.split( /[<>‹›]/ )         ## note: allow > < or › ‹
    geos.map { |geo| geo.strip }         ## remove leading and trailing whitespaces
  end

  ## norm(alize) helper - squish (spaces)
  ##   and remove dollars ($$$)
  ##   and remove leading and trailing spaces
  def _norm( str )
    ## only extra clean-up of dollars for now ($$$)
    _squish( str.gsub( '$', '' ) )
  end

  ## fold runs of spaces, tabs and non-breaking spaces into a single space
  ##   and remove leading and trailing whitespace
  def _squish( str )
    str.gsub( /[ \t\u00a0]+/, ' ' ).strip
  end


  private

  ## update the headings stack (in-place) for a heading on the given level
  ##   note: a heading made up of question marks only (? or ?? etc.)
  ##         leaves the level empty
  def _update_headings( headings, heading_level, heading )
    puts "heading #{heading_level} >#{heading}<"

    ## 1) first pop headings if present
    headings.pop   while headings.size+1 > heading_level

    ## 2) add missing (hierarchy) level if
    while headings.size+1 < heading_level
      ## todo/fix: issue warning about "skipping" hierarchy level
      puts "!!! warn [ground reader] - skipping hierarchy level in headings "
      headings.push( nil )
    end

    unless heading =~ /^\?+$/    ## note: use ? or ?? or ??? to reset level to nil
      ## note: if level is 1 assume country for now
      if heading_level == 1
        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = world.countries.parse( heading )
        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [ground reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end

        headings.push( country.key )
      else
        ## quick hack:
        ##   remove known fill/dummy words incl:
        ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
        ##
        ##  use geo tree long term with alternative names - why? why not?
        words = ['Provincia']
        words.each { |word| heading = heading.gsub( word, '' ) }

        headings.push( heading.strip )
      end

      ## assert that hierarchy level is ok
      ##   note: report the size of the headings stack (NOT the length of the heading string)
      if headings.size != heading_level
        puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
        exit 1
      end
    end

    pp headings
  end


  ## continuation lines (alt names or address) require a record to attach to
  def _assert_last_rec( last_rec, line )
    return  if last_rec

    puts "!!! error [ground reader] - alt names / address line without preceding ground record:"
    pp line
    exit 1
  end


  ## build a Ground record from a comma-separated line
  ##  e.g.   name | alt name, year, capacity, city (district) › region
  def _parse_rec( line, headings )
    values = line.split( ',' )

    rec = Ground.new

    col = values.shift    ## get first item
    ## note: allow optional alt names for convenience with required canonical name
    names = col.split( '|' ).map { |name| _norm( name ) }   ## squish/strip etc.

    rec.name  = names[0]       ## canonical name (global unique "beautiful/long" name)
    alt_names = names[1..-1]   ## optional (inline) alt names
    ## note: add optional (inline) alternate names if present
    rec.alt_names += alt_names  if alt_names.size > 0

    ## check if canonical name includes years e.g. (2011-) or similar
    ##   note: the years are only recorded - the canonical name itself is kept as is
    ##   todo/check: strip years from name and add stripped name to (alt) names - why? why not?
    if rec.name =~ /\(([0-9]{4})-\)/                     ## e.g. (2014-)
      rec.year     = $1.to_i
    elsif rec.name =~ /\(-([0-9]{4})\)/                  ## e.g. (-2014)
      rec.year_end = $1.to_i
    elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/        ## e.g. (2011-2014)
      rec.year     = $1.to_i
      rec.year_end = $2.to_i
    else
      ## todo/check: warn about unknown year format
    end

    ## todo/check - check for unknown format values
    ##   e.g. too many values, duplicate years, etc.
    ##   check for overwriting, etc.

    ## strip and squish (white)spaces
    values.map { |value| _squish( value ) }.each do |value|
      if value =~ /^\d{4}$/         # e.g. 1904
        if rec.year
          puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
          pp rec
          exit 1
        end
        rec.year = value.to_i
      elsif value =~ /^[0-9_]+$/    # e.g. 17_500
        ## skip capacity for now
      else
        ## assume city / geo tree
        geos = split_geo( value )
        city = geos[0]
        ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
        if city =~ /\((.+?)\)/     ## note: use non-greedy (?) match
          rec.district = $1.strip
          city = city.gsub( /\(.+?\)/, '' ).strip
        end
        rec.city = city

        ## cut-off city and keep the rest (of geo tree)
        rec.geos = geos[1..-1]   if geos.size > 1
      end
    end

    ###############
    ## use headings text for geo tree

    ## 1) add country if present
    if headings.size > 0 && headings[0]
      rec.country = world.countries.find( headings[0] )
    else
      ## make it an error - why? why not?
      puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
      exit 1
    end

    ## 2) check geo tree with headings hierarchy
    if headings.size > 1 && headings[1]
      geos = split_geo( headings[1] )
      if rec.geos
        if rec.geos[0] != geos[0]
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
          exit 1
        end
        if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
          puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
          exit 1
        end
      else
        ## add missing region (state/province) from headings hierarchy
        rec.geos = geos
      end
    end

    rec
  end

end # class GroundReader


end   ## module Import
end   ## module SportDb
@@ -1,176 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module SportDb

## shared "higher-level" outline reader
##  todo: add CountryOutlineReader - why? why not?
##
## Splits an outline text (as parsed by OutlineReader) into sections -
##  one section for every heading 1 in the format
##    <league> <season>[, <stage>]   e.g.   Österr. Bundesliga 2018/19, Regular Season
##  with all following paragraph lines collected in the section.
##
## Note: the reader prints progress / debug output to stdout and
##  terminates the process (exit 1) on format errors.
class LeagueOutlineReader   ## todo/check - rename to LeaguePageReader / LeaguePageOutlineReader - why? why not?

  ## split into league + season
  ##  e.g. Österr. Bundesliga 2015/16   ## or 2015-16
  ##       World Cup 2018
  LEAGUE_SEASON_HEADING_RE = %r{^
         (?<league>.+?)     ## non-greedy
            \s+
         (?<season>\d{4}
            (?:[\/-]\d{1,4})?     ## optional 2nd year in season
         )
            $}x


  ## normalize stage names e.g. downcase and remove all non a-z chars (e.g. space, dash, etc.)
  def self.norm_stage( name )
    name.downcase.gsub( /[^a-z]/, '' )
  end

  ## note: holds the normalized names (see norm_stage)
  KNOWN_STAGES = [
   'Regular Season',
   'Regular Stage',
   'Championship Round',
   'Championship Playoff',   # or Championship play-off
   'Relegation Round',
   'Relegation Playoff',
   'Play-offs',
   'Playoff Stage',
   'Grunddurchgang',
   'Finaldurchgang - Qualifikationsgruppe',
   'Finaldurchgang - Qualifikation',
   'Finaldurchgang - Meistergruppe',
   'Finaldurchgang - Meister',
   'EL Play-off',
   'Europa League Play-off',
   'Europa-League-Play-offs',
   'Europa League Finals',
   'Playoffs - Championship',
   'Playoffs - Europa League',
   'Playoffs - Europa League - Finals',
   'Playoffs - Relegation',
   'Playoffs - Challenger',
   'Finals',
   'Match 6th Place',  # e.g. Super League Greece 2012/13

   'Apertura',
   'Apertura - Liguilla',
   'Clausura',
   'Clausura - Liguilla',

  ].map { |name| norm_stage( name ) }.freeze


  def self.read( path, season: nil )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt, season: season )
  end

  def self.parse( txt, season: nil )
    new( txt ).parse( season: season )
  end


  def initialize( txt )
    @txt = txt
  end

  ## returns an array of sec(tion)s; every section is a hash with the keys
  ##   :league  - league record (as returned by catalog.leagues.find!)
  ##   :season  - season (as returned by Season.parse)
  ##   :stage   - stage name (string) or nil if not present
  ##   :lines   - array of all lines in the section
  ##
  ##  season - optional filter; a single season, an array or a range of seasons
  def parse( season: nil )
    secs = []     # sec(tion)s

    ## pass 1 - collect sections
    OutlineReader.parse( @txt ).each do |node|
      case node[0]
      when :h1
        ## check for league (and stage) and season
        heading = node[1]
        values  = split_league( heading )
        ## note: values is empty for an empty heading - fall through to error (no match)
        m = values[0].to_s.match( LEAGUE_SEASON_HEADING_RE )
        if m
          puts "league >#{m[:league]}<, season >#{m[:season]}<"

          secs << { league: m[:league],
                    season: m[:season],
                    stage:  values[1],     ## note: defaults to nil if not present
                    lines:  []
                  }
        else
          puts "** !!! ERROR - cannot match league and season in heading; season missing?"
          pp heading
          exit 1
        end
      when :p   ## paragraph with (text) lines
        lines = node[1]
        ## note: skip lines if no heading seen
        if secs.empty?
          puts "** !!! WARN - skipping lines (no heading):"
          pp lines
        else
          ## todo/check: unroll paragraphs into lines or pass along paragraphs - why? why not?
          secs[-1][:lines] += lines
        end
      else
        puts "** !!! ERROR - unknown line type; for now only heading 1 for leagues supported; sorry:"
        pp node
        exit 1
      end
    end


    ## pass 2 - filter seasons if filter present
    if season
      filter = norm_seasons( season )
      secs = secs.select do |sec|
               keep = filter.include?( Season.parse( sec[:season] ).key )
               puts " skipping season >#{sec[:season]}< NOT matched by filter"   unless keep
               keep
             end
    end

    ## pass 3 - check & map; replace inline (string with data struct record)
    secs.each do |sec|
      sec[:season] = Season.parse( sec[:season] )
      sec[:league] = catalog.leagues.find!( sec[:league] )

      check_stage( sec[:stage] )   if sec[:stage]   ## note: only check for now (no remapping etc.)
    end

    secs
  end # method parse



  def catalog() Import.catalog; end    ## shortcut convenience helper


  ## returns an array of season keys for a single season, an array or a range of seasons
  def norm_seasons( season_or_seasons )
    seasons = case season_or_seasons
              when Array then season_or_seasons       # is it an array already
              when Range then season_or_seasons.to_a  # e.g. Season(1999)..Season(2001) or such
              else            [season_or_seasons]     ## assume - single entry - wrap in array
              end

    seasons.map { |season| Season( season ).key }
  end


  def split_league( str )   ## todo/check: rename to parse_league(s) - why? why not?
    ## split into league / stage / ... e.g.
    ##  => Österr. Bundesliga 2018/19, Regular Season
    ##  => Österr. Bundesliga 2018/19, Championship Round
    ##  etc.
    values = str.split( /[,<>‹›]/ )    ## note: allow , > < or › ‹ for now
    values.map { |value| value.strip }   ## remove leading and trailing whitespaces
  end


  ## check if the stage name is known; prints an error and exits (1) if not
  def check_stage( name )
    return  if KNOWN_STAGES.include?( self.class.norm_stage( name ) )   ## everything ok

    puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
    exit 1
  end

end # class LeagueOutlineReader

end # module SportDb