sportdb-formats 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,90 +0,0 @@
1
- #####################
2
- # helpers for parsing & finding match status e.g.
3
- # - cancelled / canceled
4
- # - awarded
5
- # - abandoned
6
- # - replay
7
- # etc.
8
-
9
-
10
module SportDb

  ### todo/fix: move Status inside Match struct - why? why not?

  ##
  # "Enum"-like namespace for the canonical match status values.
  #
  # StatusParser.parse returns one of these constants (or nil).
  # The values are frozen so that the shared constants cannot be
  # modified in place by accident.
  class Status
    # note: use a class as an "enum"-like namespace for now - why? why not?
    #   move class into Match e.g. Match::Status - why? why not?
    CANCELLED = 'CANCELLED'.freeze   # canceled (US spelling), cancelled (UK spelling) - what to use?
    AWARDED   = 'AWARDED'.freeze
    POSTPONED = 'POSTPONED'.freeze
    ABANDONED = 'ABANDONED'.freeze
    REPLAY    = 'REPLAY'.freeze
  end # class Status


  ##
  # Helpers for parsing & finding a match status e.g.
  #   cancelled / canceled, awarded, postponed, abandoned, replay
  class StatusParser

    ## status patterns - checked in order; first match wins
    ##   note: anchored with \A (start of string - NOT start of any line)
    ##         and case-insensitive; only the start of the text must match
    ##   note: english usage - cancelled (in UK), canceled (in US)
    PATTERNS = [
      [/\A(?:cancelled|canceled|can\.)/i, Status::CANCELLED],
      [/\A(?:awarded|awd\.)/i,            Status::AWARDED],
      [/\Apostponed/i,                    Status::POSTPONED],
      [/\A(?:abandoned|abd\.)/i,          Status::ABANDONED],
      [/\Areplay/i,                       Status::REPLAY],
    ].freeze

    ##
    # Maps a status text to its Status constant.
    #
    # @param str [String, nil]  text to check e.g. "cancelled" or "awd."
    # @return [String, nil]     matching Status constant;
    #                           nil if no match found (or str is nil)
    def self.parse( str )
      PATTERNS.each do |pattern, status|
        return status if pattern.match?( str )
      end
      nil   ## no match
    end


    ## "protected" text run block e.g. [cancelled]
    RUN_RE = /\[
                 (?<text>[^\]]+)
              \]
             /x

    ##
    # Finds the first text run e.g. [...] holding a known status.
    #
    # note: changes the passed-in line in place (thus, the bang!) -
    #   the matching run gets replaced with [STATUS.<status>]
    #
    # @param line [String]   line to search (and to change in place)
    # @return [String, nil]  Status constant found; nil otherwise
    #                        (line stays untouched if nil)
    def self.find!( line )
      ## for now check all "protected" text run blocks e.g. []
      rest = line
      while (m = rest.match( RUN_RE ))
        status = parse( m[:text].strip )
        if status
          ## note: m[0] is the complete run incl. brackets
          line.sub!( m[0], "[STATUS.#{status}]" )
          return status
        end
        rest = m.post_match   ## keep on processing rest of line
      end

      nil
    end # method find!
  end # class StatusParser

end # module SportDb
90
-
@@ -1,59 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
module Datafile      # note: keep Datafile in its own top-level module/namespace for now - why? why not?

  ##
  # Reads the complete file as text.
  #
  # @param path [String]  file to read
  # @return [String]      file content
  def self.read( path )   ## todo/check: use as a shortcut helper - why? why not?
    ## note: always assume utf-8 for now!!!
    File.open( path, 'r:utf-8' ) { |f| f.read }
  end


  ########################
  ## todo/fix: turn into Datafile::Bundle.new and Bundle#write/save -why? why not?

  ##
  # Collects text (headers, comments, datafiles) in a buffer;
  # the buffer gets written to path on close.
  class Bundle
    ## __END__ marker on a line of its own (plus everything that follows)
    ##   note: m-multiline flag lets the dot (.) match newlines too
    ##   note: a mention of __END__ inside a line (e.g. in a comment)
    ##         does NOT cut off the text
    END_MARKER_RE = /^[ \t]*__END__[ \t]*\r?$.*/m

    # @param path [String]  output file (written on close)
    def initialize( path )
      @path = path
      @buf  = String.new('')
    end

    ##
    # Adds to the bundle; every entry is followed by a blank line.
    #
    # @param value [Array<String>, String]
    #    array  - assume datafiles (file paths);
    #             gets read in with everything from __END__ on cut off
    #    string - assume text (e.g. header, comments, etc.); added as is
    # @return [Bundle]  self - allows chaining e.g. bundle << header << datafiles
    def <<( value )
      if value.is_a?( Array )  ## assume array of datafiles (file paths)
        value.each do |datafile|
          ## todo/fix/check: move sub __END__ to Datafile.read and turn it always on - why? why not?
          append( strip_end( Datafile.read( datafile )))
        end
      else                     ## assume string (e.g. header, comments, etc.)
        append( value )
      end
      self
    end
    alias_method :write, :<<

    ## todo/fix/check: write only on close? or write on every write and use close for close?

    ##
    # Writes the collected text to path (utf-8); overwrites an existing file.
    def close
      File.open( @path, 'w:utf-8' ) { |f| f.write( @buf ) }
    end

    private

    ## add text plus blank line (separator) to the buffer
    def append( text )
      @buf << text
      @buf << "\n\n"
    end

    ## cut off the __END__ line and all text after it (if present)
    def strip_end( text )
      text.sub( END_MARKER_RE, '' )
    end
  end # class Bundle


  ##
  # Bundles up (concatenates) all datafiles into the single file at path.
  #
  # @param path [String]            output file
  # @param datafiles [#to_a]        datafiles (file paths) to add in order
  # @param header [String, nil]     optional text to start the bundle with
  def self.write_bundle( path, datafiles:, header: nil )
    bundle = Bundle.new( path )
    bundle.write( header )   if header
    bundle.write( datafiles.to_a )   ## note: reads in files and cuts off __END__ sections
    bundle.close
  end

end   # module Datafile
@@ -1,119 +0,0 @@
1
-
2
module SportDb
module Import

##
# Reads event info records (league + season with optional
# start & end dates plus team, match and goal counts)
# from text in the comma-separated values (csv) format.
#
# Columns used: League, Season (or Year), Dates,
#               Clubs (or Teams), Matches, Goals
class EventInfoReader
  def catalog() Import.catalog; end


  ## read text from file (always assumes utf-8) and parse
  def self.read( path )
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ##
  # @return [Array<EventInfo>]  one record per (csv) row
  #
  # note: prints progress info to stdout and
  #       exits (with status 1) on an invalid date range / period
  def parse
    recs = []

    parse_csv( @txt ).each do |row|
      season = Season.parse( row['Season'] || row['Year'] )
      league = catalog.leagues.find!( row['League'] )
      dates  = parse_dates( row['Dates'], league, season )

      recs << EventInfo.new( league:     league,
                             season:     season,
                             start_date: dates[0],
                             end_date:   dates[1],
                             teams:      parse_count( row['Clubs'] || row['Teams'] ),
                             matches:    parse_matches( row['Matches'] ),
                             goals:      parse_count( row['Goals'] )
                           )
    end # each row

    recs
  end  # method parse


  private

  ##
  # Parses the dates column e.g. "Jul 16 - May 17" or "Aug 15".
  #
  # @return [Array]  no date (empty), start date only or start and end date
  def parse_dates( dates_col, league, season )
    ## do nothing; no dates - keep dates array empty
    return []  if dates_col.nil? || dates_col.empty?

    dates_col = dates_col.gsub( /[ ]{2,}/, ' ' )   ## squish/fold spaces

    puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"

    ### todo/check: check what parts "Aug 15" return ???
    ###               short form for "Aug 15 -" - works?

    ## todo/fix!!! - check EventInfo.include?
    ##    now allow dates with only start_date too!! (WITHOUT end_date)
    parts = dates_col.split( /[ ]*[–-][ ]*/ )
    unless [1, 2].include?( parts.size )
      puts "!! ERROR - expected date range / period - one or two dates; got #{parts.size}:"
      pp dates_col
      pp parts
      exit 1
    end

    pp parts
    ## note: dates come without year; use the season's start year for
    ##        the start date and the end year (if present) for the end date
    years = [season.start_year, season.end_year || season.start_year]
    dates = parts.each_with_index.map do |part, i|
      DateFormats.parse( part, start: Date.new( years[i], 1, 1 ), lang: 'en' )
    end
    pp dates

    if dates.size == 2
      ## assert/check if period is no more than 365 days for now
      diff = dates[1].to_date.jd - dates[0].to_date.jd
      puts "#{diff}d"
      if diff > 365
        puts "!! ERROR - date range / period assertion failed; expected diff <= 365 days"
        exit 1
      end
    end

    dates
  end

  ##
  # Parses a count column; removes (and allows) all non-digits
  #   e.g. 370 goals, 20 clubs, etc.
  #
  # @return [Integer, nil]  nil if column is missing or holds no digits
  def parse_count( col )
    digits = col && col.gsub( /[^0-9]/, '' )
    (digits.nil? || digits.empty?) ? nil : digits.to_i
  end

  ##
  # Parses the matches (played) column; supports additions e.g.
  #   132 + 63 Play-off-Spiele  =>  195
  #
  # @return [Integer, nil]  nil if column is missing or holds no digits / plus
  def parse_matches( col )
    digits = col && col.gsub( /[^0-9+]/, '' )
    return nil  if digits.nil? || digits.empty?

    ## note: for now only supports additions;
    ##        a single (integer) number is a sum of one
    digits.split( '+' ).sum { |str| str.to_i }
  end
end  # class EventInfoReader


end ## module Import
end ## module SportDb
119
-
@@ -1,289 +0,0 @@
1
- ###
2
- # todo - based on ClubReader
3
- # share GeoReader or BaseReader or such for both
4
- # plus maybe for PlayerReader too!!!
5
- #
6
- # fix/todo/cleanup - move alt_names_auto from reader to indexer!!!!
7
- # indexer now handles unaccent (variants) etc.
8
-
9
module SportDb
module Import


##
# Reads ground (stadium) records from text in the outline format:
#
# - headings build up the geo hierarchy - level 1 is the country
#     and level 2 the region (state/province)
# - paragraph lines hold the grounds - one line per ground
#     with comma-separated values: name(s), year, capacity, city / geo tree
# - a line starting with a pipe (|) adds alternative names
#     to the ground before
# - a line with an address marker (~, //, ++) sets the address
#     of the ground before
#
# note: prints progress info to stdout and exits (with status 1) on errors
class GroundReader

  def world() Import.world; end


  def self.read( path )  ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end


  ##  pattern for checking for address line e.g.
  ##   use just one style / syntax - why? why not?
  ##  Fischhofgasse 12 ~ 1100 Wien or
  ##  Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
  ##  Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
  ADDR_MARKER_RE = %r{ (?: ^|[ ] )     # space or beginning of line
                       (?: ~ | /{2,} | \+{2,} )
                       (?: [ ]|$)     # space or end of line
                     }x

  ## node types handled as headings
  HEADING_TYPES = [:h1,:h2,:h3,:h4,:h5,:h6].freeze


  ##
  # @return [Array<Ground>]  all ground records in reading order
  def parse
    recs     = []
    last_rec = nil
    headings = []   ## headings stack

    OutlineReader.parse( @txt ).each do |node|
      if HEADING_TYPES.include?( node[0] )
        ## note: heading level is the digit in the node type e.g. :h2 => 2
        _update_headings( headings, node[0][1].to_i, node[1] )
      elsif node[0] == :p   ## paragraph with (text) lines
        node[1].each do |line|
          if line.start_with?( '|' )
            ## assume continuation with line of alternative names
            _assert_last_rec( last_rec, line )
            ##  note: skip leading pipe
            values = line[1..-1].split( '|' )   # names - allow/use pipe(|)
            values = values.map { |value| _norm( value ) }   ## squish/strip etc.

            last_rec.alt_names += values
          elsif line =~ ADDR_MARKER_RE
            _assert_last_rec( last_rec, line )
            ## squish line here - why? why not?
            last_rec.address = _squish( line )
          else
            last_rec = _parse_ground( line, headings )
            recs << last_rec
          end
        end  # each line (in paragraph)
      else
        puts "** !!! ERROR !!! [ground reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end  # method parse

  #######################
  ### helpers

  def split_geo( str )
    ## assume city / geo tree
    ##  strip and squish (white)spaces
    #   e.g. León     › Guanajuato  => León › Guanajuato
    str = _squish( str )

    ## split into geo tree
    geos = str.split( /[<>‹›]/ )        ## note: allow > < or › ‹
    geos.map { |geo| geo.strip }        ## remove all (leading/trailing) whitespaces
  end

  ## norm(alize) helper - squish (spaces)
  ##    and remove dollars ($$$)
  ##    and remove leading and trailing spaces
  def _norm( str )
    ## only extra clean-up of dollars for now ($$$)
    _squish( str.gsub( '$', '' ) )
  end

  def _squish( str )
    str.gsub( /[ \t\u00a0]+/, ' ' ).strip
  end


  private

  ## continuation lines (alt names, address) require a ground record before
  def _assert_last_rec( last_rec, line )
    return  if last_rec

    puts "!!! error [ground reader] - continuation line without ground record before >#{line}<"
    exit 1
  end

  ##
  # Updates the headings stack (in place) for a heading of the level passed in.
  #   After the update the stack size equals the heading level
  #   (except for ? headings - see below).
  def _update_headings( headings, heading_level, heading )
    puts "heading #{heading_level} >#{heading}<"

    ## 1) first pop headings if present
    headings.pop   while headings.size+1 > heading_level

    ## 2) add missing (hierarchy) level if
    while headings.size+1 < heading_level
      ## todo/fix: issue warning about "skipping" hierarchy level
      puts "!!! warn [ground reader] - skipping hierarchy level in headings "
      headings.push( nil )
    end

    ## note: use ? or ?? or ??? to reset level to nil (keep level empty)
    unless heading =~ /^\?+$/
      ## note: if level is 1 assume country for now
      if heading_level == 1
        ## assume country in heading; allow all "formats" supported by parse e.g.
        ##   Österreich • Austria (at)
        ##   Österreich • Austria
        ##   Austria
        ##   Deutschland (de) • Germany
        country = world.countries.parse( heading )
        ## check country code - MUST exist for now!!!!
        if country.nil?
          puts "!!! error [ground reader] - unknown country >#{heading}< - sorry - add country to config to fix"
          exit 1
        end

        headings.push( country.key )
      else
        ## quick hack:
        ##   remove known fill/dummy words incl:
        ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
        ##
        ##   use geo tree long term with alternative names - why? why not?
        words = ['Provincia']
        words.each { |word| heading = heading.gsub( word, '' ) }

        headings.push( heading.strip )
      end

      ## assert that hierarchy level is ok
      if headings.size != heading_level
        puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
        exit 1
      end
    end

    pp headings
  end

  ##
  # Builds a ground record from a line with comma-separated values e.g.
  #   Generali Arena | Franz-Horr-Stadion, 2000, 17_500, Wien (Favoriten)
  #
  # @return [Ground]
  def _parse_ground( line, headings )
    values = line.split( ',' )

    rec = Ground.new

    col = values.shift    ## get first item
    ## note: allow optional alt names for convenience with required canonical name
    names = col.split( '|' )                     # names - allow/use pipe(|)
    names = names.map { |name| _norm( name ) }   ## squish/strip etc.

    alt_names = names[1..-1]   ## optional (inline) alt names

    rec.name = names[0]        # canonical name (global unique "beautiful/long" name)
    ## note: add optional (inline) alternate names if present
    rec.alt_names += alt_names  if alt_names.size > 0

    _parse_years_in_name( rec )

    ## todo/check - check for unknown format values
    ##   e.g. too many values, duplicate years, etc.
    ##   check for overwriting, etc.

    ## strip and squish (white)spaces
    #   e.g. León     › Guanajuato  => León › Guanajuato
    values.map { |value| _squish( value ) }.each do |value|
      if value =~ /^\d{4}$/      # e.g 1904
        if rec.year
          puts "!!! error -  year already set to #{rec.year} - CANNOT overwrite with #{value}:"
          pp rec
          exit 1
        end
        rec.year = value.to_i
      elsif value =~ /^[0-9_]+$/   # digits and underscores only e.g. 17_500
        ## skip capacity for now
      else
        ## assume city / geo tree
        _parse_city( rec, value )
      end
    end

    _add_headings_geo( rec, headings )

    rec
  end

  ##
  # Sets year / year_end if the name includes (2014-), (-2014) or (2011-2014).
  #
  # note: the name itself is kept as is (incl. the year in parentheses)
  #
  # NOTE(review): the name with the (...) part stripped was computed here
  #   before but never used; the comments suggest it was meant to get added
  #   to the (alt) names e.g. New York FC (2011-) => New York FC -
  #   confirm before changing the records returned
  def _parse_years_in_name( rec )
    return  unless rec.name =~ /\(.+?\)/    ## note: use non-greedy (?) match

    if rec.name =~ /\(([0-9]{4})-\)/                  ## e.g. (2014-)
      rec.year     = $1.to_i
    elsif rec.name =~ /\(-([0-9]{4})\)/               ## e.g. (-2014)
      rec.year_end = $1.to_i
    elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/     ## e.g. (2011-2014)
      rec.year     = $1.to_i
      rec.year_end = $2.to_i
    else
      ## todo/check: warn about unknown year format
    end
  end

  ## split value into geo tree - first is the city; keep the rest (if any)
  def _parse_city( rec, value )
    geos = split_geo( value )
    city = geos[0]
    ### check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
    if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
      rec.district = $1.strip
      city = city.gsub( /\(.+?\)/, '' ).strip
    end
    rec.city = city

    ## cut-off city and keep the rest (of geo tree)
    rec.geos = geos[1..-1]   if geos.size > 1
  end

  ##
  # Uses the headings for the geo tree:
  #   adds the country and adds (or checks) the region (state/province).
  def _add_headings_geo( rec, headings )
    ## 1) add country if present
    if headings[0]
      rec.country = world.countries.find( headings[0] )
    else
      ## make it an error - why? why not?
      puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
      exit 1
    end

    ## 2) check geo tree with headings hierarchy
    return  unless headings[1]

    geos = split_geo( headings[1] )
    if rec.geos
      if rec.geos[0] != geos[0]
        puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
        exit 1
      end
      if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
        puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
        exit 1
      end
    else
      ## add missing region (state/province) from headings hierarchy
      rec.geos = geos
    end
  end

end  # class GroundReader


end ## module Import
end ## module SportDb