sportdb-formats 1.0.6 → 1.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +6 -33
- data/Rakefile +2 -5
- data/lib/sportdb/formats.rb +54 -70
- data/lib/sportdb/formats/country/country_index.rb +2 -2
- data/lib/sportdb/formats/event/event_index.rb +141 -0
- data/lib/sportdb/formats/event/event_reader.rb +183 -0
- data/lib/sportdb/formats/league/league_index.rb +22 -18
- data/lib/sportdb/formats/league/league_outline_reader.rb +45 -13
- data/lib/sportdb/formats/league/league_reader.rb +7 -1
- data/lib/sportdb/formats/match/match_parser.rb +101 -111
- data/lib/sportdb/formats/package.rb +59 -11
- data/lib/sportdb/formats/parser_helper.rb +11 -2
- data/lib/sportdb/formats/team/club_index.rb +13 -11
- data/lib/sportdb/formats/team/club_index_history.rb +134 -0
- data/lib/sportdb/formats/team/club_reader_history.rb +203 -0
- data/lib/sportdb/formats/team/club_reader_props.rb +20 -5
- data/lib/sportdb/formats/version.rb +2 -2
- data/test/helper.rb +51 -81
- data/test/test_club_index_history.rb +107 -0
- data/test/test_club_reader_history.rb +212 -0
- data/test/test_datafile_package.rb +1 -1
- data/test/test_regex.rb +25 -7
- metadata +9 -78
- data/lib/sportdb/formats/config.rb +0 -40
- data/lib/sportdb/formats/match/match_parser_csv.rb +0 -314
- data/lib/sportdb/formats/name_helper.rb +0 -84
- data/lib/sportdb/formats/score/score_formats.rb +0 -220
- data/lib/sportdb/formats/score/score_parser.rb +0 -202
- data/lib/sportdb/formats/season_utils.rb +0 -27
- data/lib/sportdb/formats/structs/country.rb +0 -31
- data/lib/sportdb/formats/structs/group.rb +0 -18
- data/lib/sportdb/formats/structs/league.rb +0 -37
- data/lib/sportdb/formats/structs/match.rb +0 -151
- data/lib/sportdb/formats/structs/matchlist.rb +0 -220
- data/lib/sportdb/formats/structs/round.rb +0 -25
- data/lib/sportdb/formats/structs/season.rb +0 -123
- data/lib/sportdb/formats/structs/standings.rb +0 -247
- data/lib/sportdb/formats/structs/team.rb +0 -150
- data/lib/sportdb/formats/structs/team_usage.rb +0 -88
- data/test/test_clubs.rb +0 -40
- data/test/test_conf.rb +0 -65
- data/test/test_csv_match_parser.rb +0 -114
- data/test/test_csv_match_parser_utils.rb +0 -20
- data/test/test_csv_reader.rb +0 -31
- data/test/test_match.rb +0 -30
- data/test/test_match_auto.rb +0 -72
- data/test/test_match_auto_champs.rb +0 -45
- data/test/test_match_auto_euro.rb +0 -37
- data/test/test_match_auto_worldcup.rb +0 -61
- data/test/test_match_champs.rb +0 -27
- data/test/test_match_eng.rb +0 -26
- data/test/test_match_euro.rb +0 -27
- data/test/test_match_worldcup.rb +0 -27
- data/test/test_name_helper.rb +0 -67
- data/test/test_scores.rb +0 -122
- data/test/test_season.rb +0 -62
@@ -13,12 +13,22 @@ module SportDb
|
|
13
13
|
## leagues.txt or leagues_en.txt
|
14
14
|
## remove support for en.leagues.txt - why? why not?
|
15
15
|
LEAGUES_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
16
|
-
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.
|
16
|
+
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.leagues.txt
|
17
17
|
leagues
|
18
18
|
(?:_[a-z0-9_-]+)?
|
19
19
|
\.txt$
|
20
20
|
}x
|
21
21
|
|
22
|
+
## seasons.txt or seasons_en.txt
|
23
|
+
## remove support for br.seasons.txt - why? why not?
|
24
|
+
SEASONS_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
25
|
+
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.seasons.txt
|
26
|
+
seasons
|
27
|
+
(?:_[a-z0-9_-]+)?
|
28
|
+
\.txt$
|
29
|
+
}x
|
30
|
+
|
31
|
+
|
22
32
|
## clubs.txt or clubs_en.txt
|
23
33
|
## remove support for en.clubs.txt - why? why not?
|
24
34
|
CLUBS_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
@@ -35,12 +45,22 @@ module SportDb
|
|
35
45
|
\.wiki\.txt$
|
36
46
|
}x
|
37
47
|
|
38
|
-
|
48
|
+
## todo/fix: rename to CLUBS too e.g. CLUBS_PROPS to reflect filename - why? why not?
|
49
|
+
CLUBS_PROPS_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
39
50
|
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.clubs.props.txt
|
40
51
|
clubs
|
41
52
|
(?:_[a-z0-9_-]+)?
|
42
53
|
\.props\.txt$
|
43
54
|
}x
|
55
|
+
CLUB_PROPS_RE = CLUBS_PROPS_RE ## add alias for now (fix later - why? why not?)
|
56
|
+
|
57
|
+
|
58
|
+
CLUBS_HISTORY_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
59
|
+
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.clubs.history.txt
|
60
|
+
clubs
|
61
|
+
(?:_[a-z0-9_-]+)?
|
62
|
+
\.history\.txt$
|
63
|
+
}x
|
44
64
|
|
45
65
|
## teams.txt or teams_history.txt
|
46
66
|
TEAMS_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
@@ -49,6 +69,8 @@ module SportDb
|
|
49
69
|
\.txt$
|
50
70
|
}x
|
51
71
|
|
72
|
+
|
73
|
+
### todo/fix: change SEASON_RE to SEASON_KEY_RE (avoid confusion w/ SEASONS_RE for datafile?) - why? why not? !!!!!!!
|
52
74
|
### season folder:
|
53
75
|
## e.g. /2019-20 or
|
54
76
|
## year-only e.g. /2019 or
|
@@ -73,6 +95,10 @@ module SportDb
|
|
73
95
|
/[a-z0-9_.-]+\.csv$ ## note: allow dot (.) too e.g /eng.1.csv
|
74
96
|
}x
|
75
97
|
|
98
|
+
### add "generic" pattern to find all csv datafiles
|
99
|
+
CSV_RE = %r{ (?: ^|/ )
|
100
|
+
[a-z0-9_.-]+\.csv$ ## note: allow dot (.) too e.g /eng.1.csv
|
101
|
+
}x
|
76
102
|
|
77
103
|
|
78
104
|
## move class-level "static" finders to DirPackage (do NOT work for now for zip packages) - why? why not?
|
@@ -96,16 +122,22 @@ module SportDb
|
|
96
122
|
def self.find_teams( path, pattern: TEAMS_RE ) find( path, pattern ); end
|
97
123
|
def self.match_teams( path ) TEAMS_RE.match( path ); end
|
98
124
|
|
99
|
-
def self.find_clubs( path, pattern: CLUBS_RE )
|
100
|
-
def self.find_clubs_wiki( path, pattern: CLUBS_WIKI_RE )
|
125
|
+
def self.find_clubs( path, pattern: CLUBS_RE ) find( path, pattern ); end
|
126
|
+
def self.find_clubs_wiki( path, pattern: CLUBS_WIKI_RE ) find( path, pattern ); end
|
127
|
+
def self.find_clubs_history( path, pattern: CLUBS_HISTORY_RE ) find( path, pattern ); end
|
101
128
|
|
102
|
-
def self.match_clubs( path )
|
103
|
-
def self.match_clubs_wiki( path )
|
104
|
-
def self.
|
129
|
+
def self.match_clubs( path ) CLUBS_RE.match( path ); end
|
130
|
+
def self.match_clubs_wiki( path ) CLUBS_WIKI_RE.match( path ); end
|
131
|
+
def self.match_clubs_history( path ) CLUBS_HISTORY_RE.match( path); end
|
132
|
+
def self.match_clubs_props( path, pattern: CLUBS_PROPS_RE ) pattern.match( path ); end
|
105
133
|
|
106
134
|
def self.find_leagues( path, pattern: LEAGUES_RE ) find( path, pattern ); end
|
107
135
|
def self.match_leagues( path ) LEAGUES_RE.match( path ); end
|
108
136
|
|
137
|
+
def self.find_seasons( path, pattern: SEASONS_RE ) find( path, pattern ); end
|
138
|
+
def self.match_seasons( path ) SEASONS_RE.match( path ); end
|
139
|
+
|
140
|
+
|
109
141
|
def self.find_conf( path, pattern: CONF_RE ) find( path, pattern ); end
|
110
142
|
def self.match_conf( path ) CONF_RE.match( path ); end
|
111
143
|
|
@@ -118,6 +150,7 @@ module SportDb
|
|
118
150
|
end
|
119
151
|
## add match_match and match_match_csv - why? why not?
|
120
152
|
|
153
|
+
|
121
154
|
class << self
|
122
155
|
alias_method :match_teams?, :match_teams
|
123
156
|
alias_method :teams?, :match_teams
|
@@ -128,12 +161,21 @@ module SportDb
|
|
128
161
|
alias_method :match_clubs_wiki?, :match_clubs_wiki
|
129
162
|
alias_method :clubs_wiki?, :match_clubs_wiki
|
130
163
|
|
131
|
-
alias_method :
|
132
|
-
alias_method :
|
164
|
+
alias_method :match_clubs_history?, :match_clubs_history
|
165
|
+
alias_method :clubs_history?, :match_clubs_history
|
166
|
+
|
167
|
+
alias_method :match_club_props, :match_clubs_props
|
168
|
+
alias_method :match_club_props?, :match_clubs_props
|
169
|
+
alias_method :club_props?, :match_clubs_props
|
170
|
+
alias_method :match_clubs_props?, :match_clubs_props
|
171
|
+
alias_method :clubs_props?, :match_clubs_props
|
133
172
|
|
134
173
|
alias_method :match_leagues?, :match_leagues
|
135
174
|
alias_method :leagues?, :match_leagues
|
136
175
|
|
176
|
+
alias_method :match_seasons?, :match_seasons
|
177
|
+
alias_method :seasons?, :match_seasons
|
178
|
+
|
137
179
|
alias_method :match_conf?, :match_conf
|
138
180
|
alias_method :conf?, :match_conf
|
139
181
|
end
|
@@ -212,11 +254,17 @@ module SportDb
|
|
212
254
|
end
|
213
255
|
end
|
214
256
|
def each_match_csv( &blk ) each( pattern: MATCH_CSV_RE, &blk ); end
|
257
|
+
def each_csv( &blk ) each( pattern: CSV_RE, &blk ); end
|
258
|
+
|
215
259
|
def each_club_props( &blk ) each( pattern: CLUB_PROPS_RE, &blk ); end
|
216
260
|
|
217
261
|
def each_leagues( &blk ) each( pattern: LEAGUES_RE, &blk ); end
|
218
262
|
def each_clubs( &blk ) each( pattern: CLUBS_RE, &blk ); end
|
219
263
|
def each_clubs_wiki( &blk ) each( pattern: CLUBS_WIKI_RE, &blk ); end
|
264
|
+
def each_clubs_history( &blk ) each( pattern: CLUBS_HISTORY_RE, &blk ); end
|
265
|
+
|
266
|
+
def each_seasons( &blk ) each( pattern: SEASONS_RE, &blk ); end
|
267
|
+
|
220
268
|
|
221
269
|
## return all match datafile entries
|
222
270
|
def match( format: 'txt' )
|
@@ -287,13 +335,13 @@ module SportDb
|
|
287
335
|
## filter.skip? filter.include? ( season_sason_key )?
|
288
336
|
## fiteer.before?( season_key ) etc.
|
289
337
|
## find some good method names!!!!
|
290
|
-
season_start = start ?
|
338
|
+
season_start = start ? Season( start ) : nil
|
291
339
|
|
292
340
|
h = {}
|
293
341
|
match( format: format ).each do |entry|
|
294
342
|
## note: assume last directory in datafile path is the season part/key
|
295
343
|
season_q = File.basename( File.dirname( entry.name ))
|
296
|
-
season =
|
344
|
+
season = Season.parse( season_q ) ## normalize season
|
297
345
|
|
298
346
|
## skip if start season before this season
|
299
347
|
next if season_start && season_start.start_year > season.start_year
|
@@ -18,10 +18,19 @@ module SportDb
|
|
18
18
|
|
19
19
|
|
20
20
|
def is_round?( line )
|
21
|
-
## note: =~
|
22
|
-
|
21
|
+
## note: =~ returns nil if not match found, and 0,1, etc for match
|
22
|
+
|
23
|
+
## note: allow "free standing" leg 1 and leg 2 too
|
24
|
+
## (e.g. Hinspiel, Rückspiel etc. used for now in Relegation, for example)
|
25
|
+
## note ONLY allowed if "free standing", that is, full line with nothing else
|
26
|
+
## use "custom" regex for special case for now
|
27
|
+
## avoids match HIN in PascHINg, for example (hin in german for leg 1)
|
28
|
+
line =~ SportDb.lang.regex_round ||
|
29
|
+
line =~ /^(#{SportDb.lang.leg1})$/i ||
|
30
|
+
line =~ /^(#{SportDb.lang.leg2})$/i
|
23
31
|
end
|
24
32
|
|
33
|
+
|
25
34
|
def is_knockout_round?( line )
|
26
35
|
|
27
36
|
## todo: check for adding ignore case for regex (e.g. 1st leg/1st Leg)
|
@@ -54,6 +54,7 @@ class ClubIndex
|
|
54
54
|
## normalize( name )
|
55
55
|
|
56
56
|
def strip_wiki( name ) # todo/check: rename to strip_wikipedia_en - why? why not?
|
57
|
+
## change/rename to strip_wiki_qualifier or such - why? why not?
|
57
58
|
## note: strip disambiguationn qualifier from wikipedia page name if present
|
58
59
|
## note: only remove year and foot... for now
|
59
60
|
## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
|
@@ -178,22 +179,24 @@ class ClubIndex
|
|
178
179
|
## todo/fix/check: use rename to find_canon or find_canonical() or something??
|
179
180
|
## remove (getting used?) - why? why not?
|
180
181
|
def []( name ) ## lookup by canoncial name only; todo/fix: add find alias why? why not?
|
182
|
+
puts "WARN!! do not use ClubIndex#[] for lookup >#{name}< - will get removed!!!"
|
181
183
|
@clubs[ name ]
|
182
184
|
end
|
183
185
|
|
184
186
|
|
185
|
-
## todo/fix/check: return empty array if no match!!!
|
186
|
-
## and NOT nil (add || []) - why? why not?
|
187
187
|
def match( name )
|
188
|
+
# note: returns empty array (e.g. []) if no match and NOT nil
|
188
189
|
name = normalize( name )
|
189
|
-
m = @clubs_by_name[ name ]
|
190
|
+
m = @clubs_by_name[ name ] || []
|
190
191
|
|
191
192
|
## no match - retry with unaccented variant if different
|
192
193
|
## e.g. example is Preussen Münster (with mixed accent and unaccented letters) that would go unmatched for now
|
193
194
|
## Preussen Münster => preussenmünster (norm) => preussenmunster (norm+unaccent)
|
194
|
-
if m.
|
195
|
+
if m.empty?
|
195
196
|
name2 = unaccent( name )
|
196
|
-
|
197
|
+
if name2 != name
|
198
|
+
m = @clubs_by_name[ name2 ] || []
|
199
|
+
end
|
197
200
|
end
|
198
201
|
m
|
199
202
|
end
|
@@ -227,10 +230,8 @@ class ClubIndex
|
|
227
230
|
country = country( country )
|
228
231
|
|
229
232
|
## note: match must for now always include name
|
230
|
-
|
231
|
-
|
232
|
-
m = nil if m.empty? ## note: reset to nil if no more matches
|
233
|
-
end
|
233
|
+
## filter by country
|
234
|
+
m = m.select { |club| club.country.key == country.key }
|
234
235
|
end
|
235
236
|
m
|
236
237
|
end
|
@@ -263,7 +264,7 @@ class ClubIndex
|
|
263
264
|
|
264
265
|
m = match_by( name: name, country: country )
|
265
266
|
|
266
|
-
if m.
|
267
|
+
if m.empty?
|
267
268
|
## (re)try with second country - quick hacks for known leagues
|
268
269
|
## todo/fix: add league flag to activate!!! - why? why not
|
269
270
|
m = match_by( name: name, country: 'wal' ) if country.key == 'eng'
|
@@ -272,6 +273,7 @@ class ClubIndex
|
|
272
273
|
m = match_by( name: name, country: 'mc' ) if country.key == 'fr'
|
273
274
|
m = match_by( name: name, country: 'li' ) if country.key == 'ch'
|
274
275
|
m = match_by( name: name, country: 'ca' ) if country.key == 'us'
|
276
|
+
m = match_by( name: name, country: 'nz' ) if country.key == 'au'
|
275
277
|
end
|
276
278
|
else ## try "global" search - no country passed in
|
277
279
|
m = match( name )
|
@@ -279,7 +281,7 @@ class ClubIndex
|
|
279
281
|
|
280
282
|
|
281
283
|
club = nil
|
282
|
-
if m.
|
284
|
+
if m.empty?
|
283
285
|
## puts "** !!! WARN !!! no match for club >#{name}<"
|
284
286
|
elsif m.size > 1
|
285
287
|
puts "** !!! ERROR - too many matches (#{m.size}) for club >#{name}<:"
|
@@ -0,0 +1,134 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
module Import
|
5
|
+
|
6
|
+
|
7
|
+
class ClubHistoryIndex
|
8
|
+
|
9
|
+
def self.build( path )
|
10
|
+
pack = Package.new( path ) ## lets us use direcotry or zip archive
|
11
|
+
|
12
|
+
recs = []
|
13
|
+
pack.each_clubs_history do |entry|
|
14
|
+
recs += ClubHistoryReader.parse( entry.read )
|
15
|
+
end
|
16
|
+
recs
|
17
|
+
|
18
|
+
index = new
|
19
|
+
index.add( recs )
|
20
|
+
index
|
21
|
+
end
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
def catalog() Import.catalog; end
|
26
|
+
|
27
|
+
## note: keep name history for now separate from
|
28
|
+
## from club struct - why? why not?
|
29
|
+
## later yes, yes, yes, merge name history into club struct!!!!!
|
30
|
+
##
|
31
|
+
## for now the name history is experimental
|
32
|
+
|
33
|
+
|
34
|
+
def initialize
|
35
|
+
@clubs = {} ## clubs (indexed) by canonical name
|
36
|
+
@errors = []
|
37
|
+
end
|
38
|
+
|
39
|
+
attr_reader :errors
|
40
|
+
def errors?() @errors.empty? == false; end
|
41
|
+
|
42
|
+
def mappings() @clubs; end ## todo/check: rename to records or histories or something - why? why not?
|
43
|
+
|
44
|
+
|
45
|
+
def add_history( club_rec, keyword, season, args )
|
46
|
+
## note use season obj for now (and NOT key) - why? why not?
|
47
|
+
rec = @clubs[ club_rec.name ] ||= []
|
48
|
+
|
49
|
+
rec << [season, [keyword, args]]
|
50
|
+
|
51
|
+
## note: always keep records sorted by season_key for now
|
52
|
+
## check if 2010 and 2010/11 is in order using alpha sort?? (see argentina)
|
53
|
+
rec.sort! { |l,r| r[0] <=> l[0] }
|
54
|
+
end
|
55
|
+
|
56
|
+
|
57
|
+
def add( rec_or_recs ) ## add club record / alt_names
|
58
|
+
recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
|
59
|
+
|
60
|
+
recs.each do |rec|
|
61
|
+
|
62
|
+
keyword = rec[0]
|
63
|
+
season_key = rec[1]
|
64
|
+
args = rec[2..-1] ## get rest of args e.g. one, two or more
|
65
|
+
|
66
|
+
## note: for now only add (re)name history season records,
|
67
|
+
## that is, skip MERGE and BANKRUPT for now
|
68
|
+
## and incl. only RENAME, REFORM, MOVE for now
|
69
|
+
next if ['MERGE', 'BANKRUPT'].include?( keyword )
|
70
|
+
|
71
|
+
|
72
|
+
name_old = strip_geo( args[0][0] ) ## note: strip optional geo part from name
|
73
|
+
name_new = strip_geo( args[1][0] )
|
74
|
+
|
75
|
+
country_old = args[0][1]
|
76
|
+
country_new = args[1][1]
|
77
|
+
|
78
|
+
club_old = catalog.clubs.find_by!( name: name_old, country: country_old )
|
79
|
+
club_new = catalog.clubs.find_by!( name: name_new, country: country_new )
|
80
|
+
|
81
|
+
## note use season obj for now (and NOT key) - why? why not?
|
82
|
+
season = Season.parse( season_key )
|
83
|
+
|
84
|
+
## todo/check:
|
85
|
+
## check if club_old and club_new reference different club record!!
|
86
|
+
## examples - RB II -> Liefering ?? or
|
87
|
+
## FC Pasching -> OOE Juniors ??
|
88
|
+
## Austria Salzburg -> RB Salburg ??
|
89
|
+
## for now always add name history to both - why? why not?
|
90
|
+
|
91
|
+
add_history( club_old, keyword, season, args )
|
92
|
+
## note: allow for now different club references
|
93
|
+
## but maybe warn later - why? why not?
|
94
|
+
## add history to both for now
|
95
|
+
add_history( club_new, keyword, season, args ) if club_old != club_new
|
96
|
+
end # each rec
|
97
|
+
end # method add
|
98
|
+
|
99
|
+
|
100
|
+
#### todo/check: move as method to club struct later - to always use club reference
|
101
|
+
## returns (simply) name as string for now or nil - why? why not?
|
102
|
+
#
|
103
|
+
# history entry example
|
104
|
+
# Arsenal FC"=>
|
105
|
+
# [[1927/28, ["RENAME", [["The Arsenal FC, London", "eng"], ["Arsenal FC", "eng"]]]],
|
106
|
+
# [1914/15, ["RENAME", [["Woolwich Arsenal FC, London", "eng"], ["The Arsenal FC", "eng"]]]],
|
107
|
+
# [1892/93, ["RENAME", [["Royal Arsenal FC, London", "eng"], ["Woolwich Arsenal FC", "eng"]]]]],
|
108
|
+
def find_name_by( name:, season: )
|
109
|
+
recs = @clubs[ name ]
|
110
|
+
if recs
|
111
|
+
season = Season( season ) ## make sure season is a season obj (and NOT a string)
|
112
|
+
## check season records for name; use linear search (assume only few records)
|
113
|
+
recs.each do |rec|
|
114
|
+
if season >= rec[0]
|
115
|
+
return strip_geo( rec[1][1][1][0] ) # use second arg
|
116
|
+
end
|
117
|
+
end
|
118
|
+
## if we get here use last name
|
119
|
+
strip_geo( recs[-1][1][1][0][0] ) # use first arg
|
120
|
+
else
|
121
|
+
nil
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
##################
|
126
|
+
## helpers
|
127
|
+
def strip_geo( name )
|
128
|
+
## e.g. Arsenal, London => Arsenal
|
129
|
+
name.split(',')[0].strip
|
130
|
+
end
|
131
|
+
end # class ClubHistoryIndex
|
132
|
+
|
133
|
+
end # module Import
|
134
|
+
end # module SportDb
|
@@ -0,0 +1,203 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class ClubHistoryReader
|
9
|
+
|
10
|
+
def catalog() Import.catalog; end
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
15
|
+
txt = File.open( path, 'r:utf-8' ) { |f| f.read }
|
16
|
+
parse( txt )
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.parse( txt )
|
20
|
+
new( txt ).parse
|
21
|
+
end
|
22
|
+
|
23
|
+
def initialize( txt )
|
24
|
+
@txt = txt
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
###
|
29
|
+
## RENAME/RENAMED
|
30
|
+
## MOVE/MOVED
|
31
|
+
## BANKRUPT/BANKRUPTED
|
32
|
+
## REFORM/REFORMED
|
33
|
+
## MERGE/MERGED - allow + or ++ or +++ or ; for "inline" - why? why not?
|
34
|
+
|
35
|
+
|
36
|
+
KEYWORD_LINE_RE = %r{ ^(?<keyword>RENAMED?|
|
37
|
+
MOVED?|
|
38
|
+
BANKRUPT(?:ED)?|
|
39
|
+
REFORM(?:ED)?|
|
40
|
+
MERGED?
|
41
|
+
)
|
42
|
+
[ ]+
|
43
|
+
(?<text>.*) # rest of text
|
44
|
+
$
|
45
|
+
}x
|
46
|
+
|
47
|
+
|
48
|
+
def parse
|
49
|
+
recs = []
|
50
|
+
last_rec = nil
|
51
|
+
|
52
|
+
last_country = nil
|
53
|
+
last_season = nil
|
54
|
+
last_keyword = nil
|
55
|
+
last_teams = []
|
56
|
+
|
57
|
+
OutlineReader.parse( @txt ).each do |node|
|
58
|
+
if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
|
59
|
+
heading_level = node[0][1].to_i
|
60
|
+
heading = node[1]
|
61
|
+
|
62
|
+
puts "heading #{heading_level} >#{heading}<"
|
63
|
+
|
64
|
+
|
65
|
+
if heading_level == 1
|
66
|
+
## assume country in heading; allow all "formats" supported by parse e.g.
|
67
|
+
## Österreich • Austria (at)
|
68
|
+
## Österreich • Austria
|
69
|
+
## Austria
|
70
|
+
## Deutschland (de) • Germany
|
71
|
+
country = catalog.countries.parse( heading )
|
72
|
+
## check country code - MUST exist for now!!!!
|
73
|
+
if country.nil?
|
74
|
+
puts "!!! error [club history reader] - unknown country >#{heading}< - sorry - add country to config to fix"
|
75
|
+
exit 1
|
76
|
+
end
|
77
|
+
puts " country >#{heading}< => #{country.name}, #{country.key}"
|
78
|
+
last_country = country
|
79
|
+
last_season = nil ## reset "lower levels" - season & keyword
|
80
|
+
last_keyword = nil
|
81
|
+
elsif heading_level == 2
|
82
|
+
## assume season
|
83
|
+
season = Season.parse( heading )
|
84
|
+
puts " season >#{heading}< => #{season.key}"
|
85
|
+
last_season = season ## reset "lowwer levels" - keyword
|
86
|
+
last_keyword = nil
|
87
|
+
else
|
88
|
+
puts "!!! ERROR [club history reader] - for now only heading 1 & 2 supported; sorry"
|
89
|
+
exit 1
|
90
|
+
end
|
91
|
+
|
92
|
+
elsif node[0] == :p ## paragraph with (text) lines
|
93
|
+
if last_country.nil?
|
94
|
+
puts "!!! ERROR [club history reader] - country heading 1 required, sorry"
|
95
|
+
exit 1
|
96
|
+
end
|
97
|
+
if last_season.nil?
|
98
|
+
puts "!!! ERROR [club history reader] - season heading 2 required, sorry"
|
99
|
+
exit 1
|
100
|
+
end
|
101
|
+
|
102
|
+
lines = node[1]
|
103
|
+
lines.each do |line|
|
104
|
+
if m=line.match(KEYWORD_LINE_RE) ## extract keyword and continue
|
105
|
+
keyword = m[:keyword]
|
106
|
+
line = m[:text].strip
|
107
|
+
|
108
|
+
puts " keyword #{keyword}"
|
109
|
+
last_keyword = case keyword ## "normalize" keywords
|
110
|
+
when 'BANKRUPT', 'BANKRUPTED'
|
111
|
+
'BANKRUPT'
|
112
|
+
when 'RENAME', 'RENAMED'
|
113
|
+
'RENAME'
|
114
|
+
when 'REFORM', 'REFORMED'
|
115
|
+
'REFORM'
|
116
|
+
when 'MOVE', 'MOVED'
|
117
|
+
'MOVE'
|
118
|
+
when 'MERGE', 'MERGED'
|
119
|
+
'MERGE'
|
120
|
+
else
|
121
|
+
puts "!!! ERROR [club history reader] - unexpected keyword >#{keyword}<; sorry - don't know how to normalize"
|
122
|
+
exit 1
|
123
|
+
end
|
124
|
+
|
125
|
+
last_teams = []
|
126
|
+
end
|
127
|
+
|
128
|
+
if last_keyword.nil?
|
129
|
+
puts "!!! ERROR [club history reader] - line with keyword expected - got:"
|
130
|
+
puts line
|
131
|
+
exit 1
|
132
|
+
end
|
133
|
+
|
134
|
+
if last_keyword == 'BANKRUPT'
|
135
|
+
## requires / expects one team in one line
|
136
|
+
recs << [ last_keyword, last_season.key,
|
137
|
+
[ squish(line), last_country.key ]
|
138
|
+
]
|
139
|
+
elsif last_keyword == 'RENAME' ||
|
140
|
+
last_keyword == 'REFORM' ||
|
141
|
+
last_keyword == 'MOVE'
|
142
|
+
## requires / expects two teams in one line (separated by ⇒ or such)
|
143
|
+
teams = line.split( '⇒' )
|
144
|
+
if teams.size != 2
|
145
|
+
puts "!!! ERROR [club history reader] - expected two teams - got:"
|
146
|
+
pp teams
|
147
|
+
exit 1
|
148
|
+
end
|
149
|
+
teams = teams.map {|team| squish(team.strip) } ## remove whitespaces
|
150
|
+
recs << [ last_keyword, last_season.key,
|
151
|
+
[ teams[0], last_country.key ],
|
152
|
+
[ teams[1], last_country.key ]
|
153
|
+
]
|
154
|
+
elsif last_keyword == 'MERGE'
|
155
|
+
## check if line starts with separator
|
156
|
+
## otherwise collect to be merged teams
|
157
|
+
if line.start_with?( '⇒' )
|
158
|
+
if last_teams.size < 2
|
159
|
+
puts "!!! ERROR [club history reader] - expected two or more teams for MERGE - got:"
|
160
|
+
pp last_teams
|
161
|
+
exit 1
|
162
|
+
end
|
163
|
+
## auto-add country to all teams
|
164
|
+
teams = last_teams.map {|team| [team, last_country.key]}
|
165
|
+
recs << [ last_keyword, last_season.key,
|
166
|
+
teams,
|
167
|
+
[ squish(line.sub('⇒','').strip), last_country.key ]
|
168
|
+
]
|
169
|
+
|
170
|
+
last_teams = []
|
171
|
+
else
|
172
|
+
last_teams << squish(line)
|
173
|
+
end
|
174
|
+
else
|
175
|
+
puts "!!! ERROR [club history reader] - unknown keyword >#{last_keyword}<; cannot process; sorry"
|
176
|
+
exit 1
|
177
|
+
end
|
178
|
+
end # each line (in paragraph)
|
179
|
+
else
|
180
|
+
puts "** !!! ERROR [club history reader] - unknown line type:"
|
181
|
+
pp node
|
182
|
+
exit 1
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
recs
|
187
|
+
end # method read
|
188
|
+
|
189
|
+
|
190
|
+
###############
|
191
|
+
## helper
|
192
|
+
|
193
|
+
def squish( str )
|
194
|
+
## colapse all whitespace to one
|
195
|
+
str.gsub( /[ ]+/,' ' )
|
196
|
+
end
|
197
|
+
|
198
|
+
|
199
|
+
end # class ClubHistoryReader
|
200
|
+
|
201
|
+
|
202
|
+
end ## module Import
|
203
|
+
end ## module SportDb
|