sportdb-formats 1.0.6 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +6 -33
- data/Rakefile +2 -5
- data/lib/sportdb/formats.rb +54 -70
- data/lib/sportdb/formats/country/country_index.rb +2 -2
- data/lib/sportdb/formats/event/event_index.rb +141 -0
- data/lib/sportdb/formats/event/event_reader.rb +183 -0
- data/lib/sportdb/formats/league/league_index.rb +22 -18
- data/lib/sportdb/formats/league/league_outline_reader.rb +45 -13
- data/lib/sportdb/formats/league/league_reader.rb +7 -1
- data/lib/sportdb/formats/match/match_parser.rb +101 -111
- data/lib/sportdb/formats/package.rb +59 -11
- data/lib/sportdb/formats/parser_helper.rb +11 -2
- data/lib/sportdb/formats/team/club_index.rb +13 -11
- data/lib/sportdb/formats/team/club_index_history.rb +134 -0
- data/lib/sportdb/formats/team/club_reader_history.rb +203 -0
- data/lib/sportdb/formats/team/club_reader_props.rb +20 -5
- data/lib/sportdb/formats/version.rb +2 -2
- data/test/helper.rb +51 -81
- data/test/test_club_index_history.rb +107 -0
- data/test/test_club_reader_history.rb +212 -0
- data/test/test_datafile_package.rb +1 -1
- data/test/test_regex.rb +25 -7
- metadata +9 -78
- data/lib/sportdb/formats/config.rb +0 -40
- data/lib/sportdb/formats/match/match_parser_csv.rb +0 -314
- data/lib/sportdb/formats/name_helper.rb +0 -84
- data/lib/sportdb/formats/score/score_formats.rb +0 -220
- data/lib/sportdb/formats/score/score_parser.rb +0 -202
- data/lib/sportdb/formats/season_utils.rb +0 -27
- data/lib/sportdb/formats/structs/country.rb +0 -31
- data/lib/sportdb/formats/structs/group.rb +0 -18
- data/lib/sportdb/formats/structs/league.rb +0 -37
- data/lib/sportdb/formats/structs/match.rb +0 -151
- data/lib/sportdb/formats/structs/matchlist.rb +0 -220
- data/lib/sportdb/formats/structs/round.rb +0 -25
- data/lib/sportdb/formats/structs/season.rb +0 -123
- data/lib/sportdb/formats/structs/standings.rb +0 -247
- data/lib/sportdb/formats/structs/team.rb +0 -150
- data/lib/sportdb/formats/structs/team_usage.rb +0 -88
- data/test/test_clubs.rb +0 -40
- data/test/test_conf.rb +0 -65
- data/test/test_csv_match_parser.rb +0 -114
- data/test/test_csv_match_parser_utils.rb +0 -20
- data/test/test_csv_reader.rb +0 -31
- data/test/test_match.rb +0 -30
- data/test/test_match_auto.rb +0 -72
- data/test/test_match_auto_champs.rb +0 -45
- data/test/test_match_auto_euro.rb +0 -37
- data/test/test_match_auto_worldcup.rb +0 -61
- data/test/test_match_champs.rb +0 -27
- data/test/test_match_eng.rb +0 -26
- data/test/test_match_euro.rb +0 -27
- data/test/test_match_worldcup.rb +0 -27
- data/test/test_name_helper.rb +0 -67
- data/test/test_scores.rb +0 -122
- data/test/test_season.rb +0 -62
@@ -13,12 +13,22 @@ module SportDb
|
|
13
13
|
## leagues.txt or leagues_en.txt
|
14
14
|
## remove support for en.leagues.txt - why? why not?
|
15
15
|
LEAGUES_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
16
|
-
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.
|
16
|
+
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.leagues.txt
|
17
17
|
leagues
|
18
18
|
(?:_[a-z0-9_-]+)?
|
19
19
|
\.txt$
|
20
20
|
}x
|
21
21
|
|
22
|
+
## seasons.txt or seasons_en.txt
|
23
|
+
## remove support for br.seasons.txt - why? why not?
|
24
|
+
SEASONS_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
25
|
+
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.seasons.txt
|
26
|
+
seasons
|
27
|
+
(?:_[a-z0-9_-]+)?
|
28
|
+
\.txt$
|
29
|
+
}x
|
30
|
+
|
31
|
+
|
22
32
|
## clubs.txt or clubs_en.txt
|
23
33
|
## remove support for en.clubs.txt - why? why not?
|
24
34
|
CLUBS_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
@@ -35,12 +45,22 @@ module SportDb
|
|
35
45
|
\.wiki\.txt$
|
36
46
|
}x
|
37
47
|
|
38
|
-
|
48
|
+
## todo/fix: rename to CLUBS too e.g. CLUBS_PROPS to reflect filename - why? why not?
|
49
|
+
CLUBS_PROPS_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
39
50
|
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.clubs.props.txt
|
40
51
|
clubs
|
41
52
|
(?:_[a-z0-9_-]+)?
|
42
53
|
\.props\.txt$
|
43
54
|
}x
|
55
|
+
CLUB_PROPS_RE = CLUBS_PROPS_RE ## add alias for now (fix later - why? why not?)
|
56
|
+
|
57
|
+
|
58
|
+
CLUBS_HISTORY_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
59
|
+
(?: [a-z]{1,4}\. )? # optional country code/key e.g. eng.clubs.history.txt
|
60
|
+
clubs
|
61
|
+
(?:_[a-z0-9_-]+)?
|
62
|
+
\.history\.txt$
|
63
|
+
}x
|
44
64
|
|
45
65
|
## teams.txt or teams_history.txt
|
46
66
|
TEAMS_RE = %r{ (?: ^|/ ) # beginning (^) or beginning of path (/)
|
@@ -49,6 +69,8 @@ module SportDb
|
|
49
69
|
\.txt$
|
50
70
|
}x
|
51
71
|
|
72
|
+
|
73
|
+
### todo/fix: change SEASON_RE to SEASON_KEY_RE (avoid confusion w/ SEASONS_RE for datafile?) - why? why not? !!!!!!!
|
52
74
|
### season folder:
|
53
75
|
## e.g. /2019-20 or
|
54
76
|
## year-only e.g. /2019 or
|
@@ -73,6 +95,10 @@ module SportDb
|
|
73
95
|
/[a-z0-9_.-]+\.csv$ ## note: allow dot (.) too e.g /eng.1.csv
|
74
96
|
}x
|
75
97
|
|
98
|
+
### add "generic" pattern to find all csv datafiles
|
99
|
+
CSV_RE = %r{ (?: ^|/ )
|
100
|
+
[a-z0-9_.-]+\.csv$ ## note: allow dot (.) too e.g /eng.1.csv
|
101
|
+
}x
|
76
102
|
|
77
103
|
|
78
104
|
## move class-level "static" finders to DirPackage (do NOT work for now for zip packages) - why? why not?
|
@@ -96,16 +122,22 @@ module SportDb
|
|
96
122
|
def self.find_teams( path, pattern: TEAMS_RE ) find( path, pattern ); end
|
97
123
|
def self.match_teams( path ) TEAMS_RE.match( path ); end
|
98
124
|
|
99
|
-
def self.find_clubs( path, pattern: CLUBS_RE )
|
100
|
-
def self.find_clubs_wiki( path, pattern: CLUBS_WIKI_RE )
|
125
|
+
def self.find_clubs( path, pattern: CLUBS_RE ) find( path, pattern ); end
|
126
|
+
def self.find_clubs_wiki( path, pattern: CLUBS_WIKI_RE ) find( path, pattern ); end
|
127
|
+
def self.find_clubs_history( path, pattern: CLUBS_HISTORY_RE ) find( path, pattern ); end
|
101
128
|
|
102
|
-
def self.match_clubs( path )
|
103
|
-
def self.match_clubs_wiki( path )
|
104
|
-
def self.
|
129
|
+
def self.match_clubs( path ) CLUBS_RE.match( path ); end
|
130
|
+
def self.match_clubs_wiki( path ) CLUBS_WIKI_RE.match( path ); end
|
131
|
+
def self.match_clubs_history( path ) CLUBS_HISTORY_RE.match( path); end
|
132
|
+
def self.match_clubs_props( path, pattern: CLUBS_PROPS_RE ) pattern.match( path ); end
|
105
133
|
|
106
134
|
def self.find_leagues( path, pattern: LEAGUES_RE ) find( path, pattern ); end
|
107
135
|
def self.match_leagues( path ) LEAGUES_RE.match( path ); end
|
108
136
|
|
137
|
+
def self.find_seasons( path, pattern: SEASONS_RE ) find( path, pattern ); end
|
138
|
+
def self.match_seasons( path ) SEASONS_RE.match( path ); end
|
139
|
+
|
140
|
+
|
109
141
|
def self.find_conf( path, pattern: CONF_RE ) find( path, pattern ); end
|
110
142
|
def self.match_conf( path ) CONF_RE.match( path ); end
|
111
143
|
|
@@ -118,6 +150,7 @@ module SportDb
|
|
118
150
|
end
|
119
151
|
## add match_match and match_match_csv - why? why not?
|
120
152
|
|
153
|
+
|
121
154
|
class << self
|
122
155
|
alias_method :match_teams?, :match_teams
|
123
156
|
alias_method :teams?, :match_teams
|
@@ -128,12 +161,21 @@ module SportDb
|
|
128
161
|
alias_method :match_clubs_wiki?, :match_clubs_wiki
|
129
162
|
alias_method :clubs_wiki?, :match_clubs_wiki
|
130
163
|
|
131
|
-
alias_method :
|
132
|
-
alias_method :
|
164
|
+
alias_method :match_clubs_history?, :match_clubs_history
|
165
|
+
alias_method :clubs_history?, :match_clubs_history
|
166
|
+
|
167
|
+
alias_method :match_club_props, :match_clubs_props
|
168
|
+
alias_method :match_club_props?, :match_clubs_props
|
169
|
+
alias_method :club_props?, :match_clubs_props
|
170
|
+
alias_method :match_clubs_props?, :match_clubs_props
|
171
|
+
alias_method :clubs_props?, :match_clubs_props
|
133
172
|
|
134
173
|
alias_method :match_leagues?, :match_leagues
|
135
174
|
alias_method :leagues?, :match_leagues
|
136
175
|
|
176
|
+
alias_method :match_seasons?, :match_seasons
|
177
|
+
alias_method :seasons?, :match_seasons
|
178
|
+
|
137
179
|
alias_method :match_conf?, :match_conf
|
138
180
|
alias_method :conf?, :match_conf
|
139
181
|
end
|
@@ -212,11 +254,17 @@ module SportDb
|
|
212
254
|
end
|
213
255
|
end
|
214
256
|
def each_match_csv( &blk ) each( pattern: MATCH_CSV_RE, &blk ); end
|
257
|
+
def each_csv( &blk ) each( pattern: CSV_RE, &blk ); end
|
258
|
+
|
215
259
|
def each_club_props( &blk ) each( pattern: CLUB_PROPS_RE, &blk ); end
|
216
260
|
|
217
261
|
def each_leagues( &blk ) each( pattern: LEAGUES_RE, &blk ); end
|
218
262
|
def each_clubs( &blk ) each( pattern: CLUBS_RE, &blk ); end
|
219
263
|
def each_clubs_wiki( &blk ) each( pattern: CLUBS_WIKI_RE, &blk ); end
|
264
|
+
def each_clubs_history( &blk ) each( pattern: CLUBS_HISTORY_RE, &blk ); end
|
265
|
+
|
266
|
+
def each_seasons( &blk ) each( pattern: SEASONS_RE, &blk ); end
|
267
|
+
|
220
268
|
|
221
269
|
## return all match datafile entries
|
222
270
|
def match( format: 'txt' )
|
@@ -287,13 +335,13 @@ module SportDb
|
|
287
335
|
## filter.skip? filter.include? ( season_sason_key )?
|
288
336
|
## filter.before?( season_key ) etc.
|
289
337
|
## find some good method names!!!!
|
290
|
-
season_start = start ?
|
338
|
+
season_start = start ? Season( start ) : nil
|
291
339
|
|
292
340
|
h = {}
|
293
341
|
match( format: format ).each do |entry|
|
294
342
|
## note: assume last directory in datafile path is the season part/key
|
295
343
|
season_q = File.basename( File.dirname( entry.name ))
|
296
|
-
season =
|
344
|
+
season = Season.parse( season_q ) ## normalize season
|
297
345
|
|
298
346
|
## skip if start season before this season
|
299
347
|
next if season_start && season_start.start_year > season.start_year
|
@@ -18,10 +18,19 @@ module SportDb
|
|
18
18
|
|
19
19
|
|
20
20
|
def is_round?( line )
|
21
|
-
## note: =~
|
22
|
-
|
21
|
+
## note: =~ returns nil if no match is found, and 0, 1, etc. (the match position) for a match
|
22
|
+
|
23
|
+
## note: allow "free standing" leg 1 and leg 2 too
|
24
|
+
## (e.g. Hinspiel, Rückspiel etc. used for now in Relegation, for example)
|
25
|
+
## note ONLY allowed if "free standing", that is, full line with nothing else
|
26
|
+
## use "custom" regex for special case for now
|
27
|
+
## avoids match HIN in PascHINg, for example (hin in german for leg 1)
|
28
|
+
line =~ SportDb.lang.regex_round ||
|
29
|
+
line =~ /^(#{SportDb.lang.leg1})$/i ||
|
30
|
+
line =~ /^(#{SportDb.lang.leg2})$/i
|
23
31
|
end
|
24
32
|
|
33
|
+
|
25
34
|
def is_knockout_round?( line )
|
26
35
|
|
27
36
|
## todo: check for adding ignore case for regex (e.g. 1st leg/1st Leg)
|
@@ -54,6 +54,7 @@ class ClubIndex
|
|
54
54
|
## normalize( name )
|
55
55
|
|
56
56
|
def strip_wiki( name ) # todo/check: rename to strip_wikipedia_en - why? why not?
|
57
|
+
## change/rename to strip_wiki_qualifier or such - why? why not?
|
57
58
|
## note: strip disambiguation qualifier from wikipedia page name if present
|
58
59
|
## note: only remove year and foot... for now
|
59
60
|
## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
|
@@ -178,22 +179,24 @@ class ClubIndex
|
|
178
179
|
## todo/fix/check: use rename to find_canon or find_canonical() or something??
|
179
180
|
## remove (getting used?) - why? why not?
|
180
181
|
def []( name ) ## lookup by canoncial name only; todo/fix: add find alias why? why not?
|
182
|
+
puts "WARN!! do not use ClubIndex#[] for lookup >#{name}< - will get removed!!!"
|
181
183
|
@clubs[ name ]
|
182
184
|
end
|
183
185
|
|
184
186
|
|
185
|
-
## todo/fix/check: return empty array if no match!!!
|
186
|
-
## and NOT nil (add || []) - why? why not?
|
187
187
|
def match( name )
|
188
|
+
# note: returns empty array (e.g. []) if no match and NOT nil
|
188
189
|
name = normalize( name )
|
189
|
-
m = @clubs_by_name[ name ]
|
190
|
+
m = @clubs_by_name[ name ] || []
|
190
191
|
|
191
192
|
## no match - retry with unaccented variant if different
|
192
193
|
## e.g. example is Preussen Münster (with mixed accent and unaccented letters) that would go unmatched for now
|
193
194
|
## Preussen Münster => preussenmünster (norm) => preussenmunster (norm+unaccent)
|
194
|
-
if m.
|
195
|
+
if m.empty?
|
195
196
|
name2 = unaccent( name )
|
196
|
-
|
197
|
+
if name2 != name
|
198
|
+
m = @clubs_by_name[ name2 ] || []
|
199
|
+
end
|
197
200
|
end
|
198
201
|
m
|
199
202
|
end
|
@@ -227,10 +230,8 @@ class ClubIndex
|
|
227
230
|
country = country( country )
|
228
231
|
|
229
232
|
## note: match must for now always include name
|
230
|
-
|
231
|
-
|
232
|
-
m = nil if m.empty? ## note: reset to nil if no more matches
|
233
|
-
end
|
233
|
+
## filter by country
|
234
|
+
m = m.select { |club| club.country.key == country.key }
|
234
235
|
end
|
235
236
|
m
|
236
237
|
end
|
@@ -263,7 +264,7 @@ class ClubIndex
|
|
263
264
|
|
264
265
|
m = match_by( name: name, country: country )
|
265
266
|
|
266
|
-
if m.
|
267
|
+
if m.empty?
|
267
268
|
## (re)try with second country - quick hacks for known leagues
|
268
269
|
## todo/fix: add league flag to activate!!! - why? why not
|
269
270
|
m = match_by( name: name, country: 'wal' ) if country.key == 'eng'
|
@@ -272,6 +273,7 @@ class ClubIndex
|
|
272
273
|
m = match_by( name: name, country: 'mc' ) if country.key == 'fr'
|
273
274
|
m = match_by( name: name, country: 'li' ) if country.key == 'ch'
|
274
275
|
m = match_by( name: name, country: 'ca' ) if country.key == 'us'
|
276
|
+
m = match_by( name: name, country: 'nz' ) if country.key == 'au'
|
275
277
|
end
|
276
278
|
else ## try "global" search - no country passed in
|
277
279
|
m = match( name )
|
@@ -279,7 +281,7 @@ class ClubIndex
|
|
279
281
|
|
280
282
|
|
281
283
|
club = nil
|
282
|
-
if m.
|
284
|
+
if m.empty?
|
283
285
|
## puts "** !!! WARN !!! no match for club >#{name}<"
|
284
286
|
elsif m.size > 1
|
285
287
|
puts "** !!! ERROR - too many matches (#{m.size}) for club >#{name}<:"
|
@@ -0,0 +1,134 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
module Import


##
# Index of club name histories, keyed by the canonical club name.
#
# Every entry maps a club name to a list of
#   [season, [keyword, args]]
# pairs kept sorted by season in DESCENDING order (latest first).
#
# note: keep name history for now separate from
#       the club struct - why? why not?
#       later yes, yes, yes, merge name history into club struct!!!!!
#
# for now the name history is experimental
class ClubHistoryIndex

  ## Builds an index from all club history datafiles found in +path+.
  ##   Package and ClubHistoryReader are defined elsewhere in the library;
  ##   NOTE(review): path is presumably a directory or zip archive - confirm against Package.
  def self.build( path )
    pack = Package.new( path )   ## lets us use directory or zip archive

    recs = []
    pack.each_clubs_history do |entry|
      recs += ClubHistoryReader.parse( entry.read )
    end

    index = new
    index.add( recs )
    index
  end


  ## shortcut to the (global) catalog used for club lookups
  def catalog() Import.catalog; end


  def initialize
    @clubs  = {}    ## name histories (indexed) by canonical club name
    @errors = []
  end

  attr_reader :errors

  ## returns true if any errors have been recorded
  def errors?() !@errors.empty?; end

  ## returns the (raw) hash of club name => history records
  def mappings() @clubs; end   ## todo/check: rename to records or histories or something - why? why not?


  ## Appends one history entry to the club's list and re-sorts the list.
  ##   club_rec - object responding to #name (used as index key)
  ##   keyword  - e.g. 'RENAME', 'REFORM', 'MOVE'
  ##   season   - season object (and NOT key); must support <=>
  ##   args     - e.g. [[name_old, country_old], [name_new, country_new]]
  ## Returns the club's (sorted) history list.
  def add_history( club_rec, keyword, season, args )
    ## note use season obj for now (and NOT key) - why? why not?
    history = @clubs[ club_rec.name ] ||= []

    history << [season, [keyword, args]]

    ## note: always keep records sorted by season (latest season first)
    ##   check if 2010 and 2010/11 is in order using alpha sort?? (see argentina)
    history.sort! { |l,r| r[0] <=> l[0] }
  end


  ## Adds one history record or an array of history records.
  ##
  ## A record is an array of the form
  ##   [keyword, season_key, arg1, arg2, ...]    e.g.
  ##   ['RENAME', '1927/28', ['The Arsenal FC, London', 'eng'], ['Arsenal FC', 'eng']]
  ##
  ## note: a single record is itself an array - it gets told apart from
  ##   a list of records by its first element (the keyword string);
  ##   checking is_a?( Array ) only would iterate over the record's fields.
  def add( rec_or_recs )
    recs = if rec_or_recs.is_a?( Array ) && !rec_or_recs.first.is_a?( String )
             rec_or_recs
           else
             [rec_or_recs]   ## wrap (single) rec in array
           end

    recs.each do |rec|
      keyword    = rec[0]
      season_key = rec[1]
      args       = rec[2..-1]   ## get rest of args e.g. one, two or more

      ## note: for now only add (re)name history season records,
      ##   that is, skip MERGE and BANKRUPT for now
      ##   and incl. only RENAME, REFORM, MOVE for now
      next if ['MERGE', 'BANKRUPT'].include?( keyword )

      name_old = strip_geo( args[0][0] )   ## note: strip optional geo part from name
      name_new = strip_geo( args[1][0] )

      country_old = args[0][1]
      country_new = args[1][1]

      ## NOTE(review): find_by! presumably raises if no club matches - confirm in ClubIndex
      club_old = catalog.clubs.find_by!( name: name_old, country: country_old )
      club_new = catalog.clubs.find_by!( name: name_new, country: country_new )

      ## note use season obj for now (and NOT key) - why? why not?
      season = Season.parse( season_key )

      ## todo/check:
      ##   check if club_old and club_new reference different club record!!
      ##    examples - RB II -> Liefering ?? or
      ##               FC Pasching -> OOE Juniors ??
      ##               Austria Salzburg -> RB Salzburg ??
      ##   for now always add name history to both - why? why not?

      add_history( club_old, keyword, season, args )
      ## note: allow for now different club references
      ##   but maybe warn later - why? why not?
      ##   add history to both for now
      add_history( club_new, keyword, season, args )  if club_old != club_new
    end # each rec
  end # method add


  #### todo/check: move as method to club struct later - to always use club reference
  ##  returns (simply) name as string for now or nil - why? why not?
  #
  #  history entry example
  #  "Arsenal FC"=>
  #   [[1927/28, ["RENAME", [["The Arsenal FC, London", "eng"], ["Arsenal FC", "eng"]]]],
  #    [1914/15, ["RENAME", [["Woolwich Arsenal FC, London", "eng"], ["The Arsenal FC", "eng"]]]],
  #    [1892/93, ["RENAME", [["Royal Arsenal FC, London", "eng"], ["Woolwich Arsenal FC", "eng"]]]]],
  #
  #  Returns the new name of the latest history entry starting in or before
  #  the season, or the old name of the earliest entry if the season
  #  predates all entries, or nil if the club name has no history.
  def find_name_by( name:, season: )
    recs = @clubs[ name ]
    return nil unless recs

    season = Season( season )   ## make sure season is a season obj (and NOT a string)

    ## check season records for name; use linear search (assume only few records)
    ##   note: records are sorted latest first - first hit is the record in effect
    hit = recs.find { |rec| season >= rec[0] }
    return strip_geo( hit[1][1][1][0] )   if hit    # use second arg (new name)

    ## if we get here use the old name of the last (that is, earliest) record
    strip_geo( recs[-1][1][1][0][0] )    # use first arg (old name)
  end


  ##################
  ## helpers

  ## strips the optional geo part from a name
  ##   e.g. Arsenal, London => Arsenal
  def strip_geo( name )
    name.split(',')[0].strip
  end
end # class ClubHistoryIndex

end   # module Import
end   # module SportDb
|
@@ -0,0 +1,203 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
module Import


##
# Reads club history datafiles (e.g. clubs.history.txt) in outline format:
#
#   heading 1   => country
#   heading 2   => season
#   paragraphs  => keyword lines (RENAME, MOVE, BANKRUPT, REFORM, MERGE)
#
# and returns an array of records of the form
#
#   [keyword, season_key, [name, country_key]]                              # BANKRUPT
#   [keyword, season_key, [name_old, country_key], [name_new, country_key]] # RENAME, REFORM, MOVE
#   [keyword, season_key, [[name, country_key], ...], [name_new, country_key]] # MERGE
#
# note: on format errors the reader prints a message and terminates
#       the process via exit 1.
class ClubHistoryReader

  ## shortcut to the (global) catalog used for country lookups
  def catalog() Import.catalog; end


  ## reads the file at path (as utf-8) and returns the parsed records
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end

  ## parses the text and returns the parsed records
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end


  ###
  ## RENAME/RENAMED
  ## MOVE/MOVED
  ## BANKRUPT/BANKRUPTED
  ## REFORM/REFORMED
  ## MERGE/MERGED  - allow + or ++ or +++ or ; for "inline" - why? why not?

  ## note: the keyword MUST be followed by one or more spaces and the rest of the text
  KEYWORD_LINE_RE = %r{ ^(?<keyword>RENAMED?|
                                    MOVED?|
                                    BANKRUPT(?:ED)?|
                                    REFORM(?:ED)?|
                                    MERGED?
                          )
                          [ ]+
                          (?<text>.*)    # rest of text
                        $
                      }x


  ## Walks all outline nodes and collects the history records.
  ##   Returns an array of records (see class comment for the record formats).
  def parse
    recs = []

    last_country = nil
    last_season  = nil
    last_keyword = nil
    last_teams   = []    ## teams collected so far for a (multi-line) MERGE

    OutlineReader.parse( @txt ).each do |node|
      if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
        heading_level = node[0][1].to_i   ## e.g. :h1 => "1" => 1
        heading       = node[1]

        puts "heading #{heading_level} >#{heading}<"

        if heading_level == 1
          ## assume country in heading; allow all "formats" supported by parse e.g.
          ##   Österreich • Austria (at)
          ##   Österreich • Austria
          ##   Austria
          ##   Deutschland (de) • Germany
          country = catalog.countries.parse( heading )
          ## check country code - MUST exist for now!!!!
          if country.nil?
            puts "!!! error [club history reader] - unknown country >#{heading}< - sorry - add country to config to fix"
            exit 1
          end
          puts " country >#{heading}< => #{country.name}, #{country.key}"
          last_country = country
          last_season  = nil    ## reset "lower levels" - season & keyword
          last_keyword = nil
        elsif heading_level == 2
          ## assume season
          season = Season.parse( heading )
          puts " season >#{heading}< => #{season.key}"
          last_season  = season   ## reset "lower levels" - keyword
          last_keyword = nil
        else
          puts "!!! ERROR [club history reader] - for now only heading 1 & 2 supported; sorry"
          exit 1
        end

      elsif node[0] == :p    ## paragraph with (text) lines
        if last_country.nil?
          puts "!!! ERROR [club history reader] - country heading 1 required, sorry"
          exit 1
        end
        if last_season.nil?
          puts "!!! ERROR [club history reader] - season heading 2 required, sorry"
          exit 1
        end

        lines = node[1]
        lines.each do |line|
          if m=line.match(KEYWORD_LINE_RE)    ## extract keyword and continue
            keyword = m[:keyword]
            line    = m[:text].strip

            puts " keyword #{keyword}"
            last_keyword = normalize_keyword( keyword )

            ## note: a new keyword starts a new record; drops any MERGE teams
            ##   collected so far (that did NOT get closed with a ⇒ line)
            last_teams = []
          end

          if last_keyword.nil?
            puts "!!! ERROR [club history reader] - line with keyword expected - got:"
            puts line
            exit 1
          end

          if last_keyword == 'BANKRUPT'
            ## requires / expects one team in one line
            recs << [ last_keyword, last_season.key,
                       [ squish(line), last_country.key ]
                    ]
          elsif last_keyword == 'RENAME' ||
                last_keyword == 'REFORM' ||
                last_keyword == 'MOVE'
            ## requires / expects two teams in one line (separated by ⇒ or such)
            teams = line.split( '⇒' )
            if teams.size != 2
              puts "!!! ERROR [club history reader] - expected two teams - got:"
              pp teams
              exit 1
            end
            teams = teams.map {|team| squish(team.strip) }  ## remove whitespaces
            recs << [ last_keyword, last_season.key,
                       [ teams[0], last_country.key ],
                       [ teams[1], last_country.key ]
                    ]
          elsif last_keyword == 'MERGE'
            ## check if line starts with separator
            ##   otherwise collect to be merged teams
            if line.start_with?( '⇒' )
              if last_teams.size < 2
                puts "!!! ERROR [club history reader] - expected two or more teams for MERGE - got:"
                pp last_teams
                exit 1
              end
              ## auto-add country to all teams
              teams = last_teams.map {|team| [team, last_country.key]}
              recs << [ last_keyword, last_season.key,
                         teams,
                         [ squish(line.sub('⇒','').strip), last_country.key ]
                      ]

              last_teams = []
            else
              last_teams << squish(line)
            end
          else
            puts "!!! ERROR [club history reader] - unknown keyword >#{last_keyword}<; cannot process; sorry"
            exit 1
          end
        end  # each line (in paragraph)
      else
        puts "** !!! ERROR [club history reader] - unknown line type:"
        pp node
        exit 1
      end
    end

    recs
  end  # method parse


  ###############
  ## helper

  ## collapses all runs of whitespace (spaces and tabs) to one space
  ##   e.g. "FC  Wacker \t Innsbruck" => "FC Wacker Innsbruck"
  def squish( str )
    str.gsub( /[ \t]+/,' ' )
  end


  private

  ## "normalizes" keyword variants to the base form
  ##   e.g. RENAMED => RENAME, MERGED => MERGE, etc.
  def normalize_keyword( keyword )
    case keyword
    when 'BANKRUPT', 'BANKRUPTED' then 'BANKRUPT'
    when 'RENAME',   'RENAMED'    then 'RENAME'
    when 'REFORM',   'REFORMED'   then 'REFORM'
    when 'MOVE',     'MOVED'      then 'MOVE'
    when 'MERGE',    'MERGED'     then 'MERGE'
    else
      puts "!!! ERROR [club history reader] - unexpected keyword >#{keyword}<; sorry - don't know how to normalize"
      exit 1
    end
  end

end # class ClubHistoryReader


end ## module Import
end ## module SportDb
|