sportdb-formats 1.0.4 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +4 -0
- data/lib/sportdb/formats.rb +14 -0
- data/lib/sportdb/formats/country/country_reader.rb +1 -1
- data/lib/sportdb/formats/event/event_index.rb +143 -0
- data/lib/sportdb/formats/event/event_reader.rb +183 -0
- data/lib/sportdb/formats/league/league_outline_reader.rb +24 -7
- data/lib/sportdb/formats/match/mapper.rb +63 -63
- data/lib/sportdb/formats/match/mapper_teams.rb +1 -1
- data/lib/sportdb/formats/match/match_parser.rb +119 -183
- data/lib/sportdb/formats/match/match_parser_csv.rb +23 -6
- data/lib/sportdb/formats/package.rb +27 -1
- data/lib/sportdb/formats/parser_helper.rb +11 -2
- data/lib/sportdb/formats/score/score_parser.rb +6 -0
- data/lib/sportdb/formats/season_utils.rb +0 -11
- data/lib/sportdb/formats/structs/group.rb +5 -12
- data/lib/sportdb/formats/structs/match.rb +5 -1
- data/lib/sportdb/formats/structs/round.rb +6 -13
- data/lib/sportdb/formats/structs/season.rb +114 -45
- data/lib/sportdb/formats/structs/standings.rb +30 -9
- data/lib/sportdb/formats/structs/team.rb +1 -2
- data/lib/sportdb/formats/version.rb +2 -2
- data/test/helper.rb +1 -0
- data/test/test_country_reader.rb +2 -2
- data/test/test_match_auto_relegation.rb +41 -0
- data/test/test_match_start_date.rb +44 -0
- data/test/test_regex.rb +25 -7
- data/test/test_season.rb +68 -19
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 89c3ae173bed0c7a68f30a897e4cf34da8d72d6e
|
4
|
+
data.tar.gz: f3898e8b03177b62075cced2a90be2502555bf63
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c938af88f6dd86f1736532c9d8271a3e56f1769b955a31ce6d84a0b1fcf5a6eec7cfa42e0c019bd7072351d777a1988d6637c4297bee0dad9592bb8ec7450e8
|
7
|
+
data.tar.gz: a80df68f64e13eeb9a85931e2d2dd4381f204f94a4ea9254c89b4b28346db4c1542321aed4fd123aef5ddedc7e4cd2c08731dd340326fae5dc94b57fa6797835
|
data/Manifest.txt
CHANGED
@@ -8,6 +8,8 @@ lib/sportdb/formats/country/country_index.rb
|
|
8
8
|
lib/sportdb/formats/country/country_reader.rb
|
9
9
|
lib/sportdb/formats/datafile.rb
|
10
10
|
lib/sportdb/formats/datafile_package.rb
|
11
|
+
lib/sportdb/formats/event/event_index.rb
|
12
|
+
lib/sportdb/formats/event/event_reader.rb
|
11
13
|
lib/sportdb/formats/goals.rb
|
12
14
|
lib/sportdb/formats/league/league_index.rb
|
13
15
|
lib/sportdb/formats/league/league_outline_reader.rb
|
@@ -63,10 +65,12 @@ test/test_match.rb
|
|
63
65
|
test/test_match_auto.rb
|
64
66
|
test/test_match_auto_champs.rb
|
65
67
|
test/test_match_auto_euro.rb
|
68
|
+
test/test_match_auto_relegation.rb
|
66
69
|
test/test_match_auto_worldcup.rb
|
67
70
|
test/test_match_champs.rb
|
68
71
|
test/test_match_eng.rb
|
69
72
|
test/test_match_euro.rb
|
73
|
+
test/test_match_start_date.rb
|
70
74
|
test/test_match_worldcup.rb
|
71
75
|
test/test_name_helper.rb
|
72
76
|
test/test_outline_reader.rb
|
data/lib/sportdb/formats.rb
CHANGED
@@ -136,6 +136,20 @@ end # module Import
|
|
136
136
|
end # module SportDb
|
137
137
|
|
138
138
|
|
139
|
+
require 'sportdb/formats/event/event_reader'
|
140
|
+
require 'sportdb/formats/event/event_index'
|
141
|
+
|
142
|
+
## add convenience helper
|
143
|
+
module SportDb
  module Import
    # Class-level shortcuts on EventInfo that forward to EventInfoReader,
    # so callers can write EventInfo.read( path ) / EventInfo.parse( txt ).
    class EventInfo
      class << self
        # Forwards to EventInfoReader.read and returns its result.
        def read( path )
          EventInfoReader.read( path )
        end

        # Forwards to EventInfoReader.parse and returns its result.
        def parse( txt )
          EventInfoReader.parse( txt )
        end
      end
    end # class EventInfo
  end # module Import
end # module SportDb
|
151
|
+
|
152
|
+
|
139
153
|
|
140
154
|
|
141
155
|
|
@@ -57,7 +57,7 @@ def parse
|
|
57
57
|
|
58
58
|
last_country = country = Country.new( name: "#{name} (-#{year})",
|
59
59
|
code: code )
|
60
|
-
country.alt_names << name ## note: do NOT
|
60
|
+
## country.alt_names << name ## note: for now do NOT add name without year to alt_names - gets auto-add by index!!!
|
61
61
|
|
62
62
|
countries << country
|
63
63
|
## todo/fix: add reference to country today (in parts[1] !!!!)
|
@@ -0,0 +1,143 @@
|
|
1
|
+
module SportDb
|
2
|
+
module Import
|
3
|
+
|
4
|
+
|
5
|
+
class EventIndex
  ## Lookup table for event info records, addressable by league key
  ## and season key; also keeps all records in insertion order.

  attr_reader :events

  ## Builds an index from all season datafiles that Package.find_seasons
  ## returns for path; prints the list of datafiles found.
  def self.build( path )
    datafiles = Package.find_seasons( path )

    puts
    puts "#{datafiles.size} seasons datafile(s):"
    pp datafiles

    datafiles.each_with_object( new ) do |datafile, index|
      index.add( EventInfoReader.read( datafile ) )
    end
  end

  def initialize
    @events  = []    ## "linear" list of all records
    @leagues = {}    ## league key => { season key => record }
  end

  ## Adds records; a later record for the same league and season key
  ## replaces the earlier one in the lookup table (but not in events).
  def add( recs )
    @events = @events + recs

    recs.each do |rec|
      by_season = (@leagues[ rec.league.key ] ||= {})
      by_season[ rec.season.key ] = rec
    end
  end

  ## Returns the record for league and season, or nil if none is known.
  ## Both arguments may be a key string or an object responding to key.
  def find_by( league:, season: )
    league_key = league.is_a?( String ) ? league : league.key
    season_key = season.is_a?( String ) ? season : season.key

    by_season = @leagues[ league_key ]
    by_season && by_season[ season_key ]
  end # method find_by
end ## class EventIndex
|
57
|
+
|
58
|
+
|
59
|
+
|
60
|
+
class SeasonIndex
  ## Maps a (league, date) pair to a season key, using a per-league
  ## lookup table of event records bucketed by calendar year.

  ## Accepts exactly one EventIndex and (auto-)adds all of its events;
  ## anything else raises ArgumentError.
  def initialize( *args )
    @leagues = {}   ## league key => { year => [event records] }

    if args.size == 1 && args[0].is_a?( EventIndex )
      add( args[0].events )
    else
      pp args
      raise ArgumentError.new( 'unsupported arguments' )
    end
  end

  ## Registers records under their season's start year and, for seasons
  ## where season.year? is false, under the end year too.
  def add( recs )
    recs.each do |rec|
      season = rec.season
      years  = @leagues[ rec.league.key ] ||= {}

      (years[season.start_year] ||= []) << rec
      (years[season.end_year]   ||= []) << rec   unless season.year?
    end
  end # method add

  ## Returns the season key for date in league.
  ##   - date may be a 'YYYY-MM-DD' string or a date object
  ##   - league may be a key string or an object responding to key
  ## Returns nil if the league, or the date's year for that league, is unknown.
  ## If no event of that year includes the date, a warning is printed and the
  ## first event that ended before the date gets the "overflow"; if there is
  ## no such event the process exits with status 1.
  def find_by( date:, league: )
    date       = Date.strptime( date, '%Y-%m-%d' )   if date.is_a?( String )
    league_key = league.is_a?( String ) ? league : league.key

    years = @leagues[ league_key ]
    return nil   unless years     ## no league defined / found

    events = years[ date.year ]
    return nil   unless events    ## no year defined / found for league

    event = events.find { |candidate| candidate.include?( date ) }
    return event.season.key   if event

    puts "!! WARN: date >#{date}< out-of-seasons for year #{date.year} in league #{league_key}:"
    events.each do |candidate|
      puts "   #{candidate.season.key} | #{candidate.start_date} - #{candidate.end_date}"
    end

    ## retry and pick a season with "overflow" at the end (date is after end_date)
    ##  note: events without an end_date are skipped - comparing a date
    ##        with nil would raise instead of reaching the error path below
    event = events.find { |candidate| candidate.end_date && date > candidate.end_date }

    if event.nil?
      puts "!! ERROR: CANNOT auto-fix / (auto-)append date at the end of an event; check season setup - sorry"
      exit 1
    end

    diff_in_days = date.to_date.jd - event.end_date.to_date.jd
    puts "    +#{diff_in_days} days - adding overflow to #{event.season.key} ending on #{event.end_date} ++ #{date}"
    event.season.key
  end # method find_by

end # class SeasonIndex
|
140
|
+
|
141
|
+
|
142
|
+
end # module Import
|
143
|
+
end # module SportDb
|
@@ -0,0 +1,183 @@
|
|
1
|
+
|
2
|
+
module SportDb
|
3
|
+
module Import
|
4
|
+
|
5
|
+
|
6
|
+
class EventInfo
  ##  "high level" info (summary) about event  (like a "wikipedia infobox")
  ##    use for checking dataset imports; lets you check e.g.
  ##    - dates within range
  ##    - number of teams e.g. 20
  ##    - matches played e.g. 380
  ##    - goals scored e.g. 937
  ##    etc.

  attr_reader :league,
              :season,
              :teams,
              :matches,
              :goals,
              :start_date,
              :end_date

  def initialize( league:, season:,
                  start_date: nil, end_date: nil,
                  teams:      nil,
                  matches:    nil,
                  goals:      nil )
    @league      = league
    @season      = season

    @start_date  = start_date
    @end_date    = end_date

    @teams       = teams     ## todo/check: rename/use teams_count ??
    @matches     = matches   ## todo/check: rename/use match_count ??
    @goals       = goals
  end

  ## Returns true if date falls into the event's period.
  ##
  ##  - start_date and end_date given:
  ##      the date range, widened by one day on each side
  ##      (allows off by one errors via timezone/local time)
  ##  - only one of the two given:
  ##      the given bound (widened by one day) is honored and the missing
  ##      bound is taken from the generic rule - before, a lone start_date
  ##      was ignored and the generic rule was used for both bounds
  ##  - no dates given - generic rule:
  ##      same year (if season.year?) or July 1 - June 30 (Y+1)
  ##
  ## todo/fix: add options e.g. strict (only true if date range really defined)
  ## todo/fix: issue warning if off by one!!!!
  def include?( date )
    if @start_date || @end_date
      lower = @start_date ? @start_date-1 : generic_start_date
      upper = @end_date   ? @end_date+1   : generic_end_date
      date >= lower && date <= upper
    elsif @season.year?
      ## same year e.g. Jan 1 - Dec 31
      date.year == @season.start_year
    else
      ## - todo/check - start for some countries/leagues in June 1 or August 1 ????
      date >= generic_start_date && date <= generic_end_date
    end
  end # method include?
  alias_method :between?, :include?

  private

  ## generic rule - first day of the season if no start_date is given
  def generic_start_date
    @season.year? ? Date.new( @season.start_year, 1, 1 ) : Date.new( @season.start_year, 7, 1 )
  end

  ## generic rule - last day of the season if no end_date is given
  def generic_end_date
    @season.year? ? Date.new( @season.start_year, 12, 31 ) : Date.new( @season.end_year, 6, 30 )
  end
end # class EventInfo
|
66
|
+
|
67
|
+
|
68
|
+
class EventInfoReader
  ## Reads event info (summary) records from a comma-separated text with the
  ## columns League, Season (or Year), Dates, Clubs (or Teams), Matches, Goals.

  def catalog() Import.catalog; end


  ## Reads the file at path (as utf-8) and returns the parsed records.
  def self.read( path )
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    new( txt ).parse
  end

  ## Returns the parsed records for txt.
  def self.parse( txt )
    new( txt ).parse
  end

  def initialize( txt )
    @txt = txt
  end

  ## Returns an array with one EventInfo record per row.
  ##   note: exits the process (status 1) if a Dates column does not hold
  ##         one or two dates or if the period is longer than 365 days.
  def parse
    recs = []

    parse_csv( @txt ).each do |row|
      season = Import::Season.new( row['Season'] || row['Year'] )
      league = catalog.leagues.find!( row['League'] )

      dates  = parse_dates( row['Dates'], league: league, season: season )

      ## note: remove (and allow) all non-digits e.g. 370 goals, 20 clubs, etc.
      teams   = parse_count( row['Clubs'] || row['Teams'] )
      goals   = parse_count( row['Goals'] )
      matches = parse_matches( row['Matches'] )

      recs << EventInfo.new( league:     league,
                             season:     season,
                             start_date: dates[0],
                             end_date:   dates[1],
                             teams:      teams,
                             matches:    matches,
                             goals:      goals )
    end # each row
    recs
  end  # method parse

  private

  ## Returns [], [start_date] or [start_date, end_date] for the Dates column.
  def parse_dates( dates_col, league:, season: )
    ## no dates - keep dates array empty
    return []   if dates_col.nil? || dates_col.empty?

    dates_col = dates_col.gsub( /[ ]{2,}/, ' ' )  ## squish/fold spaces

    puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"

    ## note: dates with only start_date (WITHOUT end_date) are allowed too
    parts = dates_col.split( /[ ]*[–-][ ]*/ )
    unless parts.size == 1 || parts.size == 2
      puts "!! ERROR - expected date range / period - one or two dates; got #{parts.size}:"
      pp dates_col
      pp parts
      exit 1
    end

    pp parts
    dates = [DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )]
    if parts.size == 2
      ## note: the end date is resolved relative to the season's end year (if any)
      dates << DateFormats.parse( parts[1], start: Date.new( season.end_year || season.start_year, 1, 1 ), lang: 'en' )
    end
    pp dates

    if dates.size == 2
      ## assert/check if period is less than 365 days for now
      diff = dates[1].to_date.jd - dates[0].to_date.jd
      puts "#{diff}d"
      if diff > 365
        puts "!! ERROR - date range / period assertion failed; expected diff < 365 days"
        exit 1
      end
    end

    dates
  end

  ## Returns the digits of col as an integer or nil if there are none.
  def parse_count( col )
    return nil   if col.nil?

    digits = col.gsub( /[^0-9]/, '' )
    digits.empty? ? nil : digits.to_i
  end

  ## Returns the number of matches or nil; supports additions e.g.
  ##   132 + 63 Play-off-Spiele   =>  195
  def parse_matches( col )
    return nil   if col.nil?

    expr = col.gsub( /[^0-9+]/, '' )
    return nil   if expr.empty?

    ## note: for now only supports additions; a single number is a sum of one
    expr.split( '+' ).reduce( 0 ) { |sum,str| sum + str.to_i }
  end
end # class EventInfoReader
|
179
|
+
|
180
|
+
|
181
|
+
end ## module Import
|
182
|
+
end ## module SportDb
|
183
|
+
|
@@ -121,14 +121,31 @@ class LeagueOutlineReader ## todo/check - rename to LeaguePageReader / LeagueP
|
|
121
121
|
values
|
122
122
|
end
|
123
123
|
|
124
|
-
def check_stage( name )
|
125
|
-
known_stages = ['regular season',
|
126
|
-
'championship round',
|
127
|
-
'relegation round',
|
128
|
-
'play-offs'
|
129
|
-
]
|
130
124
|
|
131
|
-
|
125
|
+
# note: normalize names e.g. downcase and remove all non a-z chars (e.g. space, dash, etc.)
|
126
|
+
## known (league) stage names in normalized form
##   (downcased, all non a-z chars removed);
##   note: frozen - the lookup table is a constant and must not get changed at runtime
KNOWN_STAGES = [
  'Regular Season',
  'Regular Stage',
  'Championship Round',
  'Championship Playoff',
  'Relegation Round',
  'Relegation Playoff',
  'Play-offs',
  'Playoff Stage',
  'Grunddurchgang',
  'Finaldurchgang - Qualifikationsgruppe',
  'Finaldurchgang - Qualifikation',
  'Finaldurchgang - Meistergruppe',
  'Finaldurchgang - Meister',
  'EL Play-off',
  'Europa League Play-off',
  'Europa-League-Play-offs',
].map { |name| name.downcase.gsub( /[^a-z]/, '' ) }.freeze
|
144
|
+
|
145
|
+
|
146
|
+
def check_stage( name )
|
147
|
+
# note: normalize names e.g. downcase and remove all non a-z chars (e.g. space, dash, etc.)
|
148
|
+
if KNOWN_STAGES.include?( name.downcase.gsub( /[^a-z]/, '' ) )
|
132
149
|
## everything ok
|
133
150
|
else
|
134
151
|
puts "** !!! ERROR - no (league) stage match found for >#{name}<, add to (builtin) stages table; sorry"
|
@@ -7,21 +7,21 @@ module SportDb
|
|
7
7
|
## see https://github.com/textkit/textutils/blob/master/textutils/lib/textutils/title_mapper2.rb
|
8
8
|
|
9
9
|
|
10
|
-
class MapperV2 ## todo/check: rename to NameMapper
|
10
|
+
class MapperV2 ## todo/check: rename to NameMapper ? why? why not??
|
11
11
|
|
12
12
|
include Logging
|
13
13
|
|
14
|
-
attr_reader :
|
14
|
+
attr_reader :known_names ## rename to mapping or mappings or just names - why? why not?
|
15
15
|
|
16
16
|
########
|
17
17
|
## key: e.g. augsburg
|
18
|
-
##
|
19
|
-
## length (of
|
20
|
-
MappingStruct = Struct.new( :key, :
|
18
|
+
## name: e.g. FC Augsburg
|
19
|
+
## length (of name(!!) - not regex pattern): e.g. 11 -- do not count dots (e.g. U.S.A. => 3 or 6) why? why not?
|
20
|
+
MappingStruct = Struct.new( :key, :name, :length, :pattern) ## todo/check: use (rename to) NameStruct - why? why not??
|
21
21
|
|
22
22
|
######
|
23
23
|
## convenience helper - (auto)build ActiveRecord-like team records/structs
|
24
|
-
Record = Struct.new( :key, :
|
24
|
+
Record = Struct.new( :key, :name, :alt_names )
|
25
25
|
def build_records( txt_or_lines )
|
26
26
|
recs = []
|
27
27
|
|
@@ -44,12 +44,12 @@ class MapperV2 ## todo/check: rename to NameMapper/TitleMapper ? why? why n
|
|
44
44
|
values = line.split( '|' )
|
45
45
|
values = values.map { |value| value.strip }
|
46
46
|
|
47
|
-
|
47
|
+
name = values[0]
|
48
48
|
## note: quick hack - auto-generate key, that is, remove all non-ascii chars and downcase
|
49
|
-
key
|
50
|
-
|
49
|
+
key = name.downcase.gsub( /[^a-z]/, '' )
|
50
|
+
alt_names = values.size > 1 ? values[1..-1].join( '|' ) : nil
|
51
51
|
|
52
|
-
recs << Record.new( key,
|
52
|
+
recs << Record.new( key, name, alt_names )
|
53
53
|
end
|
54
54
|
recs
|
55
55
|
end
|
@@ -63,10 +63,10 @@ class MapperV2 ## todo/check: rename to NameMapper/TitleMapper ? why? why n
|
|
63
63
|
(records_or_mapping.is_a?( Array ) && records_or_mapping[0].is_a?( String ))
|
64
64
|
|
65
65
|
## build mapping lookup table
|
66
|
-
@
|
67
|
-
|
66
|
+
@known_names = if records_or_mapping.is_a?( Hash ) ## assume "custom" mapping hash table (name=>record)
|
67
|
+
build_name_table_for_mapping( records_or_mapping )
|
68
68
|
else ## assume array of records
|
69
|
-
|
69
|
+
build_name_table_for_records( records_or_mapping )
|
70
70
|
end
|
71
71
|
|
72
72
|
## build lookup hash by record (e.g. team/club/etc.) key
|
@@ -85,9 +85,9 @@ class MapperV2 ## todo/check: rename to NameMapper/TitleMapper ? why? why n
|
|
85
85
|
|
86
86
|
|
87
87
|
|
88
|
-
def
|
88
|
+
def map_names!( line ) ## rename to just map! - why?? why not???
|
89
89
|
begin
|
90
|
-
found =
|
90
|
+
found = map_name_for!( @tag, line, @known_names )
|
91
91
|
end while found
|
92
92
|
end
|
93
93
|
|
@@ -110,27 +110,27 @@ class MapperV2 ## todo/check: rename to NameMapper/TitleMapper ? why? why n
|
|
110
110
|
|
111
111
|
|
112
112
|
private
|
113
|
-
def
|
114
|
-
|
113
|
+
def build_name_table_for_mapping( mapping )
|
114
|
+
known_names = []
|
115
115
|
|
116
|
-
mapping.each do |
|
116
|
+
mapping.each do |name, rec|
|
117
117
|
m = MappingStruct.new
|
118
118
|
m.key = rec.key
|
119
|
-
m.
|
120
|
-
m.length =
|
121
|
-
m.pattern = Regexp.escape(
|
119
|
+
m.name = name
|
120
|
+
m.length = name.length
|
121
|
+
m.pattern = Regexp.escape( name ) ## note: just use "standard" regex escape (e.g. no extras for umlauts,accents,etc.)
|
122
122
|
|
123
|
-
|
123
|
+
known_names << m
|
124
124
|
end
|
125
125
|
|
126
126
|
## note: sort here by length (largest goes first - best match)
|
127
|
-
|
128
|
-
|
127
|
+
known_names = known_names.sort { |l,r| r.length <=> l.length }
|
128
|
+
known_names
|
129
129
|
end
|
130
130
|
|
131
|
-
def
|
131
|
+
def build_name_table_for_records( records )
|
132
132
|
|
133
|
-
## build known tracks table w/
|
133
|
+
## build known tracks table w/ alt names e.g.
|
134
134
|
#
|
135
135
|
# [[ 'wolfsbrug', 'VfL Wolfsburg'],
|
136
136
|
# [ 'augsburg', 'FC Augsburg'],
|
@@ -138,65 +138,65 @@ private
|
|
138
138
|
# [ 'augsburg', 'Augi3' ],
|
139
139
|
# [ 'stuttgart', 'VfB Stuttgart']]
|
140
140
|
|
141
|
-
|
141
|
+
known_names = []
|
142
142
|
|
143
143
|
records.each_with_index do |rec,index|
|
144
144
|
|
145
|
-
|
146
|
-
|
145
|
+
name_candidates = []
|
146
|
+
name_candidates << rec.name
|
147
147
|
|
148
|
-
|
148
|
+
name_candidates += rec.alt_names.split('|') if rec.alt_names && !rec.alt_names.empty?
|
149
149
|
|
150
150
|
|
151
|
-
## check if
|
152
|
-
# make
|
151
|
+
## check if name includes subname e.g. Grand Prix Japan (Suzuka Circuit)
|
152
|
+
# make subname optional by adding name w/o subname e.g. Grand Prix Japan
|
153
153
|
|
154
|
-
|
155
|
-
|
156
|
-
|
154
|
+
names = []
|
155
|
+
name_candidates.each do |t|
|
156
|
+
names << t
|
157
157
|
if t =~ /\(.+\)/
|
158
|
-
|
158
|
+
extra_name = t.gsub( /\(.+\)/, '' ) # remove/delete subnames
|
159
159
|
# note: strip leading n trailing withspaces too!
|
160
160
|
# -- todo: add squish or something if () is inline e.g. leaves two spaces?
|
161
|
-
|
162
|
-
|
161
|
+
extra_name.strip!
|
162
|
+
names << extra_name
|
163
163
|
end
|
164
164
|
end
|
165
165
|
|
166
|
-
|
166
|
+
names.each do |name|
|
167
167
|
m = MappingStruct.new
|
168
168
|
m.key = rec.key
|
169
|
-
m.
|
170
|
-
m.length =
|
169
|
+
m.name = name
|
170
|
+
m.length = name.length
|
171
171
|
## note: escape for regex plus allow subs for special chars/accents
|
172
|
-
m.pattern =
|
172
|
+
m.pattern = name_esc_regex( name )
|
173
173
|
|
174
|
-
|
174
|
+
known_names << m
|
175
175
|
end
|
176
176
|
|
177
|
-
logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{
|
177
|
+
logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{names.join('|')}<"
|
178
178
|
|
179
179
|
## note: only include code field - if defined
|
180
180
|
if rec.respond_to?(:code) && rec.code && !rec.code.empty?
|
181
181
|
m = MappingStruct.new
|
182
182
|
m.key = rec.key
|
183
|
-
m.
|
183
|
+
m.name = rec.code
|
184
184
|
m.length = rec.code.length
|
185
185
|
m.pattern = rec.code ## note: use code for now as is (no variants allowed fow now)
|
186
186
|
|
187
|
-
|
187
|
+
known_names << m
|
188
188
|
end
|
189
189
|
end
|
190
190
|
|
191
191
|
## note: sort here by length (largest goes first - best match)
|
192
192
|
# exclude code and key (key should always go last)
|
193
|
-
|
194
|
-
|
193
|
+
known_names = known_names.sort { |l,r| r.length <=> l.length }
|
194
|
+
known_names
|
195
195
|
end
|
196
196
|
|
197
197
|
|
198
198
|
|
199
|
-
def
|
199
|
+
def map_name_for!( tag, line, mappings )
|
200
200
|
mappings.each do |mapping|
|
201
201
|
key = mapping.key
|
202
202
|
pattern = mapping.pattern
|
@@ -234,9 +234,9 @@ private
|
|
234
234
|
|
235
235
|
|
236
236
|
####
|
237
|
-
#
|
237
|
+
# name helper cut-n-paste copy from TextUtils
|
238
238
|
## see https://github.com/textkit/textutils/blob/master/textutils/lib/textutils/helper/title_helper.rb
|
239
|
-
def
|
239
|
+
def name_esc_regex( name_unescaped )
|
240
240
|
|
241
241
|
## escape regex special chars e.g.
|
242
242
|
# . to \. and
|
@@ -257,16 +257,16 @@ def title_esc_regex( title_unescaped )
|
|
257
257
|
# e.g. Club Atlético Colón (Santa Fe)
|
258
258
|
# e.g. Bauer Anton (????)
|
259
259
|
|
260
|
-
##
|
261
|
-
##
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
260
|
+
## note: cannot use Regexp.escape! will escape space '' to '\ '
|
261
|
+
## name = Regexp.escape( name_unescaped )
|
262
|
+
name = name_unescaped.gsub( '.', '\.' )
|
263
|
+
name = name.gsub( '(', '\(' )
|
264
|
+
name = name.gsub( ')', '\)' )
|
265
|
+
name = name.gsub( '?', '\?' )
|
266
|
+
name = name.gsub( '*', '\*' )
|
267
|
+
name = name.gsub( '+', '\+' )
|
268
|
+
name = name.gsub( '$', '\$' )
|
269
|
+
name = name.gsub( '^', '\^' )
|
270
270
|
|
271
271
|
## match accented char with or without accents
|
272
272
|
## add (ü|ue) etc.
|
@@ -309,10 +309,10 @@ def title_esc_regex( title_unescaped )
|
|
309
309
|
## collect some more (real-world) examples first!!!!!
|
310
310
|
|
311
311
|
alternatives.each do |alt|
|
312
|
-
|
312
|
+
name = name.gsub( alt[0], alt[1] )
|
313
313
|
end
|
314
314
|
|
315
|
-
|
315
|
+
name
|
316
316
|
end
|
317
317
|
|
318
318
|
end # class MapperV2
|