sportdb-config 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f264abfc487652f687a3e4d6dc4388594a587896
4
- data.tar.gz: 32f24cc478d3db058ef5aeb943a279019e2d68d2
3
+ metadata.gz: e017d063fd47d049d93b49763c2549bf87412172
4
+ data.tar.gz: 84d75edb2bb440805ae792c07ac732342f7f6828
5
5
  SHA512:
6
- metadata.gz: 4e5a329a6027121d68c81d51dd683befec81eb348ba4ef4606437ca5b77bfd1346ae31e8781a4e5c086c93f5cdd053a417a359e9edefa440fbe580ee0345970d
7
- data.tar.gz: a3b7f2e23f0cd8f9755a8c594ef7c520be6b0d131762dbefff81f5c6acee0dbb15ed80128fa292f72ae704e471af28b16152cd1fd71564cc38c4f0b5efbcdf82
6
+ metadata.gz: c0c746998dd8bf6da6628174dbcd684a60d6813a85968eed939f056f2fc44df5f035a2c1680902886091d33f26143060d1518e48fbb1ba9a2af9661189ffbbbb
7
+ data.tar.gz: 1a1ec84bf35b66b891fd7162529db514cbf2975769e7f62deb203263a5b272a25df83527128119086a8134e349e18c57a4c0c3f39a56a5266ddc1f87da205244
@@ -13,8 +13,9 @@ config/world/countries.txt
13
13
  config/world/de.txt
14
14
  config/world/eng.txt
15
15
  lib/sportdb/config.rb
16
+ lib/sportdb/config/club.rb
17
+ lib/sportdb/config/club_index.rb
16
18
  lib/sportdb/config/club_reader.rb
17
- lib/sportdb/config/clubs.rb
18
19
  lib/sportdb/config/config.rb
19
20
  lib/sportdb/config/countries.rb
20
21
  lib/sportdb/config/league.rb
@@ -23,6 +24,7 @@ lib/sportdb/config/league_utils.rb
23
24
  lib/sportdb/config/season_utils.rb
24
25
  lib/sportdb/config/variants.rb
25
26
  lib/sportdb/config/version.rb
27
+ lib/sportdb/config/wiki_index.rb
26
28
  lib/sportdb/config/wiki_reader.rb
27
29
  test/helper.rb
28
30
  test/test_club_index.rb
@@ -34,4 +36,5 @@ test/test_league_reader.rb
34
36
  test/test_league_utils.rb
35
37
  test/test_season_utils.rb
36
38
  test/test_variants.rb
39
+ test/test_wiki_index.rb
37
40
  test/test_wiki_reader.rb
@@ -24,9 +24,11 @@ require 'sportdb/config/league_reader'
24
24
 
25
25
  require 'sportdb/config/variants'
26
26
  require 'sportdb/config/countries'
27
- require 'sportdb/config/clubs'
27
+ require 'sportdb/config/club'
28
28
  require 'sportdb/config/club_reader'
29
+ require 'sportdb/config/club_index'
29
30
  require 'sportdb/config/wiki_reader'
31
+ require 'sportdb/config/wiki_index'
30
32
  require 'sportdb/config/config'
31
33
 
32
34
 
@@ -0,0 +1,130 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+ ##
7
+ # note: use our own (internal) club struct for now - why? why not?
8
+ # - check that shape/structure/fields/attributes match
9
+ # the Team struct in sportdb-text (in SportDb::Struct::Team) !!!!
10
+ class Club
11
+ ## todo: use just names for alt_names - why? why not?
12
+ attr_accessor :name, :alt_names,
13
+ :year, :ground, :city
14
+
15
+ ## more attribs - todo/fix - also add "upstream" to struct & model!!!!!
16
+ attr_accessor :district, :geos, :year_end, :country
17
+
18
+ ## special import only attribs
19
+ attr_accessor :alt_names_auto ## auto-generated alt names
20
+ attr_accessor :wikipedia # wikipedia page name (for english (en))
21
+
22
+ def historic?() @year_end ? true : false; end
23
+ alias_method :past?, :historic?
24
+
25
+
26
+ def wikipedia?() @wikipedia; end
27
+ def wikipedia_url
28
+ if @wikipedia
29
+ ## note: replace spaces with underscore (_)
30
+ ## e.g. Club Brugge KV => Club_Brugge_KV
31
+ ## todo/check/fix:
32
+ ## check if "plain" dash (-) needs to get replaced with typographic dash??
33
+ "https://en.wikipedia.org/wiki/#{@wikipedia.gsub(' ','_')}"
34
+ else
35
+ nil
36
+ end
37
+ end
38
+
39
+
40
+ def initialize
41
+ @alt_names = []
42
+ @alt_names_auto = []
43
+ end
44
+
45
+
46
+ ## helper methods for import only
47
+ ## check for duplicates
48
+ def duplicates?
49
+ names = [name] + alt_names + alt_names_auto
50
+ names = names.map { |name| normalize( name ) }
51
+
52
+ names.size != names.uniq.size
53
+ end
54
+
55
+ def duplicates
56
+ names = [name] + alt_names + alt_names_auto
57
+
58
+ ## calculate (count) frequency and select if greater than one
59
+ names.reduce( Hash.new ) do |h,name|
60
+ norm = normalize( name )
61
+ h[norm] ||= []
62
+ h[norm] << name; h
63
+ end.select { |norm,names| names.size > 1 }
64
+ end
65
+
66
+ def add_variants( name_or_names )
67
+ names = name_or_names.is_a?(Array) ? name_or_names : [name_or_names]
68
+ names.each do |name|
69
+ name = sanitize( name )
70
+ self.alt_names_auto += variants( name )
71
+ end
72
+ end
73
+
74
+ ###################################
75
+ # "global" helper - move to ___ ? why? why not?
76
+
77
+ YEAR_REGEX = /\([0-9,\- ]+?\)/
78
+ def self.strip_year( name )
79
+ ## check for year(s) e.g. (1887-1911), (-2013),
80
+ ## (1946-2001, 2013-) etc.
81
+ name.gsub( YEAR_REGEX, '' ).strip
82
+ end
83
+
84
+ def self.has_year?( name ) name =~ YEAR_REGEX; end
85
+
86
+ LANG_REGEX = /\[[a-z]{2}\]/
87
+ def self.strip_lang( name )
88
+ name.gsub( LANG_REGEX, '' ).strip
89
+ end
90
+
91
+ def self.has_lang?( name ) name =~ LANG_REGEX; end
92
+
93
+ NORM_REGEX = /[.'º\-\/]/
94
+ ## note: remove all dots (.), dash (-), ', º, /, etc.
95
+ ## for norm(alizing) names
96
+ def self.strip_norm( name )
97
+ name.gsub( NORM_REGEX, '' )
98
+ end
99
+
100
+ def strip_year( name ) self.class.strip_year( name ); end
101
+ def strip_lang( name ) self.class.strip_lang( name ); end
102
+ def strip_norm( name ) self.class.strip_norm( name ); end
103
+
104
+ private
105
+ def sanitize( name )
106
+ ## check for year(s) e.g. (1887-1911), (-2013),
107
+ ## (1946-2001,2013-) etc.
108
+ name = strip_year( name )
109
+ ## check lang codes e.g. [en], [fr], etc.
110
+ name = strip_lang( name )
111
+ name
112
+ end
113
+
114
+ def normalize( name )
115
+ name = sanitize( name )
116
+
117
+ ## remove all dots (.), dash (-), º, /, etc.
118
+ name = strip_norm( name )
119
+ name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
120
+
121
+ ## todo/fix: use our own downcase - why? why not?
122
+ name = name.downcase ## do NOT care about upper and lowercase for now
123
+ name
124
+ end
125
+
126
+ def variants( name ) Variant.find( name ); end
127
+ end # class Club
128
+
129
+ end # module Import
130
+ end # module SportDb
@@ -3,134 +3,34 @@
3
3
  module SportDb
4
4
  module Import
5
5
 
6
- ##
7
- # note: use our own (internal) club struct for now - why? why not?
8
- # - check that shape/structure/fields/attributes match
9
- # the Team struct in sportdb-text (in SportDb::Struct::Team) !!!!
10
- class Club
11
- ## todo: use just names for alt_names - why? why not?
12
- attr_accessor :name, :alt_names,
13
- :year, :ground, :city
14
-
15
- ## more attribs - todo/fix - also add "upstream" to struct & model!!!!!
16
- attr_accessor :district, :geos, :year_end, :country
17
-
18
- ## special import only attribs
19
- attr_accessor :alt_names_auto ## auto-generated alt names
20
- attr_accessor :wikipedia # wikipedia page name (for english (en))
21
-
22
- def historic?() @year_end ? true : false; end
23
- alias_method :past?, :historic?
24
-
25
-
26
- def wikipedia?() @wikipedia; end
27
- def wikipedia_url
28
- if @wikipedia
29
- ## note: replace spaces with underscore (-)
30
- ## e.g. Club Brugge KV => Club_Brugge_KV
31
- ## todo/check/fix:
32
- ## check if "plain" dash (-) needs to get replaced with typographic dash??
33
- "https://en.wikipedia.org/wiki/#{@wikipedia.gsub(' ','_')}"
34
- else
35
- nil
36
- end
37
- end
38
-
39
-
40
- def initialize
41
- @alt_names = []
42
- @alt_names_auto = []
43
- end
44
-
45
-
46
- ## helper methods for import only
47
- ## check for duplicates
48
- def duplicates?
49
- names = [name] + alt_names + alt_names_auto
50
- names = names.map { |name| normalize( name ) }
51
-
52
- names.size != names.uniq.size
53
- end
54
-
55
- def duplicates
56
- names = [name] + alt_names + alt_names_auto
57
6
 
58
- ## calculate (count) frequency and select if greater than one
59
- names.reduce( Hash.new ) do |h,name|
60
- norm = normalize( name )
61
- h[norm] ||= []
62
- h[norm] << name; h
63
- end.select { |norm,names| names.size > 1 }
64
- end
7
+ class ClubIndex
65
8
 
66
- def add_variants( name_or_names )
67
- names = name_or_names.is_a?(Array) ? name_or_names : [name_or_names]
68
- names.each do |name|
69
- name = sanitize( name )
70
- self.alt_names_auto += variants( name )
9
+ def self.build( path )
10
+ recs = []
11
+ datafiles = Configuration.find_datafiles_clubs( path )
12
+ datafiles.each do |datafile|
13
+ recs += ClubReader.read( datafile )
71
14
  end
72
- end
73
-
74
- ###################################
75
- # "global" helper - move to ___ ? why? why not?
76
-
77
- YEAR_REGEX = /\([0-9,\- ]+?\)/
78
- def self.strip_year( name )
79
- ## check for year(s) e.g. (1887-1911), (-2013),
80
- ## (1946-2001, 2013-) etc.
81
- name.gsub( YEAR_REGEX, '' ).strip
82
- end
83
-
84
- def self.has_year?( name ) name =~ YEAR_REGEX; end
85
-
86
- LANG_REGEX = /\[[a-z]{2}\]/
87
- def self.strip_lang( name )
88
- name.gsub( LANG_REGEX, '' ).strip
89
- end
15
+ recs
90
16
 
91
- def self.has_lang?( name ) name =~ LANG_REGEX; end
17
+ clubs = self.new
18
+ clubs.add( recs )
92
19
 
93
- NORM_REGEX = /[.'º\-\/]/
94
- ## note: remove all dots (.), dash (-), ', º, /, etc.
95
- ## for norm(alizing) names
96
- def self.strip_norm( name )
97
- name.gsub( NORM_REGEX, '' )
98
- end
99
-
100
- def strip_year( name ) self.class.strip_year( name ); end
101
- def strip_lang( name ) self.class.strip_lang( name ); end
102
- def strip_norm( name ) self.class.strip_norm( name ); end
103
-
104
- private
105
- def sanitize( name )
106
- ## check for year(s) e.g. (1887-1911), (-2013),
107
- ## (1946-2001,2013-) etc.
108
- name = strip_year( name )
109
- ## check lang codes e.g. [en], [fr], etc.
110
- name = strip_lang( name )
111
- name
112
- end
113
-
114
- def normalize( name )
115
- name = sanitize( name )
116
-
117
- ## remove all dots (.), dash (-), º, /, etc.
118
- name = strip_norm( name )
119
- name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
20
+ ## add wiki(pedia) anchored links
21
+ recs = []
22
+ datafiles = Configuration.find_datafiles_clubs_wiki( path )
23
+ datafiles.each do |datafile|
24
+ recs += WikiReader.read( datafile )
25
+ end
120
26
 
121
- ## todo/fix: use our own downcase - why? why not?
122
- name = name.downcase ## do NOT care about upper and lowercase for now
123
- name
27
+ pp recs
28
+ clubs.add_wiki( recs )
29
+ clubs
124
30
  end
125
31
 
126
- def variants( name ) Variant.find( name ); end
127
- end # class Club
128
32
 
129
33
 
130
-
131
-
132
- class ClubIndex
133
-
134
34
  def initialize
135
35
  @clubs = {} ## clubs (indexed) by canonical name
136
36
  @clubs_by_name = {}
@@ -155,13 +55,24 @@ class ClubIndex
155
55
  recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
156
56
 
157
57
  recs.each do |rec|
158
- m = match_by( name: rec.name, country: rec.country )
58
+ ## note: strip qualifier () from wikipedia page name if present
59
+ ## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
60
+ ## Willem II (football club) => Willem II
61
+ ##
62
+ ## e.g. do NOT strip others !! e.g.
63
+ ## América Futebol Clube (MG)
64
+ ## only add more "special" cases on demand (that is, if we find more)
65
+ name = rec.name
66
+ name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
67
+ name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
68
+
69
+ m = match_by( name: name, country: rec.country )
159
70
  if m.nil?
160
- puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{rec.name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
71
+ puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
161
72
  exit 1
162
73
  end
163
74
  if m.size > 1
164
- puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{rec.name}, #{rec.country.name} (#{rec.country.key})<"
75
+ puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<"
165
76
  pp m
166
77
  exit 1
167
78
  end
@@ -47,7 +47,7 @@ class Configuration
47
47
  }x
48
48
 
49
49
 
50
- def find_clubs_datafiles( path, pattern )
50
+ def self.find_datafiles( path, pattern )
51
51
  datafiles = [] ## note: [country, path] pairs for now
52
52
 
53
53
  ## check all txt files as candidates (MUST include country code for now)
@@ -60,38 +60,17 @@ class Configuration
60
60
  pp datafiles
61
61
  datafiles
62
62
  end
63
+ def self.find_datafiles_clubs( path ) find_datafiles( path, CLUBS_REGEX ); end
64
+ def self.find_datafiles_clubs_wiki( path ) find_datafiles( path, CLUBS_WIKI_REGEX ); end
63
65
 
64
66
 
65
67
  def build_club_index
66
68
  ## unify team names; team (builtin/known/shared) name mappings
67
69
  ## cleanup team names - use local ("native") name with umlaut etc.
68
- recs = []
69
-
70
- ## todo/fix: pass along / use country code too
71
- ## note: country code no longer needed in path (is now expected as heading inside the file)
72
-
73
70
  ## todo/fix: add to teamreader
74
71
  ## check that name and alt_names for a club are all unique (not duplicates)
75
- datafiles = find_clubs_datafiles( clubs_dir, CLUBS_REGEX )
76
- datafiles.each do |datafile|
77
- recs += ClubReader.read( datafile )
78
- end
79
-
80
-
81
- clubs = ClubIndex.new
82
- clubs.add( recs )
83
-
84
- ## add wiki(pedia) anchored links
85
- recs = []
86
- datafiles = find_clubs_datafiles( clubs_dir, CLUBS_WIKI_REGEX )
87
- datafiles.each do |datafile|
88
- recs += WikiReader.read( datafile )
89
- end
90
-
91
- pp recs
92
- clubs.add_wiki( recs )
93
-
94
72
 
73
+ clubs = ClubIndex.build( clubs_dir )
95
74
  if clubs.errors?
96
75
  puts ""
97
76
  puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
@@ -122,8 +101,6 @@ end # class Configuration
122
101
 
123
102
 
124
103
 
125
-
126
-
127
104
  ## lets you use
128
105
  ## SportDb::Import.configure do |config|
129
106
  ## config.hello = 'World'
@@ -8,7 +8,7 @@ module Boot ## note: use a different module than Config to avoid confusion
8
8
 
9
9
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
10
10
  MINOR = 5
11
- PATCH = 1
11
+ PATCH = 2
12
12
  VERSION = [MAJOR,MINOR,PATCH].join('.')
13
13
 
14
14
  def self.version
@@ -0,0 +1,70 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class WikiIndex
8
+
9
+ def self.build( path )
10
+ recs = []
11
+ datafiles = Configuration.find_datafiles_clubs_wiki( path )
12
+ datafiles.each do |datafile|
13
+ recs += WikiReader.read( datafile )
14
+ end
15
+ recs
16
+
17
+ self.new( recs )
18
+ end
19
+
20
+ def initialize( recs )
21
+ @pages_by_country = {}
22
+
23
+ ## todo/fix:
24
+ ## check for duplicate recs - report and exit on duplicate!!!!!!
25
+ recs.each do |rec|
26
+ h = @pages_by_country[ rec.country.key ] ||= {}
27
+ h[ normalize(rec.name) ] = rec
28
+ end
29
+ end
30
+
31
+
32
+ def normalize( name )
33
+ ## todo/fix: (re)use normalize from Club!!!!
34
+ name = name.gsub( /[\-\.]/, '' )
35
+ name = name.gsub( ' ', '' ) ## remove spaces too
36
+ name = name.downcase
37
+ name
38
+ end
39
+
40
+
41
+
42
+ def find_by( club: ) ## todo/check: use find_by_club - why? why not?
43
+ find_by_club( club )
44
+ end
45
+
46
+ def find_by_club( club )
47
+ rec = nil
48
+
49
+ ## get query params from club
50
+ names = [club.name]+club.alt_names
51
+ country_key = club.country.key
52
+
53
+ h = @pages_by_country[ country_key ]
54
+ if h
55
+ ## todo/check: sort names ?
56
+ ## sort by longest first (for best match)
57
+ names.each do |name|
58
+ ## todo/fix: name - remove/strip year and lang e.g. (1946-2001), [en]!!!!
59
+ rec = h[ normalize( name ) ]
60
+ break if rec ## bingo!! found - break on first match
61
+ end
62
+ end
63
+
64
+ rec ## note: return nil if nothing found
65
+ end
66
+ end # class WikiIndex
67
+
68
+
69
+ end # module Import
70
+ end # module SportDb
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_wiki_index.rb
6
+
7
+
8
+ require 'helper'
9
+
10
+ class TestWikiIndex < MiniTest::Test
11
+
12
+ def test_clubs
13
+ wiki = SportDb::Import::WikiIndex.build( SportDb::Import.config.clubs_dir )
14
+ ## pp wiki
15
+
16
+ ##############################################
17
+ ## test wikipedia names and links/urls
18
+ be = SportDb::Import.config.countries[ 'be' ]
19
+
20
+ club = SportDb::Import::Club.new
21
+ club.name = 'Club Brugge KV'
22
+ club.country = be
23
+
24
+ rec = wiki.find_by( club: club )
25
+ assert_equal 'Club Brugge KV', rec.name
26
+
27
+
28
+ club = SportDb::Import::Club.new
29
+ club.name = 'RSC Anderlecht'
30
+ club.country = be
31
+
32
+ rec = wiki.find_by( club: club )
33
+ assert_equal 'R.S.C. Anderlecht', rec.name
34
+ end
35
+
36
+ end # class TestWikiIndex
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-config
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-03 00:00:00.000000000 Z
11
+ date: 2019-08-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csvreader
@@ -86,8 +86,9 @@ files:
86
86
  - config/world/de.txt
87
87
  - config/world/eng.txt
88
88
  - lib/sportdb/config.rb
89
+ - lib/sportdb/config/club.rb
90
+ - lib/sportdb/config/club_index.rb
89
91
  - lib/sportdb/config/club_reader.rb
90
- - lib/sportdb/config/clubs.rb
91
92
  - lib/sportdb/config/config.rb
92
93
  - lib/sportdb/config/countries.rb
93
94
  - lib/sportdb/config/league.rb
@@ -96,6 +97,7 @@ files:
96
97
  - lib/sportdb/config/season_utils.rb
97
98
  - lib/sportdb/config/variants.rb
98
99
  - lib/sportdb/config/version.rb
100
+ - lib/sportdb/config/wiki_index.rb
99
101
  - lib/sportdb/config/wiki_reader.rb
100
102
  - test/helper.rb
101
103
  - test/test_club_index.rb
@@ -107,6 +109,7 @@ files:
107
109
  - test/test_league_utils.rb
108
110
  - test/test_season_utils.rb
109
111
  - test/test_variants.rb
112
+ - test/test_wiki_index.rb
110
113
  - test/test_wiki_reader.rb
111
114
  homepage: https://github.com/sportdb/sport.db
112
115
  licenses: