sportdb-config 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f264abfc487652f687a3e4d6dc4388594a587896
4
- data.tar.gz: 32f24cc478d3db058ef5aeb943a279019e2d68d2
3
+ metadata.gz: e017d063fd47d049d93b49763c2549bf87412172
4
+ data.tar.gz: 84d75edb2bb440805ae792c07ac732342f7f6828
5
5
  SHA512:
6
- metadata.gz: 4e5a329a6027121d68c81d51dd683befec81eb348ba4ef4606437ca5b77bfd1346ae31e8781a4e5c086c93f5cdd053a417a359e9edefa440fbe580ee0345970d
7
- data.tar.gz: a3b7f2e23f0cd8f9755a8c594ef7c520be6b0d131762dbefff81f5c6acee0dbb15ed80128fa292f72ae704e471af28b16152cd1fd71564cc38c4f0b5efbcdf82
6
+ metadata.gz: c0c746998dd8bf6da6628174dbcd684a60d6813a85968eed939f056f2fc44df5f035a2c1680902886091d33f26143060d1518e48fbb1ba9a2af9661189ffbbbb
7
+ data.tar.gz: 1a1ec84bf35b66b891fd7162529db514cbf2975769e7f62deb203263a5b272a25df83527128119086a8134e349e18c57a4c0c3f39a56a5266ddc1f87da205244
@@ -13,8 +13,9 @@ config/world/countries.txt
13
13
  config/world/de.txt
14
14
  config/world/eng.txt
15
15
  lib/sportdb/config.rb
16
+ lib/sportdb/config/club.rb
17
+ lib/sportdb/config/club_index.rb
16
18
  lib/sportdb/config/club_reader.rb
17
- lib/sportdb/config/clubs.rb
18
19
  lib/sportdb/config/config.rb
19
20
  lib/sportdb/config/countries.rb
20
21
  lib/sportdb/config/league.rb
@@ -23,6 +24,7 @@ lib/sportdb/config/league_utils.rb
23
24
  lib/sportdb/config/season_utils.rb
24
25
  lib/sportdb/config/variants.rb
25
26
  lib/sportdb/config/version.rb
27
+ lib/sportdb/config/wiki_index.rb
26
28
  lib/sportdb/config/wiki_reader.rb
27
29
  test/helper.rb
28
30
  test/test_club_index.rb
@@ -34,4 +36,5 @@ test/test_league_reader.rb
34
36
  test/test_league_utils.rb
35
37
  test/test_season_utils.rb
36
38
  test/test_variants.rb
39
+ test/test_wiki_index.rb
37
40
  test/test_wiki_reader.rb
@@ -24,9 +24,11 @@ require 'sportdb/config/league_reader'
24
24
 
25
25
  require 'sportdb/config/variants'
26
26
  require 'sportdb/config/countries'
27
- require 'sportdb/config/clubs'
27
+ require 'sportdb/config/club'
28
28
  require 'sportdb/config/club_reader'
29
+ require 'sportdb/config/club_index'
29
30
  require 'sportdb/config/wiki_reader'
31
+ require 'sportdb/config/wiki_index'
30
32
  require 'sportdb/config/config'
31
33
 
32
34
 
@@ -0,0 +1,130 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+ ##
7
+ # note: use our own (internal) club struct for now - why? why not?
8
+ # - check that shape/structure/fields/attributes match
9
+ # the Team struct in sportdb-text (in SportDb::Struct::Team) !!!!
10
+ class Club
11
+ ## todo: use just names for alt_names - why? why not?
12
+ attr_accessor :name, :alt_names,
13
+ :year, :ground, :city
14
+
15
+ ## more attribs - todo/fix - also add "upstream" to struct & model!!!!!
16
+ attr_accessor :district, :geos, :year_end, :country
17
+
18
+ ## special import only attribs
19
+ attr_accessor :alt_names_auto ## auto-generated alt names
20
+ attr_accessor :wikipedia # wikipedia page name (for english (en))
21
+
22
+ def historic?() @year_end ? true : false; end
23
+ alias_method :past?, :historic?
24
+
25
+
26
+ def wikipedia?() @wikipedia; end
27
+ def wikipedia_url
28
+ if @wikipedia
29
+ ## note: replace spaces with underscore (_)
30
+ ## e.g. Club Brugge KV => Club_Brugge_KV
31
+ ## todo/check/fix:
32
+ ## check if "plain" dash (-) needs to get replaced with typographic dash??
33
+ "https://en.wikipedia.org/wiki/#{@wikipedia.gsub(' ','_')}"
34
+ else
35
+ nil
36
+ end
37
+ end
38
+
39
+
40
+ def initialize
41
+ @alt_names = []
42
+ @alt_names_auto = []
43
+ end
44
+
45
+
46
+ ## helper methods for import only
47
+ ## check for duplicates
48
+ def duplicates?
49
+ names = [name] + alt_names + alt_names_auto
50
+ names = names.map { |name| normalize( name ) }
51
+
52
+ names.size != names.uniq.size
53
+ end
54
+
55
+ def duplicates
56
+ names = [name] + alt_names + alt_names_auto
57
+
58
+ ## calculate (count) frequency and select if greater than one
59
+ names.reduce( Hash.new ) do |h,name|
60
+ norm = normalize( name )
61
+ h[norm] ||= []
62
+ h[norm] << name; h
63
+ end.select { |norm,names| names.size > 1 }
64
+ end
65
+
66
+ def add_variants( name_or_names )
67
+ names = name_or_names.is_a?(Array) ? name_or_names : [name_or_names]
68
+ names.each do |name|
69
+ name = sanitize( name )
70
+ self.alt_names_auto += variants( name )
71
+ end
72
+ end
73
+
74
+ ###################################
75
+ # "global" helper - move to ___ ? why? why not?
76
+
77
+ YEAR_REGEX = /\([0-9,\- ]+?\)/
78
+ def self.strip_year( name )
79
+ ## check for year(s) e.g. (1887-1911), (-2013),
80
+ ## (1946-2001, 2013-) etc.
81
+ name.gsub( YEAR_REGEX, '' ).strip
82
+ end
83
+
84
+ def self.has_year?( name ) name =~ YEAR_REGEX; end
85
+
86
+ LANG_REGEX = /\[[a-z]{2}\]/
87
+ def self.strip_lang( name )
88
+ name.gsub( LANG_REGEX, '' ).strip
89
+ end
90
+
91
+ def self.has_lang?( name ) name =~ LANG_REGEX; end
92
+
93
+ NORM_REGEX = /[.'º\-\/]/
94
+ ## note: remove all dots (.), dash (-), ', º, /, etc.
95
+ ## for norm(alizing) names
96
+ def self.strip_norm( name )
97
+ name.gsub( NORM_REGEX, '' )
98
+ end
99
+
100
+ def strip_year( name ) self.class.strip_year( name ); end
101
+ def strip_lang( name ) self.class.strip_lang( name ); end
102
+ def strip_norm( name ) self.class.strip_norm( name ); end
103
+
104
+ private
105
+ def sanitize( name )
106
+ ## check for year(s) e.g. (1887-1911), (-2013),
107
+ ## (1946-2001,2013-) etc.
108
+ name = strip_year( name )
109
+ ## check lang codes e.g. [en], [fr], etc.
110
+ name = strip_lang( name )
111
+ name
112
+ end
113
+
114
+ def normalize( name )
115
+ name = sanitize( name )
116
+
117
+ ## remove all dots (.), dash (-), º, /, etc.
118
+ name = strip_norm( name )
119
+ name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
120
+
121
+ ## todo/fix: use our own downcase - why? why not?
122
+ name = name.downcase ## do NOT care about upper and lowercase for now
123
+ name
124
+ end
125
+
126
+ def variants( name ) Variant.find( name ); end
127
+ end # class Club
128
+
129
+ end # module Import
130
+ end # module SportDb
@@ -3,134 +3,34 @@
3
3
  module SportDb
4
4
  module Import
5
5
 
6
- ##
7
- # note: use our own (internal) club struct for now - why? why not?
8
- # - check that shape/structure/fields/attributes match
9
- # the Team struct in sportdb-text (in SportDb::Struct::Team) !!!!
10
- class Club
11
- ## todo: use just names for alt_names - why? why not?
12
- attr_accessor :name, :alt_names,
13
- :year, :ground, :city
14
-
15
- ## more attribs - todo/fix - also add "upstream" to struct & model!!!!!
16
- attr_accessor :district, :geos, :year_end, :country
17
-
18
- ## special import only attribs
19
- attr_accessor :alt_names_auto ## auto-generated alt names
20
- attr_accessor :wikipedia # wikipedia page name (for english (en))
21
-
22
- def historic?() @year_end ? true : false; end
23
- alias_method :past?, :historic?
24
-
25
-
26
- def wikipedia?() @wikipedia; end
27
- def wikipedia_url
28
- if @wikipedia
29
- ## note: replace spaces with underscore (-)
30
- ## e.g. Club Brugge KV => Club_Brugge_KV
31
- ## todo/check/fix:
32
- ## check if "plain" dash (-) needs to get replaced with typographic dash??
33
- "https://en.wikipedia.org/wiki/#{@wikipedia.gsub(' ','_')}"
34
- else
35
- nil
36
- end
37
- end
38
-
39
-
40
- def initialize
41
- @alt_names = []
42
- @alt_names_auto = []
43
- end
44
-
45
-
46
- ## helper methods for import only
47
- ## check for duplicates
48
- def duplicates?
49
- names = [name] + alt_names + alt_names_auto
50
- names = names.map { |name| normalize( name ) }
51
-
52
- names.size != names.uniq.size
53
- end
54
-
55
- def duplicates
56
- names = [name] + alt_names + alt_names_auto
57
6
 
58
- ## calculate (count) frequency and select if greater than one
59
- names.reduce( Hash.new ) do |h,name|
60
- norm = normalize( name )
61
- h[norm] ||= []
62
- h[norm] << name; h
63
- end.select { |norm,names| names.size > 1 }
64
- end
7
+ class ClubIndex
65
8
 
66
- def add_variants( name_or_names )
67
- names = name_or_names.is_a?(Array) ? name_or_names : [name_or_names]
68
- names.each do |name|
69
- name = sanitize( name )
70
- self.alt_names_auto += variants( name )
9
+ def self.build( path )
10
+ recs = []
11
+ datafiles = Configuration.find_datafiles_clubs( path )
12
+ datafiles.each do |datafile|
13
+ recs += ClubReader.read( datafile )
71
14
  end
72
- end
73
-
74
- ###################################
75
- # "global" helper - move to ___ ? why? why not?
76
-
77
- YEAR_REGEX = /\([0-9,\- ]+?\)/
78
- def self.strip_year( name )
79
- ## check for year(s) e.g. (1887-1911), (-2013),
80
- ## (1946-2001, 2013-) etc.
81
- name.gsub( YEAR_REGEX, '' ).strip
82
- end
83
-
84
- def self.has_year?( name ) name =~ YEAR_REGEX; end
85
-
86
- LANG_REGEX = /\[[a-z]{2}\]/
87
- def self.strip_lang( name )
88
- name.gsub( LANG_REGEX, '' ).strip
89
- end
15
+ recs
90
16
 
91
- def self.has_lang?( name ) name =~ LANG_REGEX; end
17
+ clubs = self.new
18
+ clubs.add( recs )
92
19
 
93
- NORM_REGEX = /[.'º\-\/]/
94
- ## note: remove all dots (.), dash (-), ', º, /, etc.
95
- ## for norm(alizing) names
96
- def self.strip_norm( name )
97
- name.gsub( NORM_REGEX, '' )
98
- end
99
-
100
- def strip_year( name ) self.class.strip_year( name ); end
101
- def strip_lang( name ) self.class.strip_lang( name ); end
102
- def strip_norm( name ) self.class.strip_norm( name ); end
103
-
104
- private
105
- def sanitize( name )
106
- ## check for year(s) e.g. (1887-1911), (-2013),
107
- ## (1946-2001,2013-) etc.
108
- name = strip_year( name )
109
- ## check lang codes e.g. [en], [fr], etc.
110
- name = strip_lang( name )
111
- name
112
- end
113
-
114
- def normalize( name )
115
- name = sanitize( name )
116
-
117
- ## remove all dots (.), dash (-), º, /, etc.
118
- name = strip_norm( name )
119
- name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
20
+ ## add wiki(pedia) anchored links
21
+ recs = []
22
+ datafiles = Configuration.find_datafiles_clubs_wiki( path )
23
+ datafiles.each do |datafile|
24
+ recs += WikiReader.read( datafile )
25
+ end
120
26
 
121
- ## todo/fix: use our own downcase - why? why not?
122
- name = name.downcase ## do NOT care about upper and lowercase for now
123
- name
27
+ pp recs
28
+ clubs.add_wiki( recs )
29
+ clubs
124
30
  end
125
31
 
126
- def variants( name ) Variant.find( name ); end
127
- end # class Club
128
32
 
129
33
 
130
-
131
-
132
- class ClubIndex
133
-
134
34
  def initialize
135
35
  @clubs = {} ## clubs (indexed) by canonical name
136
36
  @clubs_by_name = {}
@@ -155,13 +55,24 @@ class ClubIndex
155
55
  recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
156
56
 
157
57
  recs.each do |rec|
158
- m = match_by( name: rec.name, country: rec.country )
58
+ ## note: strip qualifier () from wikipedia page name if present
59
+ ## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
60
+ ## Willem II (football club) => Willem II
61
+ ##
62
+ ## e.g. do NOT strip others !! e.g.
63
+ ## América Futebol Clube (MG)
64
+ ## only add more "special" cases on demand (that is, if we find more)
65
+ name = rec.name
66
+ name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
67
+ name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
68
+
69
+ m = match_by( name: name, country: rec.country )
159
70
  if m.nil?
160
- puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{rec.name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
71
+ puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
161
72
  exit 1
162
73
  end
163
74
  if m.size > 1
164
- puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{rec.name}, #{rec.country.name} (#{rec.country.key})<"
75
+ puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<"
165
76
  pp m
166
77
  exit 1
167
78
  end
@@ -47,7 +47,7 @@ class Configuration
47
47
  }x
48
48
 
49
49
 
50
- def find_clubs_datafiles( path, pattern )
50
+ def self.find_datafiles( path, pattern )
51
51
  datafiles = [] ## note: [country, path] pairs for now
52
52
 
53
53
  ## check all txt files as candidates (MUST include country code for now)
@@ -60,38 +60,17 @@ class Configuration
60
60
  pp datafiles
61
61
  datafiles
62
62
  end
63
+ def self.find_datafiles_clubs( path ) find_datafiles( path, CLUBS_REGEX ); end
64
+ def self.find_datafiles_clubs_wiki( path ) find_datafiles( path, CLUBS_WIKI_REGEX ); end
63
65
 
64
66
 
65
67
  def build_club_index
66
68
  ## unify team names; team (builtin/known/shared) name mappings
67
69
  ## cleanup team names - use local ("native") name with umlaut etc.
68
- recs = []
69
-
70
- ## todo/fix: pass along / use country code too
71
- ## note: country code no longer needed in path (is now expected as heading inside the file)
72
-
73
70
  ## todo/fix: add to teamreader
74
71
  ## check that name and alt_names for a club are all unique (not duplicates)
75
- datafiles = find_clubs_datafiles( clubs_dir, CLUBS_REGEX )
76
- datafiles.each do |datafile|
77
- recs += ClubReader.read( datafile )
78
- end
79
-
80
-
81
- clubs = ClubIndex.new
82
- clubs.add( recs )
83
-
84
- ## add wiki(pedia) anchored links
85
- recs = []
86
- datafiles = find_clubs_datafiles( clubs_dir, CLUBS_WIKI_REGEX )
87
- datafiles.each do |datafile|
88
- recs += WikiReader.read( datafile )
89
- end
90
-
91
- pp recs
92
- clubs.add_wiki( recs )
93
-
94
72
 
73
+ clubs = ClubIndex.build( clubs_dir )
95
74
  if clubs.errors?
96
75
  puts ""
97
76
  puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
@@ -122,8 +101,6 @@ end # class Configuration
122
101
 
123
102
 
124
103
 
125
-
126
-
127
104
  ## lets you use
128
105
  ## SportDb::Import.configure do |config|
129
106
  ## config.hello = 'World'
@@ -8,7 +8,7 @@ module Boot ## note: use a different module than Config to avoid confusion
8
8
 
9
9
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
10
10
  MINOR = 5
11
- PATCH = 1
11
+ PATCH = 2
12
12
  VERSION = [MAJOR,MINOR,PATCH].join('.')
13
13
 
14
14
  def self.version
@@ -0,0 +1,70 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class WikiIndex
8
+
9
+ def self.build( path )
10
+ recs = []
11
+ datafiles = Configuration.find_datafiles_clubs_wiki( path )
12
+ datafiles.each do |datafile|
13
+ recs += WikiReader.read( datafile )
14
+ end
15
+ recs
16
+
17
+ self.new( recs )
18
+ end
19
+
20
+ def initialize( recs )
21
+ @pages_by_country = {}
22
+
23
+ ## todo/fix:
24
+ ## check for duplicate recs - report and exit on duplicate!!!!!!
25
+ recs.each do |rec|
26
+ h = @pages_by_country[ rec.country.key ] ||= {}
27
+ h[ normalize(rec.name) ] = rec
28
+ end
29
+ end
30
+
31
+
32
+ def normalize( name )
33
+ ## todo/fix: (re)use normalize from Club!!!!
34
+ name = name.gsub( /[\-\.]/, '' )
35
+ name = name.gsub( ' ', '' ) ## remove spaces too
36
+ name = name.downcase
37
+ name
38
+ end
39
+
40
+
41
+
42
+ def find_by( club: ) ## todo/check: use find_by_club - why? why not?
43
+ find_by_club( club )
44
+ end
45
+
46
+ def find_by_club( club )
47
+ rec = nil
48
+
49
+ ## get query params from club
50
+ names = [club.name]+club.alt_names
51
+ country_key = club.country.key
52
+
53
+ h = @pages_by_country[ country_key ]
54
+ if h
55
+ ## todo/check: sort names ?
56
+ ## sort by longest first (for best match)
57
+ names.each do |name|
58
+ ## todo/fix: name - remove/strip year and lang e.g. (1946-2001), [en]!!!!
59
+ rec = h[ normalize( name ) ]
60
+ break if rec ## bingo!! found - break on first match
61
+ end
62
+ end
63
+
64
+ rec ## note: return nil if nothing found
65
+ end
66
+ end # class WikiIndex
67
+
68
+
69
+ end # module Import
70
+ end # module SportDb
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_wiki_index.rb
6
+
7
+
8
+ require 'helper'
9
+
10
+ class TestWikiIndex < MiniTest::Test
11
+
12
+ def test_clubs
13
+ wiki = SportDb::Import::WikiIndex.build( SportDb::Import.config.clubs_dir )
14
+ ## pp wiki
15
+
16
+ ##############################################
17
+ ## test wikipedia names and links/urls
18
+ be = SportDb::Import.config.countries[ 'be' ]
19
+
20
+ club = SportDb::Import::Club.new
21
+ club.name = 'Club Brugge KV'
22
+ club.country = be
23
+
24
+ rec = wiki.find_by( club: club )
25
+ assert_equal 'Club Brugge KV', rec.name
26
+
27
+
28
+ club = SportDb::Import::Club.new
29
+ club.name = 'RSC Anderlecht'
30
+ club.country = be
31
+
32
+ rec = wiki.find_by( club: club )
33
+ assert_equal 'R.S.C. Anderlecht', rec.name
34
+ end
35
+
36
+ end # class TestWikiIndex
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sportdb-config
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-03 00:00:00.000000000 Z
11
+ date: 2019-08-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csvreader
@@ -86,8 +86,9 @@ files:
86
86
  - config/world/de.txt
87
87
  - config/world/eng.txt
88
88
  - lib/sportdb/config.rb
89
+ - lib/sportdb/config/club.rb
90
+ - lib/sportdb/config/club_index.rb
89
91
  - lib/sportdb/config/club_reader.rb
90
- - lib/sportdb/config/clubs.rb
91
92
  - lib/sportdb/config/config.rb
92
93
  - lib/sportdb/config/countries.rb
93
94
  - lib/sportdb/config/league.rb
@@ -96,6 +97,7 @@ files:
96
97
  - lib/sportdb/config/season_utils.rb
97
98
  - lib/sportdb/config/variants.rb
98
99
  - lib/sportdb/config/version.rb
100
+ - lib/sportdb/config/wiki_index.rb
99
101
  - lib/sportdb/config/wiki_reader.rb
100
102
  - test/helper.rb
101
103
  - test/test_club_index.rb
@@ -107,6 +109,7 @@ files:
107
109
  - test/test_league_utils.rb
108
110
  - test/test_season_utils.rb
109
111
  - test/test_variants.rb
112
+ - test/test_wiki_index.rb
110
113
  - test/test_wiki_reader.rb
111
114
  homepage: https://github.com/sportdb/sport.db
112
115
  licenses: