datafile 0.2.5 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +0 -0
- data/Manifest.txt +23 -30
- data/Rakefile +2 -3
- data/data/football.txt +20 -11
- data/lib/datafile.rb +29 -19
- data/lib/datafile/builder.rb +1 -24
- data/lib/datafile/datafile.rb +25 -144
- data/lib/datafile/{datasets/dataset.rb → dataset.rb} +17 -25
- data/lib/datafile/version.rb +7 -4
- data/lib/datafile/workers/file/dataset.rb +20 -57
- data/lib/datafile/workers/file/registry.rb +18 -18
- data/lib/datafile/workers/file/worker.rb +4 -24
- data/lib/datafile/workers/zip/dataset.rb +19 -8
- data/lib/datafile/workers/zip/worker.rb +6 -22
- data/test/test_builder.rb +9 -9
- data/test/test_file_dataset_registry.rb +2 -3
- data/test/test_file_worker.rb +4 -5
- data/test/test_football_dataset.rb +9 -9
- metadata +10 -37
- data/.gemtest +0 -0
- data/lib/datafile/builder2.rb +0 -90
- data/lib/datafile/workers/dataset.rb +0 -40
- data/lib/datafile/workers/zip/beer.rb +0 -18
- data/lib/datafile/workers/zip/football.rb +0 -18
- data/lib/datafile/workers/zip/world.rb +0 -18
- data/test/datafile2/at.rb +0 -51
- data/test/test_builder2.rb +0 -36
@@ -6,7 +6,7 @@
|
|
6
6
|
|
7
7
|
def read_known_datasets( path )
|
8
8
|
ary = []
|
9
|
-
lines = File.
|
9
|
+
lines = File.open( path, 'r:utf-8' ).read
|
10
10
|
lines.each_line do |line|
|
11
11
|
## skip blank and comment lines
|
12
12
|
next if /^\s*#/ =~ line || /^\s*$/ =~ line
|
@@ -27,13 +27,12 @@ class Dataset
|
|
27
27
|
@opts = opts
|
28
28
|
end
|
29
29
|
|
30
|
-
attr_reader :name
|
31
|
-
|
30
|
+
attr_reader :name, :opts
|
31
|
+
|
32
|
+
## convenience helpers for known opts
|
33
|
+
def setup() @opts[:setup]; end ## note: return nil if not found/set
|
34
|
+
def format() @opts[:format] || 'txt'; end ## note: assume default is txt (other formats incl. csv) for now - why? why not?
|
32
35
|
|
33
|
-
def setup
|
34
|
-
value = @opts[:setup] || 'all'
|
35
|
-
"setups/#{value}"
|
36
|
-
end
|
37
36
|
|
38
37
|
def file? # note: use file? (not exist? might use zip? later to check if zip exists? -why? why not?)
|
39
38
|
## hack/convenience shortcut:
|
@@ -49,7 +48,7 @@ class Dataset
|
|
49
48
|
basename = parts[1]
|
50
49
|
## e.g.
|
51
50
|
## ./ (working folder) => at-austria
|
52
|
-
## openfootball/at-austria
|
51
|
+
## openfootball/at-austria
|
53
52
|
if File.basename( Dir.getwd ) == basename
|
54
53
|
puts " bingo!! working folder >#{basename}< matches dataset"
|
55
54
|
true ## return true
|
@@ -75,23 +74,18 @@ class WorldDataset < Dataset
|
|
75
74
|
|
76
75
|
super( name, opts ) ## todo/check: just use super (e.g. pass along all params - why? why not?)
|
77
76
|
end
|
78
|
-
|
79
|
-
def zip_worker() WorldZipDataset.new( self ); end ## check: change (rename) just use zip or use worker_zip?? - why, why not?
|
80
|
-
def file_worker() WorldFileDataset.new( self ); end
|
81
77
|
end # class WorldDataset
|
82
78
|
|
83
79
|
|
84
80
|
|
85
81
|
class FootballDataset < Dataset
|
86
82
|
|
87
|
-
|
83
|
+
def self.build_known_datasets
|
84
|
+
read_known_datasets( "#{::Datafile.data_path}/football.txt" )
|
85
|
+
end
|
88
86
|
|
89
87
|
def self.known_datasets
|
90
|
-
|
91
|
-
### todo/fix - use ||= idiom - why, why not??
|
92
|
-
if @@known_football_datasets.nil?
|
93
|
-
@@known_football_datasets = read_known_datasets( "#{::Datafile.data_path}/football.txt" )
|
94
|
-
end
|
88
|
+
@@known_football_datasets ||= build_known_datasets
|
95
89
|
@@known_football_datasets
|
96
90
|
end
|
97
91
|
|
@@ -101,7 +95,11 @@ class FootballDataset < Dataset
|
|
101
95
|
## check if name include slash (e.g. /)
|
102
96
|
## - if not auto-add openfootball/ (default)
|
103
97
|
if name_easy.index( '/' ).nil?
|
104
|
-
|
98
|
+
if opts[:format] == 'csv'
|
99
|
+
name = "footballcsv/#{name_easy}"
|
100
|
+
else
|
101
|
+
name = "openfootball/#{name_easy}"
|
102
|
+
end
|
105
103
|
else
|
106
104
|
name = name_easy ## just pass through for now
|
107
105
|
end
|
@@ -109,14 +107,11 @@ class FootballDataset < Dataset
|
|
109
107
|
super( name, opts )
|
110
108
|
|
111
109
|
### check for known datasets; warn: if not known (might be a typo)
|
112
|
-
unless
|
110
|
+
unless self.class.known_datasets.include?( name )
|
113
111
|
## todo: use logger - why, why not??
|
114
112
|
puts "*** warn: unknown football dataset '#{name}', typo ???"
|
115
113
|
end
|
116
114
|
end
|
117
|
-
|
118
|
-
def zip_worker() FootballZipDataset.new( self ); end
|
119
|
-
def file_worker() FootballFileDataset.new( self ); end
|
120
115
|
end # class FootballDataset
|
121
116
|
|
122
117
|
|
@@ -133,9 +128,6 @@ class BeerDataset < Dataset
|
|
133
128
|
|
134
129
|
super( name, opts )
|
135
130
|
end
|
136
|
-
|
137
|
-
def zip_worker() BeerZipDataset.new( self ); end
|
138
|
-
def file_worker() BeerFileDataset.new( self ); end
|
139
131
|
end # class BeerDataset
|
140
132
|
|
141
133
|
|
data/lib/datafile/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
module Datafile
|
4
4
|
|
5
5
|
MAJOR = 0 ## todo: namespace inside version or something - why? why not??
|
6
|
-
MINOR =
|
7
|
-
PATCH =
|
6
|
+
MINOR = 3
|
7
|
+
PATCH = 0
|
8
8
|
VERSION = [MAJOR,MINOR,PATCH].join('.')
|
9
9
|
|
10
10
|
def self.version
|
@@ -17,7 +17,10 @@ module Datafile
|
|
17
17
|
|
18
18
|
def self.root
|
19
19
|
"#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
|
20
|
-
end
|
20
|
+
end
|
21
21
|
|
22
|
-
|
22
|
+
def self.data_path
|
23
|
+
"#{root}/data"
|
24
|
+
end
|
23
25
|
|
26
|
+
end # module Datafile
|
@@ -2,28 +2,26 @@
|
|
2
2
|
|
3
3
|
module Datafile
|
4
4
|
|
5
|
-
class FileDataset
|
5
|
+
class FileDataset
|
6
6
|
## read dataset from file(system)
|
7
7
|
|
8
|
-
|
8
|
+
include LogUtils::Logging
|
9
|
+
|
9
10
|
|
10
11
|
def self.registry
|
11
|
-
|
12
|
-
if @@registry.nil?
|
13
|
-
@@registry = FileDatasetRegistry.new
|
14
|
-
end
|
12
|
+
@@registry ||= FileDatasetRegistry.new
|
15
13
|
@@registry
|
16
14
|
end
|
17
15
|
|
18
16
|
def initialize( dataset )
|
19
|
-
|
17
|
+
@dataset = dataset
|
20
18
|
end
|
21
19
|
|
22
|
-
def repo_dir ### check: use (rename to) include dir (or local_repo_dir) - why, why not ???
|
20
|
+
def repo_dir ### check: use (rename to) include dir (or local_repo_dir) - why, why not ???
|
23
21
|
## note: for easy testing allow "in situ" datasets
|
24
22
|
## e.g. ./ (e.g. mu-mauritius) is openfootball/mu-mauritius
|
25
23
|
## split name in org/user + project (e.g. openfootball/at-austria)
|
26
|
-
parts = name.split( '/' )
|
24
|
+
parts = @dataset.name.split( '/' )
|
27
25
|
|
28
26
|
basename = parts[1]
|
29
27
|
if File.basename( Dir.getwd ) == basename
|
@@ -31,63 +29,28 @@ class FileDataset < DatasetNode
|
|
31
29
|
return Dir.getwd ## assume working directory/folder is repo dir
|
32
30
|
end
|
33
31
|
|
34
|
-
registry.lookup( name )
|
32
|
+
registry.lookup( @dataset.name )
|
35
33
|
end
|
36
34
|
|
37
35
|
def dump
|
38
36
|
## for debugging dump dataset -- todo (also check if folder exists ??)
|
39
|
-
puts "dataset '#{name}' opts=#{opts.
|
37
|
+
puts "dataset '#{@dataset.name}' opts=#{@dataset.opts.inspect}" ## use opts.inspect instead of to_json - why? why not?
|
40
38
|
puts " repo-dir '#{repo_dir}'"
|
41
39
|
end
|
42
40
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
end # class FileDataset
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
class FootballFileDataset < FileDataset
|
52
|
-
|
53
|
-
def initialize( dataset )
|
54
|
-
super( dataset )
|
55
|
-
end
|
56
|
-
|
57
|
-
def read()
|
58
|
-
logger.info( "read football-dataset (file) '#{name}', '#{setup}'" )
|
59
|
-
|
60
|
-
SportDb.read_setup( setup, repo_dir )
|
61
|
-
end
|
62
|
-
end # class FootballFileDataset
|
63
|
-
|
41
|
+
def read
|
42
|
+
if @dataset.is_a?( FootballDataset )
|
43
|
+
logger.info( "read football dataset (file) '#{@dataset.name}', '#{@dataset.setup}'" )
|
64
44
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
def read()
|
72
|
-
logger.info( "read world-dataset (file) '#{name}', '#{setup}'" )
|
73
|
-
|
74
|
-
## WorldDb.read_setup( 'setups/countries', WORLD_DB_INCLUDE_PATH, skip_tags: true )
|
75
|
-
WorldDb.read_setup( setup, repo_dir, skip_tags: true )
|
76
|
-
end
|
77
|
-
end # class WorldFileDataset
|
78
|
-
|
79
|
-
class BeerFileDataset < FileDataset
|
80
|
-
|
81
|
-
def initialize( dataset )
|
82
|
-
super( dataset )
|
83
|
-
end
|
84
|
-
|
85
|
-
def read()
|
86
|
-
logger.info( "read beer-dataset (file) '#{name}', '#{setup}'" )
|
87
|
-
|
88
|
-
BeerDb.read_setup( setup, repo_dir )
|
45
|
+
pack = SportDb::DirPackage.new( repo_dir )
|
46
|
+
pack.read( season: @dataset.setup ) ## note: pass on (optional) setup arg as season (filter) arg for now
|
47
|
+
else
|
48
|
+
logger.info( "TODO/FIX: read dataset (file) '#{@dataset.name}', '#{@dataset.setup}'; sorry" )
|
49
|
+
end
|
89
50
|
end
|
90
|
-
end # class BeerFileDataset
|
91
51
|
|
92
52
|
|
53
|
+
private
|
54
|
+
def registry() self.class.registry; end ## convenience method to access "static" shared class variable
|
55
|
+
end # class FileDataset
|
93
56
|
end # module Datafile
|
@@ -15,22 +15,30 @@ class FileDatasetRegistry
|
|
15
15
|
@roots[:openmundi] = '../../openmundi' ## OPENMUNDI_ROOT = "../../openmundi"
|
16
16
|
@roots[:openfootball] = '..' ## OPENFOOTBALL_ROOT = ".."
|
17
17
|
@roots[:openbeer] = '..'
|
18
|
+
|
19
|
+
@roots[:footballcsv] = '..'
|
18
20
|
end
|
19
21
|
|
20
22
|
def merge( hash )
|
21
23
|
## todo: add support for merging project mappings too
|
22
24
|
## use merge_roots and merge_projects ?? why, why not??
|
23
|
-
|
24
25
|
@roots = @roots.merge( hash )
|
25
26
|
end
|
26
27
|
|
27
|
-
def lookup( name )
|
28
|
-
|
28
|
+
def lookup( name )
|
29
|
+
path, _ = lookup_path( name ) ## note: ignore error message passed along in return
|
30
|
+
path
|
31
|
+
end
|
32
|
+
|
33
|
+
def lookup!( name )
|
34
|
+
path, error = lookup_path( name )
|
35
|
+
raise error if error
|
36
|
+
path
|
37
|
+
end
|
29
38
|
|
30
|
-
private
|
31
|
-
def lookup_worker( name, fail_on_error )
|
32
|
-
### fix: use lookup! version for exception and lookup (w/ returning nil) - why, why not??
|
33
39
|
|
40
|
+
private
|
41
|
+
def lookup_path( name )
|
34
42
|
## split name in org/user + project (e.g. openfootball/at-austria)
|
35
43
|
parts = name.split( '/' )
|
36
44
|
## check/todo: assert parts == 2 -- why, why not??
|
@@ -38,11 +46,7 @@ private
|
|
38
46
|
if root.nil?
|
39
47
|
msg = "no mapping found for '#{parts[0]}' in '#{name}'"
|
40
48
|
logger.error( msg )
|
41
|
-
|
42
|
-
raise DatasetNotFoundError.new( msg ) ## throw exception FileNotFound / DatasetNotFound ??
|
43
|
-
else
|
44
|
-
return nil
|
45
|
-
end
|
49
|
+
return [nil, DatasetNotFoundError.new( msg )] ## throw exception FileNotFound / DatasetNotFound ??
|
46
50
|
end
|
47
51
|
|
48
52
|
path = "#{root}/#{parts[1]}"
|
@@ -50,16 +54,12 @@ private
|
|
50
54
|
unless File.exist?( path )
|
51
55
|
msg = "no file found for '#{name}'; expected '#{path}'"
|
52
56
|
logger.error( msg )
|
53
|
-
|
54
|
-
raise DatasetNotFoundError.new( msg ) ## throw exception FileNotFound / DatasetNotFound ??
|
55
|
-
else
|
56
|
-
return nil
|
57
|
-
end
|
57
|
+
return [nil, DatasetNotFoundError.new( msg )] ## throw exception FileNotFound / DatasetNotFound ??
|
58
58
|
end
|
59
|
+
|
59
60
|
### check for File.directory?( path ) too - why, why not???
|
60
|
-
path
|
61
|
+
[path, nil] ## use go-style returns with error as second argument (as error as value)
|
61
62
|
end
|
62
|
-
|
63
63
|
end # class FileDatasetRegistry
|
64
64
|
|
65
65
|
end # module Datafile
|
@@ -15,38 +15,18 @@ class FileWorker ## check: rename to FileDatafileWorker?? or FileDatafile -wh
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def read
|
18
|
-
## note: also run inlines (setup script) before
|
19
|
-
@datafile.inlines.each do |inline|
|
20
|
-
inline.call
|
21
|
-
end
|
22
|
-
|
23
18
|
@datafile.datasets.each do |dataset|
|
24
|
-
dataset
|
25
|
-
|
26
|
-
end
|
27
|
-
|
28
|
-
def calc
|
29
|
-
@datafile.scripts.each do |script|
|
30
|
-
script.call
|
19
|
+
f = FileDataset.new( dataset )
|
20
|
+
f.read
|
31
21
|
end
|
32
22
|
end
|
33
23
|
|
34
24
|
def dump
|
35
|
-
## also dump inlines
|
36
|
-
@datafile.inlines.each do |inline|
|
37
|
-
inline.dump
|
38
|
-
end
|
39
|
-
|
40
25
|
@datafile.datasets.each do |dataset|
|
41
|
-
dataset
|
42
|
-
|
43
|
-
|
44
|
-
## also dump scripts
|
45
|
-
@datafile.scripts.each do |script|
|
46
|
-
script.dump
|
26
|
+
f = FileDataset.new( dataset )
|
27
|
+
f.dump
|
47
28
|
end
|
48
29
|
end
|
49
30
|
|
50
31
|
end # class FileWorker
|
51
|
-
|
52
32
|
end # module Datafile
|
@@ -3,23 +3,23 @@
|
|
3
3
|
module Datafile
|
4
4
|
|
5
5
|
|
6
|
-
class ZipDataset
|
6
|
+
class ZipDataset ### use (rename to) ZipDatasetWorker/Helper/Wrapper/Fetcher/Downloader - why, why not ???
|
7
7
|
## read dataset from zip(archive)
|
8
8
|
|
9
|
+
include LogUtils::Logging
|
10
|
+
|
9
11
|
def initialize( dataset )
|
10
|
-
|
12
|
+
@dataset = dataset
|
11
13
|
end
|
12
14
|
|
13
15
|
def remote_zip_url # remote zip url
|
14
|
-
|
15
|
-
## "https://github.com/#{@name}/archive/master.zip"
|
16
|
-
"http://github.com/#{name}/archive/master.zip"
|
16
|
+
"https://github.com/#{@dataset.name}/archive/master.zip"
|
17
17
|
end
|
18
18
|
|
19
19
|
def local_zip_name
|
20
20
|
### note: replace / in name w/ --I--
|
21
21
|
## e.g. flatten the filename, that is, do NOT include any folders
|
22
|
-
name.gsub('/', '--I--') # note: will NOT include/return .zip extension
|
22
|
+
@dataset.name.gsub('/', '--I--') # note: will NOT include/return .zip extension
|
23
23
|
end
|
24
24
|
|
25
25
|
def local_zip_root
|
@@ -32,7 +32,7 @@ class ZipDataset < DatasetNode ### use(rename to) ZipDatasetWorker - why, why n
|
|
32
32
|
|
33
33
|
|
34
34
|
def download
|
35
|
-
logger.info( "download dataset '#{name}'" )
|
35
|
+
logger.info( "download dataset '#{@dataset.name}'" )
|
36
36
|
logger.info( " from '#{remote_zip_url}'" )
|
37
37
|
logger.info( " to '#{local_zip_path}'..." )
|
38
38
|
|
@@ -42,7 +42,7 @@ class ZipDataset < DatasetNode ### use(rename to) ZipDatasetWorker - why, why n
|
|
42
42
|
|
43
43
|
def dump
|
44
44
|
## for debugging dump dataset (also check if zip exists)
|
45
|
-
puts "dataset '#{name}' opts=#{opts.to_json}" ## use opts.inspect instead of to_json - why? why not?
|
45
|
+
puts "dataset '#{@dataset.name}' opts=#{@dataset.opts.to_json}" ## use opts.inspect instead of to_json - why? why not?
|
46
46
|
puts " local '#{local_zip_name}' (#{local_zip_path})"
|
47
47
|
if File.exist?( local_zip_path )
|
48
48
|
puts " size: #{File.size(local_zip_path)} bytes"
|
@@ -52,6 +52,17 @@ class ZipDataset < DatasetNode ### use(rename to) ZipDatasetWorker - why, why n
|
|
52
52
|
puts " remote '#{remote_zip_url}'"
|
53
53
|
end
|
54
54
|
|
55
|
+
def read
|
56
|
+
if @dataset.is_a?( FootballDataset )
|
57
|
+
logger.info( "read football dataset (zip) '#{@dataset.name}', '#{@dataset.setup}'" )
|
58
|
+
|
59
|
+
pack = SportDb::ZipPackage.new( local_zip_path )
|
60
|
+
pack.read( season: @dataset.setup ) ## note: pass on (optional) setup arg as season (filter) arg for now
|
61
|
+
else
|
62
|
+
logger.info( "TODO/FIX: read dataset (zip) '#{@dataset.name}', '#{@dataset.setup}'; sorry" )
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
55
66
|
|
56
67
|
private
|
57
68
|
####
|
@@ -12,38 +12,22 @@ class ZipWorker ## check: rename to ZipDatafileWorker?? or ZipDatafile -why,
|
|
12
12
|
|
13
13
|
def download
|
14
14
|
@datafile.datasets.each do |dataset|
|
15
|
-
dataset
|
15
|
+
z = ZipDataset.new( dataset )
|
16
|
+
z.download
|
16
17
|
end
|
17
18
|
end
|
18
19
|
|
19
20
|
def read
|
20
|
-
## note: also run inlines (setup script) before
|
21
|
-
@datafile.inlines.each do |inline|
|
22
|
-
inline.call
|
23
|
-
end
|
24
|
-
|
25
21
|
@datafile.datasets.each do |dataset|
|
26
|
-
dataset
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
def calc
|
31
|
-
@datafile.scripts.each do |script|
|
32
|
-
script.call
|
22
|
+
z = ZipDataset.new( dataset )
|
23
|
+
z.read
|
33
24
|
end
|
34
25
|
end
|
35
26
|
|
36
27
|
def dump
|
37
|
-
## also dump inlines
|
38
|
-
@datafile.inlines.each do |inline|
|
39
|
-
inline.dump
|
40
|
-
end
|
41
28
|
@datafile.datasets.each do |dataset|
|
42
|
-
dataset
|
43
|
-
|
44
|
-
## also dump scripts
|
45
|
-
@datafile.scripts.each do |script|
|
46
|
-
script.dump
|
29
|
+
z = ZipDataset.new( dataset )
|
30
|
+
z.dump
|
47
31
|
end
|
48
32
|
end
|
49
33
|
|