red-datasets 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +86 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1cfd18b589e4624178d9010ef68a100bb6e2573ccf18a9f96168af786523578
|
4
|
+
data.tar.gz: 67eddd22e10bf78c0b2cf10b18de289368d473d7b5ddf2a557cc2264834e32b0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 111243d3a1d3d758196bb71301ccb0f34beb1f5bec7c5c14b15f7c96fd6bdde924e30d90d3ace9e9258074411c9f7e7b4ef6bd9338dc5c11349534b2392f6f81
|
7
|
+
data.tar.gz: 9a9b426c753bd7e6cc12d452d61b90c2422fcad3b3c353a552c5c05a7c7fd53c3d4ac9cec2e33af1537d9e76e04f1df3d6d9b4baf043528fdde2ab4f9f203e9f
|
data/README.md
CHANGED
@@ -17,15 +17,30 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
17
17
|
|
18
18
|
## Available datasets
|
19
19
|
|
20
|
-
TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
|
21
|
-
|
22
20
|
* Adult Dataset
|
21
|
+
* Aozora Bunko
|
22
|
+
* California Housing
|
23
23
|
* CIFAR-10 Dataset
|
24
24
|
* CIFAR-100 Dataset
|
25
|
+
* CLDR language plural rules
|
26
|
+
* Communities and crime
|
27
|
+
* Diamonds Dataset
|
28
|
+
* E-Stat Japan
|
25
29
|
* Fashion-MNIST
|
30
|
+
* Fuel Economy Dataset
|
31
|
+
* Geolonia Japanese Addresses
|
32
|
+
* Hepatitis
|
26
33
|
* Iris Dataset
|
34
|
+
* Libsvm
|
27
35
|
* MNIST database
|
36
|
+
* Mushroom
|
37
|
+
* Penguins
|
28
38
|
* The Penn Treebank Project
|
39
|
+
* PMJT - Pre-Modern Japanese Text dataset list
|
40
|
+
* Postal Codes in Japan
|
41
|
+
* Rdatasets
|
42
|
+
* Seaborn
|
43
|
+
* Sudachi Synonym Dictionary
|
29
44
|
* Wikipedia
|
30
45
|
* Wine Dataset
|
31
46
|
|
@@ -135,6 +150,12 @@ end
|
|
135
150
|
|
136
151
|
* [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
|
137
152
|
|
153
|
+
## How to develop Red Datasets
|
154
|
+
1. Fork https://github.com/red-data-tools/red-datasets
|
155
|
+
2. Create a feature branch from master
|
156
|
+
3. Develop in the feature branch
|
157
|
+
4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
|
158
|
+
|
138
159
|
## License
|
139
160
|
|
140
161
|
The MIT license. See `LICENSE.txt` for details.
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,91 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.5 - 2022-09-22
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Datasets::PMJTDatasetList`: Added.
|
8
|
+
[GitHub#107][Patch by okadak]
|
9
|
+
|
10
|
+
* `Datasets::AozoraBunko`: Added.
|
11
|
+
[GitHub#108][Patch by Masa]
|
12
|
+
|
13
|
+
* Added how to develop to README
|
14
|
+
[GitHub#117][Patch by abcdefg-1234567]
|
15
|
+
|
16
|
+
* `Datasets::FuelEconomy`: Added.
|
17
|
+
[GitHub#114][Patch by Benson Muite]
|
18
|
+
|
19
|
+
* `Datasets::Geolonia`: Added.
|
20
|
+
[GitHub#118][Patch by abcdefg-1234567]
|
21
|
+
|
22
|
+
* `Datasets::Diamonds`: Added.
|
23
|
+
[GitHub#110][Patch by Benson Muite]
|
24
|
+
|
25
|
+
* `Datasets::ITACorpus`: Added.
|
26
|
+
[GitHub#119][Patch by abcdefg-1234567]
|
27
|
+
|
28
|
+
* `Datasets::KuzushijiMNIST`: Added.
|
29
|
+
[GitHub#125][Patch by abcdefg-1234567]
|
30
|
+
|
31
|
+
* Updated list of datasets in README.
|
32
|
+
[GitHub#129][Patch by Benson Muite]
|
33
|
+
|
34
|
+
* `Datasets::CaliforniaHousing`: Added.
|
35
|
+
[GitHub#123][Patch by Benson Muite]
|
36
|
+
|
37
|
+
* Added support for Ruby 3.1.
|
38
|
+
[GitHub#130][Patch by Benson Muite]
|
39
|
+
|
40
|
+
* `Datasets::AFINN`: Added.
|
41
|
+
[GitHub#120][Patch by Benson Muite]
|
42
|
+
|
43
|
+
* `Datasets::LivedoorNews`: Added.
|
44
|
+
[GitHub#127][Patch by abcdefg-1234567]
|
45
|
+
|
46
|
+
* `Datasets::SeabornDataList`: Added.
|
47
|
+
[GitHub#134][Patch by Hirokazu SUZUKI]
|
48
|
+
|
49
|
+
* `Datasets::WikipediaKyotoJapaneseEnglish`: Added.
|
50
|
+
[GitHub#135][Patch by abcdefg-1234567]
|
51
|
+
|
52
|
+
* Renamed Rdatasets to Rdataset.
|
53
|
+
[GitHub#148][Patch by Hirokazu SUZUKI]
|
54
|
+
|
55
|
+
* Removed support for Ruby 2.6.
|
56
|
+
|
57
|
+
* Added missing license information.
|
58
|
+
|
59
|
+
* `Datasets::QuoraDuplicateQuestionPair`: Added.
|
60
|
+
[GitHub#149][Patch by otegami]
|
61
|
+
|
62
|
+
### Fixes
|
63
|
+
|
64
|
+
* Fixed key from nil to :index in `Datasets::SeabornData`.
|
65
|
+
[GitHub#133][Patch by Hirokazu SUZUKI]
|
66
|
+
|
67
|
+
* Fixed `Datasets::Rdatasets#each` to change "NA" to nil.
|
68
|
+
[GitHub#139][Patch by Hirokazu SUZUKI]
|
69
|
+
|
70
|
+
* Fixed `Datasets::Rdatasets#each` with mixed data of numeric and string.
|
71
|
+
[GitHub#140][Patch by Hirokazu SUZUKI]
|
72
|
+
|
73
|
+
### Thanks
|
74
|
+
|
75
|
+
* okadak
|
76
|
+
|
77
|
+
* Masa
|
78
|
+
|
79
|
+
* Benson Muite
|
80
|
+
|
81
|
+
* abcdefg-1234567
|
82
|
+
|
83
|
+
* Hirokazu SUZUKI
|
84
|
+
|
85
|
+
* Sutou Kouhei
|
86
|
+
|
87
|
+
* otegami
|
88
|
+
|
3
89
|
## 0.1.4 - 2021-07-13
|
4
90
|
|
5
91
|
### Improvements
|
data/lib/datasets/adult.rb
CHANGED
@@ -31,7 +31,8 @@ module Datasets
|
|
31
31
|
@type = type
|
32
32
|
@metadata.id = "adult-#{@type}"
|
33
33
|
@metadata.name = "Adult: #{@type}"
|
34
|
-
@metadata.url = "
|
34
|
+
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
|
35
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
35
36
|
@metadata.description = lambda do
|
36
37
|
read_names
|
37
38
|
end
|
@@ -58,10 +59,8 @@ module Datasets
|
|
58
59
|
ext = "test"
|
59
60
|
end
|
60
61
|
data_path = cache_dir_path + "adult-#{ext}.csv"
|
61
|
-
|
62
|
-
|
63
|
-
download(data_path, data_url)
|
64
|
-
end
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
|
63
|
+
download(data_path, data_url)
|
65
64
|
|
66
65
|
options = {
|
67
66
|
converters: [:numeric, lambda {|f| f.strip}],
|
@@ -74,10 +73,8 @@ module Datasets
|
|
74
73
|
|
75
74
|
def read_names
|
76
75
|
names_path = cache_dir_path + "adult.names"
|
77
|
-
|
78
|
-
|
79
|
-
download(names_path, names_url)
|
80
|
-
end
|
76
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
|
77
|
+
download(names_path, names_url)
|
81
78
|
names_path.read
|
82
79
|
end
|
83
80
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require "csv"
|
2
|
+
require_relative "zip-extractor"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class AFINN < Dataset
|
6
|
+
Record = Struct.new(:word,
|
7
|
+
:valence)
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
super()
|
11
|
+
@metadata.id = "afinn"
|
12
|
+
@metadata.name = "AFINN"
|
13
|
+
@metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
|
14
|
+
@metadata.licenses = ["ODbL-1.0"]
|
15
|
+
@metadata.description = lambda do
|
16
|
+
extract_file("AFINN/AFINN-README.txt") do |input|
|
17
|
+
readme = input.read
|
18
|
+
readme.force_encoding("UTF-8")
|
19
|
+
readme.
|
20
|
+
gsub(/^AFINN-96:.*?\n\n/m, "").
|
21
|
+
gsub(/^In Python.*$/m, "").
|
22
|
+
strip
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def each
|
28
|
+
return to_enum(__method__) unless block_given?
|
29
|
+
|
30
|
+
extract_file("AFINN/AFINN-111.txt") do |input|
|
31
|
+
csv = CSV.new(input, col_sep: "\t", converters: :numeric)
|
32
|
+
csv.each do |row|
|
33
|
+
yield(Record.new(*row))
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
def extract_file(file_path, &block)
|
40
|
+
data_path = cache_dir_path + "imm6010.zip"
|
41
|
+
data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
|
42
|
+
download(data_path, data_url)
|
43
|
+
|
44
|
+
extractor = ZipExtractor.new(data_path)
|
45
|
+
extractor.extract_file(file_path, &block)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
# Dataset for AozoraBunko
|
6
|
+
class AozoraBunko < Dataset
|
7
|
+
Book = Struct.new(
|
8
|
+
# 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
|
9
|
+
:title_id,
|
10
|
+
:title,
|
11
|
+
:title_reading,
|
12
|
+
:title_reading_collation,
|
13
|
+
:subtitle,
|
14
|
+
:subtitle_reading,
|
15
|
+
:original_title,
|
16
|
+
:first_appearance,
|
17
|
+
:ndc_code, # 分類番号(日本十進分類法の番号)
|
18
|
+
:syllabary_spelling_type,
|
19
|
+
:copyrighted,
|
20
|
+
:published_date,
|
21
|
+
:last_updated_date,
|
22
|
+
:detail_url,
|
23
|
+
# 人物ID, 姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
|
24
|
+
:person_id,
|
25
|
+
:person_family_name,
|
26
|
+
:person_first_name,
|
27
|
+
:person_family_name_reading,
|
28
|
+
:person_first_name_reading,
|
29
|
+
:person_family_name_reading_collation,
|
30
|
+
:person_first_name_reading_collation,
|
31
|
+
:person_family_name_romaji,
|
32
|
+
:person_first_name_romaji,
|
33
|
+
:person_type,
|
34
|
+
:person_birthday,
|
35
|
+
:person_date_of_death,
|
36
|
+
:person_copyrighted,
|
37
|
+
# 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
|
38
|
+
:original_book_name1,
|
39
|
+
:original_book_publisher_name1,
|
40
|
+
:original_book_first_published_date1,
|
41
|
+
:used_version_for_registration1,
|
42
|
+
:used_version_for_proofreading1,
|
43
|
+
:base_of_original_book_name1,
|
44
|
+
:base_of_original_book_publisher_name1,
|
45
|
+
:base_of_original_book_first_published_date1,
|
46
|
+
# 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
|
47
|
+
:original_book_name2,
|
48
|
+
:original_book_publisher_name2,
|
49
|
+
:original_book_first_published_date2,
|
50
|
+
:used_version_for_registration2,
|
51
|
+
:used_version_for_proofreading2,
|
52
|
+
:base_of_original_book_name2,
|
53
|
+
:base_of_original_book_publisher_name2,
|
54
|
+
:base_of_original_book_first_published_date2,
|
55
|
+
# 入力者,校正者,
|
56
|
+
:registered_person_name,
|
57
|
+
:proofreader_name,
|
58
|
+
# テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
|
59
|
+
:text_file_url,
|
60
|
+
:last_text_file_updated_date,
|
61
|
+
:text_file_character_encoding,
|
62
|
+
:text_file_character_set,
|
63
|
+
:text_file_updating_count,
|
64
|
+
# XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
|
65
|
+
:html_file_url,
|
66
|
+
:last_html_file_updated_date,
|
67
|
+
:html_file_character_encoding,
|
68
|
+
:html_file_character_set,
|
69
|
+
:html_file_updating_count
|
70
|
+
)
|
71
|
+
|
72
|
+
class Book
|
73
|
+
attr_writer :cache_path
|
74
|
+
|
75
|
+
def initialize(*args)
|
76
|
+
super
|
77
|
+
@text = nil
|
78
|
+
@html = nil
|
79
|
+
@cache_path = nil
|
80
|
+
end
|
81
|
+
|
82
|
+
alias_method :copyrighted?, :copyrighted
|
83
|
+
alias_method :person_copyrighted?, :person_copyrighted
|
84
|
+
|
85
|
+
def text
|
86
|
+
return @text unless @text.nil?
|
87
|
+
return @text if text_file_url.nil? || text_file_url.empty?
|
88
|
+
|
89
|
+
# When the URL is not a zip file, the web page has to be opened in a browser to download the file
|
90
|
+
# e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
|
91
|
+
return @text unless text_file_url.end_with?('.zip')
|
92
|
+
|
93
|
+
downloader = Downloader.new(text_file_url)
|
94
|
+
downloader.download(text_file_output_path)
|
95
|
+
|
96
|
+
@text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
|
97
|
+
input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
|
98
|
+
end
|
99
|
+
|
100
|
+
@text
|
101
|
+
end
|
102
|
+
|
103
|
+
def html
|
104
|
+
return @html unless @html.nil?
|
105
|
+
return @html if html_file_url.nil? || html_file_url.empty?
|
106
|
+
|
107
|
+
downloader = Downloader.new(html_file_url)
|
108
|
+
downloader.download(html_file_output_path)
|
109
|
+
@html = File.read(html_file_output_path).encode(Encoding::UTF_8,
|
110
|
+
normalize_encoding(html_file_character_encoding))
|
111
|
+
|
112
|
+
@html
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def text_file_output_path
|
118
|
+
cache_base_dir + text_file_name
|
119
|
+
end
|
120
|
+
|
121
|
+
def html_file_output_path
|
122
|
+
cache_base_dir + html_file_name
|
123
|
+
end
|
124
|
+
|
125
|
+
def text_file_name
|
126
|
+
text_file_url.split('/').last
|
127
|
+
end
|
128
|
+
|
129
|
+
def html_file_name
|
130
|
+
html_file_url.split('/').last
|
131
|
+
end
|
132
|
+
|
133
|
+
def cache_base_dir
|
134
|
+
@cache_path.base_dir + title_id + person_id
|
135
|
+
end
|
136
|
+
|
137
|
+
def normalize_encoding(encoding)
|
138
|
+
case encoding
|
139
|
+
when 'ShiftJIS'
|
140
|
+
Encoding::Shift_JIS
|
141
|
+
when 'UTF-8'
|
142
|
+
Encoding::UTF_8
|
143
|
+
else
|
144
|
+
encoding
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
def initialize
|
150
|
+
super()
|
151
|
+
|
152
|
+
@metadata.id = 'aozora-bunko'
|
153
|
+
@metadata.name = 'Aozora Bunko'
|
154
|
+
@metadata.url = 'https://www.aozora.gr.jp/'
|
155
|
+
@metadata.licenses = 'CC-BY-2.1-JP'
|
156
|
+
@metadata.description = <<~DESCRIPTION
|
157
|
+
Aozora Bunko is an activity to collect free electronic books that anyone can access
|
158
|
+
on the Internet like a library. The copyrighted works and the works that are said to be
|
159
|
+
"free to read" are available after being digitized in text and XHTML (some HTML) formats.
|
160
|
+
DESCRIPTION
|
161
|
+
end
|
162
|
+
|
163
|
+
def each
|
164
|
+
return to_enum(__method__) unless block_given?
|
165
|
+
|
166
|
+
open_data do |csv_file_stream|
|
167
|
+
text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has Byte Order Mark
|
168
|
+
|
169
|
+
CSV.parse(text, headers: true) do |row|
|
170
|
+
%w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
|
171
|
+
row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
|
172
|
+
end
|
173
|
+
book = Book.new(*row.fields)
|
174
|
+
book.cache_path = cache_path
|
175
|
+
|
176
|
+
yield(book)
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
private
|
182
|
+
|
183
|
+
def open_data(&block)
|
184
|
+
data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
|
185
|
+
data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
|
186
|
+
download(data_path, data_url)
|
187
|
+
ZipExtractor.new(data_path).extract_first_file do |input|
|
188
|
+
block.call(input)
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
def normalize_boolean(column_value)
|
193
|
+
column_value == 'あり'
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datasets
|
2
|
+
class CachePath
|
3
|
+
def initialize(id)
|
4
|
+
@id = id
|
5
|
+
end
|
6
|
+
|
7
|
+
def base_dir
|
8
|
+
Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
|
9
|
+
end
|
10
|
+
|
11
|
+
def remove
|
12
|
+
FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def system_cache_dir
|
18
|
+
case RUBY_PLATFORM
|
19
|
+
when /mswin/, /mingw/
|
20
|
+
ENV['LOCALAPPDATA'] || '~/AppData/Local'
|
21
|
+
when /darwin/
|
22
|
+
'~/Library/Caches'
|
23
|
+
else
|
24
|
+
ENV['XDG_CACHE_HOME'] || '~/.cache'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require "csv"
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class CaliforniaHousing < Dataset
|
6
|
+
Record = Struct.new(:median_house_value,
|
7
|
+
:median_income,
|
8
|
+
:housing_median_age,
|
9
|
+
:total_rooms,
|
10
|
+
:total_bedrooms,
|
11
|
+
:population,
|
12
|
+
:households,
|
13
|
+
:latitude,
|
14
|
+
:longitude)
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
super()
|
18
|
+
@metadata.id = "california-housing"
|
19
|
+
@metadata.name = "California Housing"
|
20
|
+
@metadata.url = "http://lib.stat.cmu.edu/datasets/"
|
21
|
+
@metadata.licenses = ["CCO"]
|
22
|
+
@metadata.description = <<-DESCRIPTION
|
23
|
+
Housing information from the 1990 census used in
|
24
|
+
Pace, R. Kelley and Ronald Barry,
|
25
|
+
"Sparse Spatial Autoregressions",
|
26
|
+
Statistics and Probability Letters, 33 (1997) 291-297.
|
27
|
+
Available from http://lib.stat.cmu.edu/datasets/.
|
28
|
+
DESCRIPTION
|
29
|
+
end
|
30
|
+
|
31
|
+
def each
|
32
|
+
return to_enum(__method__) unless block_given?
|
33
|
+
|
34
|
+
data_path = cache_dir_path + "houses.zip"
|
35
|
+
data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
|
36
|
+
file_name = "cadata.txt"
|
37
|
+
download(data_path, data_url)
|
38
|
+
open_data(data_path, file_name) do |input|
|
39
|
+
data = ""
|
40
|
+
input.each_line do |line|
|
41
|
+
next unless line.start_with?(" ")
|
42
|
+
data << line.lstrip.gsub(/ +/, ",")
|
43
|
+
end
|
44
|
+
options = {
|
45
|
+
converters: [:numeric],
|
46
|
+
}
|
47
|
+
CSV.parse(data, **options) do |row|
|
48
|
+
yield(Record.new(*row))
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
def open_data(data_path, file_name)
|
55
|
+
ZipExtractor.new(data_path).extract_first_file do |input|
|
56
|
+
yield input
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
data/lib/datasets/cifar.rb
CHANGED
@@ -50,10 +50,8 @@ module Datasets
|
|
50
50
|
return to_enum(__method__) unless block_given?
|
51
51
|
|
52
52
|
data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
|
53
|
-
|
54
|
-
|
55
|
-
download(data_path, data_url)
|
56
|
-
end
|
53
|
+
data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
|
54
|
+
download(data_path, data_url)
|
57
55
|
|
58
56
|
parse_data(data_path, &block)
|
59
57
|
end
|
@@ -42,10 +42,8 @@ module Datasets
|
|
42
42
|
private
|
43
43
|
def open_data
|
44
44
|
data_path = cache_dir_path + "plurals.xml"
|
45
|
-
|
46
|
-
|
47
|
-
end
|
48
|
-
::File.open(data_path) do |input|
|
45
|
+
download(data_path, @metadata.url)
|
46
|
+
data_path.open do |input|
|
49
47
|
yield(input)
|
50
48
|
end
|
51
49
|
end
|
data/lib/datasets/communities.rb
CHANGED
@@ -140,6 +140,7 @@ module Datasets
|
|
140
140
|
@metadata.id = "communities"
|
141
141
|
@metadata.name = "Communities"
|
142
142
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
|
143
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
143
144
|
@metadata.description = lambda do
|
144
145
|
read_names
|
145
146
|
end
|
@@ -177,10 +178,8 @@ module Datasets
|
|
177
178
|
|
178
179
|
def open_data
|
179
180
|
data_path = cache_dir_path + "communities.data"
|
180
|
-
|
181
|
-
|
182
|
-
download(data_path, data_url)
|
183
|
-
end
|
181
|
+
data_url = "#{base_url}/communities.data"
|
182
|
+
download(data_path, data_url)
|
184
183
|
CSV.open(data_path) do |csv|
|
185
184
|
yield(csv)
|
186
185
|
end
|
@@ -188,10 +187,8 @@ module Datasets
|
|
188
187
|
|
189
188
|
def read_names
|
190
189
|
names_path = cache_dir_path + "communities.names"
|
191
|
-
|
192
|
-
|
193
|
-
download(names_path, names_url)
|
194
|
-
end
|
190
|
+
names_url = "#{base_url}/communities.names"
|
191
|
+
download(names_path, names_url)
|
195
192
|
names_path.read
|
196
193
|
end
|
197
194
|
end
|
data/lib/datasets/dataset.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "pathname"
|
2
2
|
|
3
|
+
require_relative "cache-path"
|
3
4
|
require_relative "downloader"
|
4
5
|
require_relative "error"
|
5
6
|
require_relative "metadata"
|
@@ -19,22 +20,17 @@ module Datasets
|
|
19
20
|
end
|
20
21
|
|
21
22
|
def clear_cache!
|
22
|
-
|
23
|
-
FileUtils.rmtree(cache_dir_path.to_s, secure: true)
|
24
|
-
end
|
23
|
+
cache_path.remove
|
25
24
|
end
|
26
25
|
|
27
26
|
private
|
27
|
+
|
28
28
|
def cache_dir_path
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
else
|
35
|
-
base_dir = ENV["XDG_CACHE_HOME"] || "~/.cache"
|
36
|
-
end
|
37
|
-
Pathname(base_dir).expand_path + "red-datasets" + metadata.id
|
29
|
+
cache_path.base_dir
|
30
|
+
end
|
31
|
+
|
32
|
+
def cache_path
|
33
|
+
@cache_path ||= CachePath.new(@metadata.id)
|
38
34
|
end
|
39
35
|
|
40
36
|
def download(output_path, url)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative "ggplot2-dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
class Diamonds < Ggplot2Dataset
|
5
|
+
Record = Struct.new(:carat,
|
6
|
+
:cut,
|
7
|
+
:color,
|
8
|
+
:clarity,
|
9
|
+
:depth,
|
10
|
+
:table,
|
11
|
+
:price,
|
12
|
+
:x,
|
13
|
+
:y,
|
14
|
+
:z)
|
15
|
+
|
16
|
+
def initialize()
|
17
|
+
super("diamonds")
|
18
|
+
@metadata.id = "diamonds"
|
19
|
+
@metadata.name = "Diamonds"
|
20
|
+
@metadata.licenses = ["CC0-1.0"]
|
21
|
+
end
|
22
|
+
|
23
|
+
COLUMN_NAME_MAPPING = {
|
24
|
+
}
|
25
|
+
end
|
26
|
+
end
|
data/lib/datasets/downloader.rb
CHANGED
@@ -23,9 +23,14 @@ module Datasets
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def download(output_path)
|
26
|
+
return if output_path.exist?
|
27
|
+
|
26
28
|
output_path.parent.mkpath
|
27
29
|
|
28
|
-
headers = {
|
30
|
+
headers = {
|
31
|
+
"Accept-Encoding" => "identity",
|
32
|
+
"User-Agent" => "Red Datasets/#{VERSION}",
|
33
|
+
}
|
29
34
|
start = nil
|
30
35
|
partial_output_path = Pathname.new("#{output_path}.partial")
|
31
36
|
if partial_output_path.exist?
|
@@ -74,6 +74,7 @@ module Datasets
|
|
74
74
|
@metadata.id = "e-stat-japan-#{@api_version}"
|
75
75
|
@metadata.name = "e-Stat API #{@api_version}"
|
76
76
|
@metadata.url = @base_url
|
77
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
77
78
|
@metadata.description = "e-Stat API #{@api_version}"
|
78
79
|
|
79
80
|
@id = id
|
@@ -214,7 +215,7 @@ module Datasets
|
|
214
215
|
# even if an error happens despite its error mapping.
|
215
216
|
# So we can't avoid caching retrieved response from the api.
|
216
217
|
# ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
|
217
|
-
download(@data_path, @url.to_s)
|
218
|
+
download(@data_path, @url.to_s)
|
218
219
|
end
|
219
220
|
|
220
221
|
def index_data
|