red-datasets 0.1.3 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +92 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +8 -12
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1cfd18b589e4624178d9010ef68a100bb6e2573ccf18a9f96168af786523578
|
4
|
+
data.tar.gz: 67eddd22e10bf78c0b2cf10b18de289368d473d7b5ddf2a557cc2264834e32b0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 111243d3a1d3d758196bb71301ccb0f34beb1f5bec7c5c14b15f7c96fd6bdde924e30d90d3ace9e9258074411c9f7e7b4ef6bd9338dc5c11349534b2392f6f81
|
7
|
+
data.tar.gz: 9a9b426c753bd7e6cc12d452d61b90c2422fcad3b3c353a552c5c05a7c7fd53c3d4ac9cec2e33af1537d9e76e04f1df3d6d9b4baf043528fdde2ab4f9f203e9f
|
data/README.md
CHANGED
@@ -17,15 +17,30 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
17
17
|
|
18
18
|
## Available datasets
|
19
19
|
|
20
|
-
TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
|
21
|
-
|
22
20
|
* Adult Dataset
|
21
|
+
* Aozora Bunko
|
22
|
+
* California Housing
|
23
23
|
* CIFAR-10 Dataset
|
24
24
|
* CIFAR-100 Dataset
|
25
|
+
* CLDR language plural rules
|
26
|
+
* Communities and crime
|
27
|
+
* Diamonds Dataset
|
28
|
+
* E-Stat Japan
|
25
29
|
* Fashion-MNIST
|
30
|
+
* Fuel Economy Dataset
|
31
|
+
* Geolonia Japanese Addresses
|
32
|
+
* Hepatitis
|
26
33
|
* Iris Dataset
|
34
|
+
* Libsvm
|
27
35
|
* MNIST database
|
36
|
+
* Mushroom
|
37
|
+
* Penguins
|
28
38
|
* The Penn Treebank Project
|
39
|
+
* PMJT - Pre-Modern Japanese Text dataset list
|
40
|
+
* Postal Codes in Japan
|
41
|
+
* Rdatasets
|
42
|
+
* Seaborn
|
43
|
+
* Sudachi Synonym Dictionary
|
29
44
|
* Wikipedia
|
30
45
|
* Wine Dataset
|
31
46
|
|
@@ -135,6 +150,12 @@ end
|
|
135
150
|
|
136
151
|
* [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
|
137
152
|
|
153
|
+
## How to develop Red Datasets
|
154
|
+
1. Fork https://github.com/red-data-tools/red-datasets
|
155
|
+
2. Create a feature branch from master
|
156
|
+
3. Develop in the feature branch
|
157
|
+
4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
|
158
|
+
|
138
159
|
## License
|
139
160
|
|
140
161
|
The MIT license. See `LICENSE.txt` for details.
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,97 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.5 - 2022-09-22
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Datasets::PMJTDatasetList`: Added.
|
8
|
+
[GitHub#107][Patch by okadak]
|
9
|
+
|
10
|
+
* `Datasets::AozoraBunko`: Added.
|
11
|
+
[GitHub#108][Patch by Masa]
|
12
|
+
|
13
|
+
* Added how to develop to README
|
14
|
+
[GitHub#117][Patch by abcdefg-1234567]
|
15
|
+
|
16
|
+
* `Datasets::FuelEconomy`: Added.
|
17
|
+
[GitHub#114][Patch by Benson Muite]
|
18
|
+
|
19
|
+
* `Datasets::Geolonia`: Added.
|
20
|
+
[GitHub#118][Patch by abcdefg-1234567]
|
21
|
+
|
22
|
+
* `Datasets::Diamonds`: Added.
|
23
|
+
[GitHub#110][Patch by Benson Muite]
|
24
|
+
|
25
|
+
* `Datasets::ITACorpus`: Added.
|
26
|
+
[GitHub#119][Patch by abcdefg-1234567]
|
27
|
+
|
28
|
+
* `Datasets::KuzushijiMNIST`: Added.
|
29
|
+
[GitHub#125][Patch by abcdefg-1234567]
|
30
|
+
|
31
|
+
* Updated list of datasets in README.
|
32
|
+
[GitHub#129][Patch by Benson Muite]
|
33
|
+
|
34
|
+
* `Datasets::CaliforniaHousing`: Added.
|
35
|
+
[GitHub#123][Patch by Benson Muite]
|
36
|
+
|
37
|
+
* Added support for Ruby 3.1.
|
38
|
+
[GitHub#130][Patch by Benson Muite]
|
39
|
+
|
40
|
+
* `Datasets::AFINN`: Added.
|
41
|
+
[GitHub#120][Patch by Benson Muite]
|
42
|
+
|
43
|
+
* `Datasets::LivedoorNews`: Added.
|
44
|
+
[GitHub#127][Patch by abcdefg-1234567]
|
45
|
+
|
46
|
+
* `Datasets::SeabornDataList`: Added.
|
47
|
+
[GitHub#134][Patch by Hirokazu SUZUKI]
|
48
|
+
|
49
|
+
* `Datasets::WikipediaKyotoJapaneseEnglish`: Added.
|
50
|
+
[GitHub#135][Patch by abcdefg-1234567]
|
51
|
+
|
52
|
+
* Renamed Rdatasets to Rdataset.
|
53
|
+
[GitHub#148][Patch by Hirokazu SUZUKI]
|
54
|
+
|
55
|
+
* Removed support for Ruby 2.6.
|
56
|
+
|
57
|
+
* Added missing license information.
|
58
|
+
|
59
|
+
* `Datasets::QuoraDuplicateQuestionPair`: Added.
|
60
|
+
[GitHub#149][Patch by otegami]
|
61
|
+
|
62
|
+
### Fixes
|
63
|
+
|
64
|
+
* Fixed key from nil to :index in `Datasets::SeabornData`.
|
65
|
+
[GitHub#133][Patch by Hirokazu SUZUKI]
|
66
|
+
|
67
|
+
* Fixed `Datasets::Rdatasets#each` to change "NA" to nil.
|
68
|
+
[GitHub#139][Patch by Hirokazu SUZUKI]
|
69
|
+
|
70
|
+
* Fixed `Datasets::Rdatasets#each` with mixed data of numeric and string.
|
71
|
+
[GitHub#140][Patch by Hirokazu SUZUKI]
|
72
|
+
|
73
|
+
### Thanks
|
74
|
+
|
75
|
+
* okadak
|
76
|
+
|
77
|
+
* Masa
|
78
|
+
|
79
|
+
* Benson Muite
|
80
|
+
|
81
|
+
* abcdefg-1234567
|
82
|
+
|
83
|
+
* Hirokazu SUZUKI
|
84
|
+
|
85
|
+
* Sutou Kouhei
|
86
|
+
|
87
|
+
* otegami
|
88
|
+
|
89
|
+
## 0.1.4 - 2021-07-13
|
90
|
+
|
91
|
+
### Improvements
|
92
|
+
|
93
|
+
* `Datasets::SudachiSynonymDictionary`: Stopped depending on `LANG`.
|
94
|
+
|
3
95
|
## 0.1.3 - 2021-07-09
|
4
96
|
|
5
97
|
### Improvements
|
data/lib/datasets/adult.rb
CHANGED
@@ -31,7 +31,8 @@ module Datasets
|
|
31
31
|
@type = type
|
32
32
|
@metadata.id = "adult-#{@type}"
|
33
33
|
@metadata.name = "Adult: #{@type}"
|
34
|
-
@metadata.url = "
|
34
|
+
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
|
35
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
35
36
|
@metadata.description = lambda do
|
36
37
|
read_names
|
37
38
|
end
|
@@ -58,10 +59,8 @@ module Datasets
|
|
58
59
|
ext = "test"
|
59
60
|
end
|
60
61
|
data_path = cache_dir_path + "adult-#{ext}.csv"
|
61
|
-
|
62
|
-
|
63
|
-
download(data_path, data_url)
|
64
|
-
end
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
|
63
|
+
download(data_path, data_url)
|
65
64
|
|
66
65
|
options = {
|
67
66
|
converters: [:numeric, lambda {|f| f.strip}],
|
@@ -74,10 +73,8 @@ module Datasets
|
|
74
73
|
|
75
74
|
def read_names
|
76
75
|
names_path = cache_dir_path + "adult.names"
|
77
|
-
|
78
|
-
|
79
|
-
download(names_path, names_url)
|
80
|
-
end
|
76
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
|
77
|
+
download(names_path, names_url)
|
81
78
|
names_path.read
|
82
79
|
end
|
83
80
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require "csv"
|
2
|
+
require_relative "zip-extractor"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class AFINN < Dataset
|
6
|
+
Record = Struct.new(:word,
|
7
|
+
:valence)
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
super()
|
11
|
+
@metadata.id = "afinn"
|
12
|
+
@metadata.name = "AFINN"
|
13
|
+
@metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
|
14
|
+
@metadata.licenses = ["ODbL-1.0"]
|
15
|
+
@metadata.description = lambda do
|
16
|
+
extract_file("AFINN/AFINN-README.txt") do |input|
|
17
|
+
readme = input.read
|
18
|
+
readme.force_encoding("UTF-8")
|
19
|
+
readme.
|
20
|
+
gsub(/^AFINN-96:.*?\n\n/m, "").
|
21
|
+
gsub(/^In Python.*$/m, "").
|
22
|
+
strip
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def each
|
28
|
+
return to_enum(__method__) unless block_given?
|
29
|
+
|
30
|
+
extract_file("AFINN/AFINN-111.txt") do |input|
|
31
|
+
csv = CSV.new(input, col_sep: "\t", converters: :numeric)
|
32
|
+
csv.each do |row|
|
33
|
+
yield(Record.new(*row))
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
def extract_file(file_path, &block)
|
40
|
+
data_path = cache_dir_path + "imm6010.zip"
|
41
|
+
data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
|
42
|
+
download(data_path, data_url)
|
43
|
+
|
44
|
+
extractor = ZipExtractor.new(data_path)
|
45
|
+
extractor.extract_file(file_path, &block)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
# Dataset for AozoraBunko
|
6
|
+
class AozoraBunko < Dataset
|
7
|
+
Book = Struct.new(
|
8
|
+
# 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
|
9
|
+
:title_id,
|
10
|
+
:title,
|
11
|
+
:title_reading,
|
12
|
+
:title_reading_collation,
|
13
|
+
:subtitle,
|
14
|
+
:subtitle_reading,
|
15
|
+
:original_title,
|
16
|
+
:first_appearance,
|
17
|
+
:ndc_code, # 分類番号(日本十進分類法の番号)
|
18
|
+
:syllabary_spelling_type,
|
19
|
+
:copyrighted,
|
20
|
+
:published_date,
|
21
|
+
:last_updated_date,
|
22
|
+
:detail_url,
|
23
|
+
# 人物ID, 姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
|
24
|
+
:person_id,
|
25
|
+
:person_family_name,
|
26
|
+
:person_first_name,
|
27
|
+
:person_family_name_reading,
|
28
|
+
:person_first_name_reading,
|
29
|
+
:person_family_name_reading_collation,
|
30
|
+
:person_first_name_reading_collation,
|
31
|
+
:person_family_name_romaji,
|
32
|
+
:person_first_name_romaji,
|
33
|
+
:person_type,
|
34
|
+
:person_birthday,
|
35
|
+
:person_date_of_death,
|
36
|
+
:person_copyrighted,
|
37
|
+
# 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
|
38
|
+
:original_book_name1,
|
39
|
+
:original_book_publisher_name1,
|
40
|
+
:original_book_first_published_date1,
|
41
|
+
:used_version_for_registration1,
|
42
|
+
:used_version_for_proofreading1,
|
43
|
+
:base_of_original_book_name1,
|
44
|
+
:base_of_original_book_publisher_name1,
|
45
|
+
:base_of_original_book_first_published_date1,
|
46
|
+
# 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
|
47
|
+
:original_book_name2,
|
48
|
+
:original_book_publisher_name2,
|
49
|
+
:original_book_first_published_date2,
|
50
|
+
:used_version_for_registration2,
|
51
|
+
:used_version_for_proofreading2,
|
52
|
+
:base_of_original_book_name2,
|
53
|
+
:base_of_original_book_publisher_name2,
|
54
|
+
:base_of_original_book_first_published_date2,
|
55
|
+
# 入力者,校正者,
|
56
|
+
:registered_person_name,
|
57
|
+
:proofreader_name,
|
58
|
+
# テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
|
59
|
+
:text_file_url,
|
60
|
+
:last_text_file_updated_date,
|
61
|
+
:text_file_character_encoding,
|
62
|
+
:text_file_character_set,
|
63
|
+
:text_file_updating_count,
|
64
|
+
# XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
|
65
|
+
:html_file_url,
|
66
|
+
:last_html_file_updated_date,
|
67
|
+
:html_file_character_encoding,
|
68
|
+
:html_file_character_set,
|
69
|
+
:html_file_updating_count
|
70
|
+
)
|
71
|
+
|
72
|
+
class Book
|
73
|
+
attr_writer :cache_path
|
74
|
+
|
75
|
+
def initialize(*args)
|
76
|
+
super
|
77
|
+
@text = nil
|
78
|
+
@html = nil
|
79
|
+
@cache_path = nil
|
80
|
+
end
|
81
|
+
|
82
|
+
alias_method :copyrighted?, :copyrighted
|
83
|
+
alias_method :person_copyrighted?, :person_copyrighted
|
84
|
+
|
85
|
+
def text
|
86
|
+
return @text unless @text.nil?
|
87
|
+
return @text if text_file_url.nil? || text_file_url.empty?
|
88
|
+
|
89
|
+
# When the URL is not a zip file, the text has to be downloaded manually by opening the web page in a browser
|
90
|
+
# e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
|
91
|
+
return @text unless text_file_url.end_with?('.zip')
|
92
|
+
|
93
|
+
downloader = Downloader.new(text_file_url)
|
94
|
+
downloader.download(text_file_output_path)
|
95
|
+
|
96
|
+
@text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
|
97
|
+
input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
|
98
|
+
end
|
99
|
+
|
100
|
+
@text
|
101
|
+
end
|
102
|
+
|
103
|
+
def html
|
104
|
+
return @html unless @html.nil?
|
105
|
+
return @html if html_file_url.nil? || html_file_url.empty?
|
106
|
+
|
107
|
+
downloader = Downloader.new(html_file_url)
|
108
|
+
downloader.download(html_file_output_path)
|
109
|
+
@html = File.read(html_file_output_path).encode(Encoding::UTF_8,
|
110
|
+
normalize_encoding(html_file_character_encoding))
|
111
|
+
|
112
|
+
@html
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def text_file_output_path
|
118
|
+
cache_base_dir + text_file_name
|
119
|
+
end
|
120
|
+
|
121
|
+
def html_file_output_path
|
122
|
+
cache_base_dir + html_file_name
|
123
|
+
end
|
124
|
+
|
125
|
+
def text_file_name
|
126
|
+
text_file_url.split('/').last
|
127
|
+
end
|
128
|
+
|
129
|
+
def html_file_name
|
130
|
+
html_file_url.split('/').last
|
131
|
+
end
|
132
|
+
|
133
|
+
def cache_base_dir
|
134
|
+
@cache_path.base_dir + title_id + person_id
|
135
|
+
end
|
136
|
+
|
137
|
+
def normalize_encoding(encoding)
|
138
|
+
case encoding
|
139
|
+
when 'ShiftJIS'
|
140
|
+
Encoding::Shift_JIS
|
141
|
+
when 'UTF-8'
|
142
|
+
Encoding::UTF_8
|
143
|
+
else
|
144
|
+
encoding
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
def initialize
|
150
|
+
super()
|
151
|
+
|
152
|
+
@metadata.id = 'aozora-bunko'
|
153
|
+
@metadata.name = 'Aozora Bunko'
|
154
|
+
@metadata.url = 'https://www.aozora.gr.jp/'
|
155
|
+
@metadata.licenses = 'CC-BY-2.1-JP'
|
156
|
+
@metadata.description = <<~DESCRIPTION
|
157
|
+
Aozora Bunko is an activity to collect free electronic books that anyone can access
|
158
|
+
on the Internet like a library. The copyrighted works and the works that are said to be
|
159
|
+
"free to read" are available after being digitized in text and XHTML (some HTML) formats.
|
160
|
+
DESCRIPTION
|
161
|
+
end
|
162
|
+
|
163
|
+
def each
|
164
|
+
return to_enum(__method__) unless block_given?
|
165
|
+
|
166
|
+
open_data do |csv_file_stream|
|
167
|
+
text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has Byte Order Mark
|
168
|
+
|
169
|
+
CSV.parse(text, headers: true) do |row|
|
170
|
+
%w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
|
171
|
+
row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
|
172
|
+
end
|
173
|
+
book = Book.new(*row.fields)
|
174
|
+
book.cache_path = cache_path
|
175
|
+
|
176
|
+
yield(book)
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
private
|
182
|
+
|
183
|
+
def open_data(&block)
|
184
|
+
data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
|
185
|
+
data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
|
186
|
+
download(data_path, data_url)
|
187
|
+
ZipExtractor.new(data_path).extract_first_file do |input|
|
188
|
+
block.call(input)
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
def normalize_boolean(column_value)
|
193
|
+
column_value == 'あり'
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datasets
|
2
|
+
class CachePath
|
3
|
+
def initialize(id)
|
4
|
+
@id = id
|
5
|
+
end
|
6
|
+
|
7
|
+
def base_dir
|
8
|
+
Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
|
9
|
+
end
|
10
|
+
|
11
|
+
def remove
|
12
|
+
FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def system_cache_dir
|
18
|
+
case RUBY_PLATFORM
|
19
|
+
when /mswin/, /mingw/
|
20
|
+
ENV['LOCALAPPDATA'] || '~/AppData/Local'
|
21
|
+
when /darwin/
|
22
|
+
'~/Library/Caches'
|
23
|
+
else
|
24
|
+
ENV['XDG_CACHE_HOME'] || '~/.cache'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require "csv"
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class CaliforniaHousing < Dataset
|
6
|
+
Record = Struct.new(:median_house_value,
|
7
|
+
:median_income,
|
8
|
+
:housing_median_age,
|
9
|
+
:total_rooms,
|
10
|
+
:total_bedrooms,
|
11
|
+
:population,
|
12
|
+
:households,
|
13
|
+
:latitude,
|
14
|
+
:longitude)
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
super()
|
18
|
+
@metadata.id = "california-housing"
|
19
|
+
@metadata.name = "California Housing"
|
20
|
+
@metadata.url = "http://lib.stat.cmu.edu/datasets/"
|
21
|
+
@metadata.licenses = ["CCO"]
|
22
|
+
@metadata.description = <<-DESCRIPTION
|
23
|
+
Housing information from the 1990 census used in
|
24
|
+
Pace, R. Kelley and Ronald Barry,
|
25
|
+
"Sparse Spatial Autoregressions",
|
26
|
+
Statistics and Probability Letters, 33 (1997) 291-297.
|
27
|
+
Available from http://lib.stat.cmu.edu/datasets/.
|
28
|
+
DESCRIPTION
|
29
|
+
end
|
30
|
+
|
31
|
+
def each
|
32
|
+
return to_enum(__method__) unless block_given?
|
33
|
+
|
34
|
+
data_path = cache_dir_path + "houses.zip"
|
35
|
+
data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
|
36
|
+
file_name = "cadata.txt"
|
37
|
+
download(data_path, data_url)
|
38
|
+
open_data(data_path, file_name) do |input|
|
39
|
+
data = ""
|
40
|
+
input.each_line do |line|
|
41
|
+
next unless line.start_with?(" ")
|
42
|
+
data << line.lstrip.gsub(/ +/, ",")
|
43
|
+
end
|
44
|
+
options = {
|
45
|
+
converters: [:numeric],
|
46
|
+
}
|
47
|
+
CSV.parse(data, **options) do |row|
|
48
|
+
yield(Record.new(*row))
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
def open_data(data_path, file_name)
|
55
|
+
ZipExtractor.new(data_path).extract_first_file do |input|
|
56
|
+
yield input
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
data/lib/datasets/cifar.rb
CHANGED
@@ -50,10 +50,8 @@ module Datasets
|
|
50
50
|
return to_enum(__method__) unless block_given?
|
51
51
|
|
52
52
|
data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
|
53
|
-
|
54
|
-
|
55
|
-
download(data_path, data_url)
|
56
|
-
end
|
53
|
+
data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
|
54
|
+
download(data_path, data_url)
|
57
55
|
|
58
56
|
parse_data(data_path, &block)
|
59
57
|
end
|
@@ -42,10 +42,8 @@ module Datasets
|
|
42
42
|
private
|
43
43
|
def open_data
|
44
44
|
data_path = cache_dir_path + "plurals.xml"
|
45
|
-
|
46
|
-
|
47
|
-
end
|
48
|
-
::File.open(data_path) do |input|
|
45
|
+
download(data_path, @metadata.url)
|
46
|
+
data_path.open do |input|
|
49
47
|
yield(input)
|
50
48
|
end
|
51
49
|
end
|
data/lib/datasets/communities.rb
CHANGED
@@ -140,6 +140,7 @@ module Datasets
|
|
140
140
|
@metadata.id = "communities"
|
141
141
|
@metadata.name = "Communities"
|
142
142
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
|
143
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
143
144
|
@metadata.description = lambda do
|
144
145
|
read_names
|
145
146
|
end
|
@@ -177,10 +178,8 @@ module Datasets
|
|
177
178
|
|
178
179
|
def open_data
|
179
180
|
data_path = cache_dir_path + "communities.data"
|
180
|
-
|
181
|
-
|
182
|
-
download(data_path, data_url)
|
183
|
-
end
|
181
|
+
data_url = "#{base_url}/communities.data"
|
182
|
+
download(data_path, data_url)
|
184
183
|
CSV.open(data_path) do |csv|
|
185
184
|
yield(csv)
|
186
185
|
end
|
@@ -188,10 +187,8 @@ module Datasets
|
|
188
187
|
|
189
188
|
def read_names
|
190
189
|
names_path = cache_dir_path + "communities.names"
|
191
|
-
|
192
|
-
|
193
|
-
download(names_path, names_url)
|
194
|
-
end
|
190
|
+
names_url = "#{base_url}/communities.names"
|
191
|
+
download(names_path, names_url)
|
195
192
|
names_path.read
|
196
193
|
end
|
197
194
|
end
|
data/lib/datasets/dataset.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "pathname"
|
2
2
|
|
3
|
+
require_relative "cache-path"
|
3
4
|
require_relative "downloader"
|
4
5
|
require_relative "error"
|
5
6
|
require_relative "metadata"
|
@@ -19,22 +20,17 @@ module Datasets
|
|
19
20
|
end
|
20
21
|
|
21
22
|
def clear_cache!
|
22
|
-
|
23
|
-
FileUtils.rmtree(cache_dir_path.to_s, secure: true)
|
24
|
-
end
|
23
|
+
cache_path.remove
|
25
24
|
end
|
26
25
|
|
27
26
|
private
|
27
|
+
|
28
28
|
def cache_dir_path
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
else
|
35
|
-
base_dir = ENV["XDG_CACHE_HOME"] || "~/.cache"
|
36
|
-
end
|
37
|
-
Pathname(base_dir).expand_path + "red-datasets" + metadata.id
|
29
|
+
cache_path.base_dir
|
30
|
+
end
|
31
|
+
|
32
|
+
def cache_path
|
33
|
+
@cache_path ||= CachePath.new(@metadata.id)
|
38
34
|
end
|
39
35
|
|
40
36
|
def download(output_path, url)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative "ggplot2-dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
class Diamonds < Ggplot2Dataset
|
5
|
+
Record = Struct.new(:carat,
|
6
|
+
:cut,
|
7
|
+
:color,
|
8
|
+
:clarity,
|
9
|
+
:depth,
|
10
|
+
:table,
|
11
|
+
:price,
|
12
|
+
:x,
|
13
|
+
:y,
|
14
|
+
:z)
|
15
|
+
|
16
|
+
def initialize()
|
17
|
+
super("diamonds")
|
18
|
+
@metadata.id = "diamonds"
|
19
|
+
@metadata.name = "Diamonds"
|
20
|
+
@metadata.licenses = ["CC0-1.0"]
|
21
|
+
end
|
22
|
+
|
23
|
+
COLUMN_NAME_MAPPING = {
|
24
|
+
}
|
25
|
+
end
|
26
|
+
end
|
data/lib/datasets/downloader.rb
CHANGED
@@ -23,9 +23,14 @@ module Datasets
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def download(output_path)
|
26
|
+
return if output_path.exist?
|
27
|
+
|
26
28
|
output_path.parent.mkpath
|
27
29
|
|
28
|
-
headers = {
|
30
|
+
headers = {
|
31
|
+
"Accept-Encoding" => "identity",
|
32
|
+
"User-Agent" => "Red Datasets/#{VERSION}",
|
33
|
+
}
|
29
34
|
start = nil
|
30
35
|
partial_output_path = Pathname.new("#{output_path}.partial")
|
31
36
|
if partial_output_path.exist?
|
@@ -74,6 +74,7 @@ module Datasets
|
|
74
74
|
@metadata.id = "e-stat-japan-#{@api_version}"
|
75
75
|
@metadata.name = "e-Stat API #{@api_version}"
|
76
76
|
@metadata.url = @base_url
|
77
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
77
78
|
@metadata.description = "e-Stat API #{@api_version}"
|
78
79
|
|
79
80
|
@id = id
|
@@ -214,7 +215,7 @@ module Datasets
|
|
214
215
|
# even if error happens despite its error mapping.
|
215
216
|
# So we can't avoid caching retrieved response from the api.
|
216
217
|
# ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
|
217
|
-
download(@data_path, @url.to_s)
|
218
|
+
download(@data_path, @url.to_s)
|
218
219
|
end
|
219
220
|
|
220
221
|
def index_data
|