red-datasets 0.1.4 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -3
- data/Rakefile +56 -1
- data/doc/text/news.md +102 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +58 -23
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +110 -30
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +6 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +16 -8
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +48 -0
- data/lib/datasets.rb +2 -22
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +65 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- data/test/test-wikipedia.rb +25 -71
- metadata +62 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
|
4
|
+
data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
|
7
|
+
data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
|
data/README.md
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
# Red Datasets
|
2
2
|
|
3
|
-
[![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
|
4
3
|
[![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
|
5
4
|
|
6
5
|
## Description
|
@@ -17,15 +16,30 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
17
16
|
|
18
17
|
## Available datasets
|
19
18
|
|
20
|
-
TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
|
21
|
-
|
22
19
|
* Adult Dataset
|
20
|
+
* Aozora Bunko
|
21
|
+
* California Housing
|
23
22
|
* CIFAR-10 Dataset
|
24
23
|
* CIFAR-100 Dataset
|
24
|
+
* CLDR language plural rules
|
25
|
+
* Communities and crime
|
26
|
+
* Diamonds Dataset
|
27
|
+
* E-Stat Japan
|
25
28
|
* Fashion-MNIST
|
29
|
+
* Fuel Economy Dataset
|
30
|
+
* Geolonia Japanese Addresses
|
31
|
+
* Hepatitis
|
26
32
|
* Iris Dataset
|
33
|
+
* Libsvm
|
27
34
|
* MNIST database
|
35
|
+
* Mushroom
|
36
|
+
* Penguins
|
28
37
|
* The Penn Treebank Project
|
38
|
+
* PMJT - Pre-Modern Japanese Text dataset list
|
39
|
+
* Postal Codes in Japan
|
40
|
+
* Rdatasets
|
41
|
+
* Seaborn
|
42
|
+
* Sudachi Synonym Dictionary
|
29
43
|
* Wikipedia
|
30
44
|
* Wine Dataset
|
31
45
|
|
@@ -135,6 +149,12 @@ end
|
|
135
149
|
|
136
150
|
* [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
|
137
151
|
|
152
|
+
## How to develop Red Datasets
|
153
|
+
1. Fork https://github.com/red-data-tools/red-datasets
|
154
|
+
2. Create a feature branch from master
|
155
|
+
3. Develop in the feature branch
|
156
|
+
4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
|
157
|
+
|
138
158
|
## License
|
139
159
|
|
140
160
|
The MIT license. See `LICENSE.txt` for details.
|
data/Rakefile
CHANGED
@@ -13,9 +13,64 @@ end
|
|
13
13
|
helper.install
|
14
14
|
spec = helper.gemspec
|
15
15
|
|
16
|
+
task default: :test
|
17
|
+
|
16
18
|
desc "Run tests"
|
17
19
|
task :test do
|
18
20
|
ruby("test/run-test.rb")
|
19
21
|
end
|
20
22
|
|
21
|
-
|
23
|
+
desc "Generate an artifact for GitHub Pages"
|
24
|
+
task :pages do
|
25
|
+
pages_dir = "_site"
|
26
|
+
rm_rf(pages_dir)
|
27
|
+
mkdir_p(pages_dir)
|
28
|
+
|
29
|
+
require "cgi/util"
|
30
|
+
require_relative "lib/datasets/lazy"
|
31
|
+
File.open("#{pages_dir}/index.html", "w") do |index_html|
|
32
|
+
index_html.puts(<<-HTML)
|
33
|
+
<!DOCTYPE html>
|
34
|
+
<html>
|
35
|
+
<head>
|
36
|
+
<meta charset="UTF-8">
|
37
|
+
<title>Red Datasets</title>
|
38
|
+
<style>
|
39
|
+
table {
|
40
|
+
margin-left: 20vw;
|
41
|
+
min-width: 50%;
|
42
|
+
}
|
43
|
+
th {
|
44
|
+
font-size: 30px;
|
45
|
+
padding: 20px;
|
46
|
+
}
|
47
|
+
td {
|
48
|
+
border-bottom: 1px solid #D9DCE0;
|
49
|
+
padding: 20px;
|
50
|
+
font-weight: bold;
|
51
|
+
}
|
52
|
+
</style>
|
53
|
+
</head>
|
54
|
+
<body>
|
55
|
+
<section>
|
56
|
+
<h1>Red Datasets</h1>
|
57
|
+
<table>
|
58
|
+
<thead>
|
59
|
+
<tr><th>Available datasets</th></tr>
|
60
|
+
</thead>
|
61
|
+
<tbody>
|
62
|
+
HTML
|
63
|
+
Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
|
64
|
+
index_html.puts(<<-HTML)
|
65
|
+
<tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
|
66
|
+
HTML
|
67
|
+
end
|
68
|
+
index_html.puts(<<-HTML)
|
69
|
+
</tbody>
|
70
|
+
</table>
|
71
|
+
</section>
|
72
|
+
</body>
|
73
|
+
</html>
|
74
|
+
HTML
|
75
|
+
end
|
76
|
+
end
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,107 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.6 - 2023-05-24
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Added support for lazy loading by `require "datasets/lazy"`.
|
8
|
+
|
9
|
+
* `Datasets::NagoyaUniversityConversationCorpus`: Added.
|
10
|
+
[GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
|
11
|
+
[Patch by matsuura]
|
12
|
+
|
13
|
+
* `Datasets::Wikipedia`: Added support for downloading in background.
|
14
|
+
|
15
|
+
### Thanks
|
16
|
+
|
17
|
+
* matsuura
|
18
|
+
|
19
|
+
## 0.1.5 - 2022-09-22
|
20
|
+
|
21
|
+
### Improvements
|
22
|
+
|
23
|
+
* `Datasets::PMJTDatasetList`: Added.
|
24
|
+
[GitHub#107][Patch by okadak]
|
25
|
+
|
26
|
+
* `Datasets::AozoraBunko`: Added.
|
27
|
+
[GitHub#108][Patch by Masa]
|
28
|
+
|
29
|
+
* Added how to develop to README
|
30
|
+
[GitHub#117][Patch by abcdefg-1234567]
|
31
|
+
|
32
|
+
* `Datasets::FuelEconomy`: Added.
|
33
|
+
[GitHub#114][Patch by Benson Muite]
|
34
|
+
|
35
|
+
* `Datasets::Geolonia`: Added.
|
36
|
+
[GitHub#118][Patch by abcdefg-1234567]
|
37
|
+
|
38
|
+
* `Datasets::Diamonds`: Added.
|
39
|
+
[GitHub#110][Patch by Benson Muite]
|
40
|
+
|
41
|
+
* `Datasets::ITACorpus`: Added.
|
42
|
+
[GitHub#119][Patch by abcdefg-1234567]
|
43
|
+
|
44
|
+
* `Datasets::KuzushijiMNIST`: Added.
|
45
|
+
[GitHub#125][Patch by abcdefg-1234567]
|
46
|
+
|
47
|
+
* Updated list of datasets in README.
|
48
|
+
[GitHub#129][Patch by Benson Muite]
|
49
|
+
|
50
|
+
* `Datasets::CaliforniaHousing`: Added.
|
51
|
+
[GitHub#123][Patch by Benson Muite]
|
52
|
+
|
53
|
+
* Added support for Ruby 3.1.
|
54
|
+
[GitHub#130][Patch by Benson Muite]
|
55
|
+
|
56
|
+
* `Datasets::AFINN`: Added.
|
57
|
+
[GitHub#120][Patch by Benson Muite]
|
58
|
+
|
59
|
+
* `Datasets::LivedoorNews`: Added.
|
60
|
+
[GitHub#127][Patch by abcdefg-1234567]
|
61
|
+
|
62
|
+
* `Datasets::SeabornDataList`: Added.
|
63
|
+
[GitHub#134][Patch by Hirokazu SUZUKI]
|
64
|
+
|
65
|
+
* `Datasets::WikipediaKyotoJapaneseEnglish`: Added.
|
66
|
+
[GitHub#135][Patch by abcdefg-1234567]
|
67
|
+
|
68
|
+
* Renamed Rdatasets to Rdataset.
|
69
|
+
[GitHub#148][Patch by Hirokazu SUZUKI]
|
70
|
+
|
71
|
+
* Removed support for Ruby 2.6.
|
72
|
+
|
73
|
+
* Add missing license information.
|
74
|
+
|
75
|
+
* `Datasets::QuoraDuplicateQuestionPair`: Added.
|
76
|
+
[GitHub#149][Patch by otegami]
|
77
|
+
|
78
|
+
### Fixes
|
79
|
+
|
80
|
+
* Fixed key from nil to :index in `Datasets::SeabornData`.
|
81
|
+
[GitHub#133][Patch by Hirokazu SUZUKI]
|
82
|
+
|
83
|
+
* Fixed `Datasets::Rdatasets#each` to change "NA" to nil.
|
84
|
+
[GitHub#139][Patch by Hirokazu SUZUKI]
|
85
|
+
|
86
|
+
* Fix `Datasets::Rdatasets#each` with mixed data of numeric and string.
|
87
|
+
[GitHub#140][Patch by Hirokazu SUZUKI]
|
88
|
+
|
89
|
+
### Thanks
|
90
|
+
|
91
|
+
* okadak
|
92
|
+
|
93
|
+
* Masa
|
94
|
+
|
95
|
+
* Benson Muite
|
96
|
+
|
97
|
+
* abcdefg-1234567
|
98
|
+
|
99
|
+
* Hirokazu SUZUKI
|
100
|
+
|
101
|
+
* Sutou Kouhei
|
102
|
+
|
103
|
+
* otegami
|
104
|
+
|
3
105
|
## 0.1.4 - 2021-07-13
|
4
106
|
|
5
107
|
### Improvements
|
data/lib/datasets/adult.rb
CHANGED
@@ -31,7 +31,8 @@ module Datasets
|
|
31
31
|
@type = type
|
32
32
|
@metadata.id = "adult-#{@type}"
|
33
33
|
@metadata.name = "Adult: #{@type}"
|
34
|
-
@metadata.url = "
|
34
|
+
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
|
35
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
35
36
|
@metadata.description = lambda do
|
36
37
|
read_names
|
37
38
|
end
|
@@ -58,10 +59,8 @@ module Datasets
|
|
58
59
|
ext = "test"
|
59
60
|
end
|
60
61
|
data_path = cache_dir_path + "adult-#{ext}.csv"
|
61
|
-
|
62
|
-
|
63
|
-
download(data_path, data_url)
|
64
|
-
end
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
|
63
|
+
download(data_path, data_url)
|
65
64
|
|
66
65
|
options = {
|
67
66
|
converters: [:numeric, lambda {|f| f.strip}],
|
@@ -74,10 +73,8 @@ module Datasets
|
|
74
73
|
|
75
74
|
def read_names
|
76
75
|
names_path = cache_dir_path + "adult.names"
|
77
|
-
|
78
|
-
|
79
|
-
download(names_path, names_url)
|
80
|
-
end
|
76
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
|
77
|
+
download(names_path, names_url)
|
81
78
|
names_path.read
|
82
79
|
end
|
83
80
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require "csv"
|
2
|
+
require_relative "zip-extractor"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class AFINN < Dataset
|
6
|
+
Record = Struct.new(:word,
|
7
|
+
:valence)
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
super()
|
11
|
+
@metadata.id = "afinn"
|
12
|
+
@metadata.name = "AFINN"
|
13
|
+
@metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
|
14
|
+
@metadata.licenses = ["ODbL-1.0"]
|
15
|
+
@metadata.description = lambda do
|
16
|
+
extract_file("AFINN/AFINN-README.txt") do |input|
|
17
|
+
readme = input.read
|
18
|
+
readme.force_encoding("UTF-8")
|
19
|
+
readme.
|
20
|
+
gsub(/^AFINN-96:.*?\n\n/m, "").
|
21
|
+
gsub(/^In Python.*$/m, "").
|
22
|
+
strip
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def each
|
28
|
+
return to_enum(__method__) unless block_given?
|
29
|
+
|
30
|
+
extract_file("AFINN/AFINN-111.txt") do |input|
|
31
|
+
csv = CSV.new(input, col_sep: "\t", converters: :numeric)
|
32
|
+
csv.each do |row|
|
33
|
+
yield(Record.new(*row))
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
def extract_file(file_path, &block)
|
40
|
+
data_path = cache_dir_path + "imm6010.zip"
|
41
|
+
data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
|
42
|
+
download(data_path, data_url)
|
43
|
+
|
44
|
+
extractor = ZipExtractor.new(data_path)
|
45
|
+
extractor.extract_file(file_path, &block)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
# Dataset for AozoraBunko
|
6
|
+
class AozoraBunko < Dataset
|
7
|
+
Book = Struct.new(
|
8
|
+
# 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
|
9
|
+
:title_id,
|
10
|
+
:title,
|
11
|
+
:title_reading,
|
12
|
+
:title_reading_collation,
|
13
|
+
:subtitle,
|
14
|
+
:subtitle_reading,
|
15
|
+
:original_title,
|
16
|
+
:first_appearance,
|
17
|
+
:ndc_code, # 分類番号(日本十進分類法の番号)
|
18
|
+
:syllabary_spelling_type,
|
19
|
+
:copyrighted,
|
20
|
+
:published_date,
|
21
|
+
:last_updated_date,
|
22
|
+
:detail_url,
|
23
|
+
# 人物ID, 姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
|
24
|
+
:person_id,
|
25
|
+
:person_family_name,
|
26
|
+
:person_first_name,
|
27
|
+
:person_family_name_reading,
|
28
|
+
:person_first_name_reading,
|
29
|
+
:person_family_name_reading_collation,
|
30
|
+
:person_first_name_reading_collation,
|
31
|
+
:person_family_name_romaji,
|
32
|
+
:person_first_name_romaji,
|
33
|
+
:person_type,
|
34
|
+
:person_birthday,
|
35
|
+
:person_date_of_death,
|
36
|
+
:person_copyrighted,
|
37
|
+
# 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
|
38
|
+
:original_book_name1,
|
39
|
+
:original_book_publisher_name1,
|
40
|
+
:original_book_first_published_date1,
|
41
|
+
:used_version_for_registration1,
|
42
|
+
:used_version_for_proofreading1,
|
43
|
+
:base_of_original_book_name1,
|
44
|
+
:base_of_original_book_publisher_name1,
|
45
|
+
:base_of_original_book_first_published_date1,
|
46
|
+
# 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
|
47
|
+
:original_book_name2,
|
48
|
+
:original_book_publisher_name2,
|
49
|
+
:original_book_first_published_date2,
|
50
|
+
:used_version_for_registration2,
|
51
|
+
:used_version_for_proofreading2,
|
52
|
+
:base_of_original_book_name2,
|
53
|
+
:base_of_original_book_publisher_name2,
|
54
|
+
:base_of_original_book_first_published_date2,
|
55
|
+
# 入力者,校正者,
|
56
|
+
:registered_person_name,
|
57
|
+
:proofreader_name,
|
58
|
+
# テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
|
59
|
+
:text_file_url,
|
60
|
+
:last_text_file_updated_date,
|
61
|
+
:text_file_character_encoding,
|
62
|
+
:text_file_character_set,
|
63
|
+
:text_file_updating_count,
|
64
|
+
# XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
|
65
|
+
:html_file_url,
|
66
|
+
:last_html_file_updated_date,
|
67
|
+
:html_file_character_encoding,
|
68
|
+
:html_file_character_set,
|
69
|
+
:html_file_updating_count
|
70
|
+
)
|
71
|
+
|
72
|
+
class Book
|
73
|
+
attr_writer :cache_path
|
74
|
+
|
75
|
+
def initialize(*args)
|
76
|
+
super
|
77
|
+
@text = nil
|
78
|
+
@html = nil
|
79
|
+
@cache_path = nil
|
80
|
+
end
|
81
|
+
|
82
|
+
alias_method :copyrighted?, :copyrighted
|
83
|
+
alias_method :person_copyrighted?, :person_copyrighted
|
84
|
+
|
85
|
+
def text
|
86
|
+
return @text unless @text.nil?
|
87
|
+
return @text if text_file_url.nil? || text_file_url.empty?
|
88
|
+
|
89
|
+
# when the URL is not a zip file, the web page needs to be opened in a browser to download it manually
|
90
|
+
# e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
|
91
|
+
return @text unless text_file_url.end_with?('.zip')
|
92
|
+
|
93
|
+
downloader = Downloader.new(text_file_url)
|
94
|
+
downloader.download(text_file_output_path)
|
95
|
+
|
96
|
+
@text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
|
97
|
+
input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
|
98
|
+
end
|
99
|
+
|
100
|
+
@text
|
101
|
+
end
|
102
|
+
|
103
|
+
def html
|
104
|
+
return @html unless @html.nil?
|
105
|
+
return @html if html_file_url.nil? || html_file_url.empty?
|
106
|
+
|
107
|
+
downloader = Downloader.new(html_file_url)
|
108
|
+
downloader.download(html_file_output_path)
|
109
|
+
@html = File.read(html_file_output_path).encode(Encoding::UTF_8,
|
110
|
+
normalize_encoding(html_file_character_encoding))
|
111
|
+
|
112
|
+
@html
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def text_file_output_path
|
118
|
+
cache_base_dir + text_file_name
|
119
|
+
end
|
120
|
+
|
121
|
+
def html_file_output_path
|
122
|
+
cache_base_dir + html_file_name
|
123
|
+
end
|
124
|
+
|
125
|
+
def text_file_name
|
126
|
+
text_file_url.split('/').last
|
127
|
+
end
|
128
|
+
|
129
|
+
def html_file_name
|
130
|
+
html_file_url.split('/').last
|
131
|
+
end
|
132
|
+
|
133
|
+
def cache_base_dir
|
134
|
+
@cache_path.base_dir + title_id + person_id
|
135
|
+
end
|
136
|
+
|
137
|
+
def normalize_encoding(encoding)
|
138
|
+
case encoding
|
139
|
+
when 'ShiftJIS'
|
140
|
+
Encoding::Shift_JIS
|
141
|
+
when 'UTF-8'
|
142
|
+
Encoding::UTF_8
|
143
|
+
else
|
144
|
+
encoding
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
def initialize
|
150
|
+
super()
|
151
|
+
|
152
|
+
@metadata.id = 'aozora-bunko'
|
153
|
+
@metadata.name = 'Aozora Bunko'
|
154
|
+
@metadata.url = 'https://www.aozora.gr.jp/'
|
155
|
+
@metadata.licenses = 'CC-BY-2.1-JP'
|
156
|
+
@metadata.description = <<~DESCRIPTION
|
157
|
+
Aozora Bunko is an activity to collect free electronic books that anyone can access
|
158
|
+
on the Internet like a library. The copyrighted works and the works that are said to be
|
159
|
+
"free to read" are available after being digitized in text and XHTML (some HTML) formats.
|
160
|
+
DESCRIPTION
|
161
|
+
end
|
162
|
+
|
163
|
+
def each
|
164
|
+
return to_enum(__method__) unless block_given?
|
165
|
+
|
166
|
+
open_data do |csv_file_stream|
|
167
|
+
text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has Byte Order Mark
|
168
|
+
|
169
|
+
CSV.parse(text, headers: true) do |row|
|
170
|
+
%w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
|
171
|
+
row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
|
172
|
+
end
|
173
|
+
book = Book.new(*row.fields)
|
174
|
+
book.cache_path = cache_path
|
175
|
+
|
176
|
+
yield(book)
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
private
|
182
|
+
|
183
|
+
def open_data(&block)
|
184
|
+
data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
|
185
|
+
data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
|
186
|
+
download(data_path, data_url)
|
187
|
+
ZipExtractor.new(data_path).extract_first_file do |input|
|
188
|
+
block.call(input)
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
def normalize_boolean(column_value)
|
193
|
+
column_value == 'あり'
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datasets
|
2
|
+
class CachePath
|
3
|
+
def initialize(id)
|
4
|
+
@id = id
|
5
|
+
end
|
6
|
+
|
7
|
+
def base_dir
|
8
|
+
Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
|
9
|
+
end
|
10
|
+
|
11
|
+
def remove
|
12
|
+
FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def system_cache_dir
|
18
|
+
case RUBY_PLATFORM
|
19
|
+
when /mswin/, /mingw/
|
20
|
+
ENV['LOCALAPPDATA'] || '~/AppData/Local'
|
21
|
+
when /darwin/
|
22
|
+
'~/Library/Caches'
|
23
|
+
else
|
24
|
+
ENV['XDG_CACHE_HOME'] || '~/.cache'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require "csv"
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class CaliforniaHousing < Dataset
|
6
|
+
Record = Struct.new(:median_house_value,
|
7
|
+
:median_income,
|
8
|
+
:housing_median_age,
|
9
|
+
:total_rooms,
|
10
|
+
:total_bedrooms,
|
11
|
+
:population,
|
12
|
+
:households,
|
13
|
+
:latitude,
|
14
|
+
:longitude)
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
super()
|
18
|
+
@metadata.id = "california-housing"
|
19
|
+
@metadata.name = "California Housing"
|
20
|
+
@metadata.url = "http://lib.stat.cmu.edu/datasets/"
|
21
|
+
@metadata.licenses = ["CCO"]
|
22
|
+
@metadata.description = <<-DESCRIPTION
|
23
|
+
Housing information from the 1990 census used in
|
24
|
+
Pace, R. Kelley and Ronald Barry,
|
25
|
+
"Sparse Spatial Autoregressions",
|
26
|
+
Statistics and Probability Letters, 33 (1997) 291-297.
|
27
|
+
Available from http://lib.stat.cmu.edu/datasets/.
|
28
|
+
DESCRIPTION
|
29
|
+
end
|
30
|
+
|
31
|
+
def each
|
32
|
+
return to_enum(__method__) unless block_given?
|
33
|
+
|
34
|
+
data_path = cache_dir_path + "houses.zip"
|
35
|
+
data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
|
36
|
+
file_name = "cadata.txt"
|
37
|
+
download(data_path, data_url)
|
38
|
+
open_data(data_path, file_name) do |input|
|
39
|
+
data = ""
|
40
|
+
input.each_line do |line|
|
41
|
+
next unless line.start_with?(" ")
|
42
|
+
data << line.lstrip.gsub(/ +/, ",")
|
43
|
+
end
|
44
|
+
options = {
|
45
|
+
converters: [:numeric],
|
46
|
+
}
|
47
|
+
CSV.parse(data, **options) do |row|
|
48
|
+
yield(Record.new(*row))
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
def open_data(data_path, file_name)
|
55
|
+
ZipExtractor.new(data_path).extract_first_file do |input|
|
56
|
+
yield input
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
data/lib/datasets/cifar.rb
CHANGED
@@ -50,10 +50,8 @@ module Datasets
|
|
50
50
|
return to_enum(__method__) unless block_given?
|
51
51
|
|
52
52
|
data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
|
53
|
-
|
54
|
-
|
55
|
-
download(data_path, data_url)
|
56
|
-
end
|
53
|
+
data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
|
54
|
+
download(data_path, data_url)
|
57
55
|
|
58
56
|
parse_data(data_path, &block)
|
59
57
|
end
|
@@ -42,10 +42,8 @@ module Datasets
|
|
42
42
|
private
|
43
43
|
def open_data
|
44
44
|
data_path = cache_dir_path + "plurals.xml"
|
45
|
-
|
46
|
-
|
47
|
-
end
|
48
|
-
::File.open(data_path) do |input|
|
45
|
+
download(data_path, @metadata.url)
|
46
|
+
data_path.open do |input|
|
49
47
|
yield(input)
|
50
48
|
end
|
51
49
|
end
|
data/lib/datasets/communities.rb
CHANGED
@@ -140,6 +140,7 @@ module Datasets
|
|
140
140
|
@metadata.id = "communities"
|
141
141
|
@metadata.name = "Communities"
|
142
142
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
|
143
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
143
144
|
@metadata.description = lambda do
|
144
145
|
read_names
|
145
146
|
end
|
@@ -177,10 +178,8 @@ module Datasets
|
|
177
178
|
|
178
179
|
def open_data
|
179
180
|
data_path = cache_dir_path + "communities.data"
|
180
|
-
|
181
|
-
|
182
|
-
download(data_path, data_url)
|
183
|
-
end
|
181
|
+
data_url = "#{base_url}/communities.data"
|
182
|
+
download(data_path, data_url)
|
184
183
|
CSV.open(data_path) do |csv|
|
185
184
|
yield(csv)
|
186
185
|
end
|
@@ -188,10 +187,8 @@ module Datasets
|
|
188
187
|
|
189
188
|
def read_names
|
190
189
|
names_path = cache_dir_path + "communities.names"
|
191
|
-
|
192
|
-
|
193
|
-
download(names_path, names_url)
|
194
|
-
end
|
190
|
+
names_url = "#{base_url}/communities.names"
|
191
|
+
download(names_path, names_url)
|
195
192
|
names_path.read
|
196
193
|
end
|
197
194
|
end
|