red-datasets 0.1.4 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8d18fa976f1b368a6a3f9cc85dc7a58a1785fd02901157672484f2a7d8b1fa88
4
- data.tar.gz: c91d651a0d8de6722ee759ce29545f5f382d1e9f060c7e4ee5a0fcd557be4d21
3
+ metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
4
+ data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
5
5
  SHA512:
6
- metadata.gz: 0ff7694dd27e4293206de81fc2a7b5ccccb886579ed73eb7f97d390472692ce310993e2ece741cf85f5fbe265f1deb2a7ea326590383b4bdf0d3f77f10b1bbc1
7
- data.tar.gz: 38ac6aa12d3e33ab0c26c0750273b60386d90fd4d916776a0d561c3f25a79fa2d7d216ac465842207cd65f62e2fcbd348389e65f905583187fe23c30908d92dc
6
+ metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
7
+ data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
data/README.md CHANGED
@@ -1,6 +1,5 @@
1
1
  # Red Datasets
2
2
 
3
- [![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
4
3
  [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
5
4
 
6
5
  ## Description
@@ -17,15 +16,30 @@ You can use datasets easily because you can access each dataset with multiple wa
17
16
 
18
17
  ## Available datasets
19
18
 
20
- TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
21
-
22
19
  * Adult Dataset
20
+ * Aozora Bunko
21
+ * California Housing
23
22
  * CIFAR-10 Dataset
24
23
  * CIFAR-100 Dataset
24
+ * CLDR language plural rules
25
+ * Communities and crime
26
+ * Diamonds Dataset
27
+ * E-Stat Japan
25
28
  * Fashion-MNIST
29
+ * Fuel Economy Dataset
30
+ * Geolonia Japanese Addresses
31
+ * Hepatitis
26
32
  * Iris Dataset
33
+ * Libsvm
27
34
  * MNIST database
35
+ * Mushroom
36
+ * Penguins
28
37
  * The Penn Treebank Project
38
+ * PMJT - Pre-Modern Japanese Text dataset list
39
+ * Postal Codes in Japan
40
+ * Rdatasets
41
+ * Seaborn
42
+ * Sudachi Synonym Dictionary
29
43
  * Wikipedia
30
44
  * Wine Dataset
31
45
 
@@ -135,6 +149,12 @@ end
135
149
 
136
150
  * [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
137
151
 
152
+ ## How to develop Red Datasets
153
+ 1. Fork https://github.com/red-data-tools/red-datasets
154
+ 2. Create a feature branch from master
155
+ 3. Develop in the feature branch
156
+ 4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
157
+
138
158
  ## License
139
159
 
140
160
  The MIT license. See `LICENSE.txt` for details.
data/Rakefile CHANGED
@@ -13,9 +13,64 @@ end
13
13
  helper.install
14
14
  spec = helper.gemspec
15
15
 
16
+ task default: :test
17
+
16
18
  desc "Run tests"
17
19
  task :test do
18
20
  ruby("test/run-test.rb")
19
21
  end
20
22
 
21
- task default: :test
23
+ desc "Generate an artifact for GitHub Pages"
24
+ task :pages do
25
+ pages_dir = "_site"
26
+ rm_rf(pages_dir)
27
+ mkdir_p(pages_dir)
28
+
29
+ require "cgi/util"
30
+ require_relative "lib/datasets/lazy"
31
+ File.open("#{pages_dir}/index.html", "w") do |index_html|
32
+ index_html.puts(<<-HTML)
33
+ <!DOCTYPE html>
34
+ <html>
35
+ <head>
36
+ <meta charset="UTF-8">
37
+ <title>Red Datasets</title>
38
+ <style>
39
+ table {
40
+ margin-left: 20vw;
41
+ min-width: 50%;
42
+ }
43
+ th {
44
+ font-size: 30px;
45
+ padding: 20px;
46
+ }
47
+ td {
48
+ border-bottom: 1px solid #D9DCE0;
49
+ padding: 20px;
50
+ font-weight: bold;
51
+ }
52
+ </style>
53
+ </head>
54
+ <body>
55
+ <section>
56
+ <h1>Red Datasets</h1>
57
+ <table>
58
+ <thead>
59
+ <tr><th>Available datasets</th></tr>
60
+ </thead>
61
+ <tbody>
62
+ HTML
63
+ Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
64
+ index_html.puts(<<-HTML)
65
+ <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
66
+ HTML
67
+ end
68
+ index_html.puts(<<-HTML)
69
+ </tbody>
70
+ </table>
71
+ </section>
72
+ </body>
73
+ </html>
74
+ HTML
75
+ end
76
+ end
data/doc/text/news.md CHANGED
@@ -1,5 +1,107 @@
1
1
  # News
2
2
 
3
+ ## 0.1.6 - 2023-05-24
4
+
5
+ ### Improvements
6
+
7
+ * Added support for lazy loading by `require "datasets/lazy"`.
8
+
9
+ * `Datasets::NagoyaUniversityConversationCorpus`: Added.
10
+ [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
11
+ [Patch by matsuura]
12
+
13
+ * `Datasets::Wikipedia`: Added support for downloading in background.
14
+
15
+ ### Thanks
16
+
17
+ * matsuura
18
+
19
+ ## 0.1.5 - 2022-09-22
20
+
21
+ ### Improvements
22
+
23
+ * `Datasets::PMJTDatasetList`: Added.
24
+ [GitHub#107][Patch by okadak]
25
+
26
+ * `Datasets::AozoraBunko`: Added.
27
+ [GitHub#108][Patch by Masa]
28
+
29
+ * Added how to develop to README
30
+ [GitHub#117][Patch by abcdefg-1234567]
31
+
32
+ * `Datasets::FuelEconomy`: Added.
33
+ [GitHub#114][Patch by Benson Muite]
34
+
35
+ * `Datasets::Geolonia`: Added.
36
+ [GitHub#118][Patch by abcdefg-1234567]
37
+
38
+ * `Datasets::Diamonds`: Added.
39
+ [GitHub#110][Patch by Benson Muite]
40
+
41
+ * `Datasets::ITACorpus`: Added.
42
+ [GitHub#119][Patch by abcdefg-1234567]
43
+
44
+ * `Datasets::KuzushijiMNIST`: Added.
45
+ [GitHub#125][Patch by abcdefg-1234567]
46
+
47
+ * Updated list of datasets in README.
48
+ [GitHub#129][Patch by Benson Muite]
49
+
50
+ * `Datasets::CaliforniaHousing`: Added.
51
+ [GitHub#123][Patch by Benson Muite]
52
+
53
+ * Added support for Ruby 3.1.
54
+ [GitHub#130][Patch by Benson Muite]
55
+
56
+ * `Datasets::AFINN`: Added.
57
+ [GitHub#120][Patch by Benson Muite]
58
+
59
+ * `Datasets::LivedoorNews`: Added.
60
+ [GitHub#127][Patch by abcdefg-1234567]
61
+
62
+ * `Datasets::SeabornDataList`: Added.
63
+ [GitHub#134][Patch by Hirokazu SUZUKI]
64
+
65
+ * `Datasets::WikipediaKyotoJapaneseEnglish`: Added.
66
+ [GitHub#135][Patch by abcdefg-1234567]
67
+
68
+ * Renamed Rdatasets to Rdataset.
69
+ [GitHub#148][Patch by Hirokazu SUZUKI]
70
+
71
+ * Removed support for Ruby 2.6.
72
+
73
+ * Add missing license information.
74
+
75
+ * `Datasets::QuoraDuplicateQuestionPair`: Added.
76
+ [GitHub#149][Patch by otegami]
77
+
78
+ ### Fixes
79
+
80
+ * Fixed key from nil to :index in `Datasets::SeabornData`.
81
+ [GitHub#133][Patch by Hirokazu SUZUKI]
82
+
83
+ * Fixed `Datasets::Rdatasets#each` to change "NA" to nil.
84
+ [GitHub#139][Patch by Hirokazu SUZUKI]
85
+
86
+ * Fixed `Datasets::Rdatasets#each` with mixed data of numeric and string.
87
+ [GitHub#140][Patch by Hirokazu SUZUKI]
88
+
89
+ ### Thanks
90
+
91
+ * okadak
92
+
93
+ * Masa
94
+
95
+ * Benson Muite
96
+
97
+ * abcdefg-1234567
98
+
99
+ * Hirokazu SUZUKI
100
+
101
+ * Sutou Kouhei
102
+
103
+ * otegami
104
+
3
105
  ## 0.1.4 - 2021-07-13
4
106
 
5
107
  ### Improvements
@@ -31,7 +31,8 @@ module Datasets
31
31
  @type = type
32
32
  @metadata.id = "adult-#{@type}"
33
33
  @metadata.name = "Adult: #{@type}"
34
- @metadata.url = "http://archive.ics.uci.edu/ml/datasets/adult"
34
+ @metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
35
+ @metadata.licenses = ["CC-BY-4.0"]
35
36
  @metadata.description = lambda do
36
37
  read_names
37
38
  end
@@ -58,10 +59,8 @@ module Datasets
58
59
  ext = "test"
59
60
  end
60
61
  data_path = cache_dir_path + "adult-#{ext}.csv"
61
- unless data_path.exist?
62
- data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
63
- download(data_path, data_url)
64
- end
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
63
+ download(data_path, data_url)
65
64
 
66
65
  options = {
67
66
  converters: [:numeric, lambda {|f| f.strip}],
@@ -74,10 +73,8 @@ module Datasets
74
73
 
75
74
  def read_names
76
75
  names_path = cache_dir_path + "adult.names"
77
- unless names_path.exist?
78
- names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
79
- download(names_path, names_url)
80
- end
76
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
77
+ download(names_path, names_url)
81
78
  names_path.read
82
79
  end
83
80
  end
@@ -0,0 +1,48 @@
1
+ require "csv"
2
+ require_relative "zip-extractor"
3
+
4
+ module Datasets
5
+ class AFINN < Dataset
6
+ Record = Struct.new(:word,
7
+ :valence)
8
+
9
+ def initialize
10
+ super()
11
+ @metadata.id = "afinn"
12
+ @metadata.name = "AFINN"
13
+ @metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
14
+ @metadata.licenses = ["ODbL-1.0"]
15
+ @metadata.description = lambda do
16
+ extract_file("AFINN/AFINN-README.txt") do |input|
17
+ readme = input.read
18
+ readme.force_encoding("UTF-8")
19
+ readme.
20
+ gsub(/^AFINN-96:.*?\n\n/m, "").
21
+ gsub(/^In Python.*$/m, "").
22
+ strip
23
+ end
24
+ end
25
+ end
26
+
27
+ def each
28
+ return to_enum(__method__) unless block_given?
29
+
30
+ extract_file("AFINN/AFINN-111.txt") do |input|
31
+ csv = CSV.new(input, col_sep: "\t", converters: :numeric)
32
+ csv.each do |row|
33
+ yield(Record.new(*row))
34
+ end
35
+ end
36
+ end
37
+
38
+ private
39
+ def extract_file(file_path, &block)
40
+ data_path = cache_dir_path + "imm6010.zip"
41
+ data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
42
+ download(data_path, data_url)
43
+
44
+ extractor = ZipExtractor.new(data_path)
45
+ extractor.extract_file(file_path, &block)
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,196 @@
1
+ require_relative 'dataset'
2
+ require_relative 'zip-extractor'
3
+
4
+ module Datasets
5
+ # Dataset for AozoraBunko
6
+ class AozoraBunko < Dataset
7
+ Book = Struct.new(
8
+ # 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
9
+ :title_id,
10
+ :title,
11
+ :title_reading,
12
+ :title_reading_collation,
13
+ :subtitle,
14
+ :subtitle_reading,
15
+ :original_title,
16
+ :first_appearance,
17
+ :ndc_code, # 分類番号(日本十進分類法の番号)
18
+ :syllabary_spelling_type,
19
+ :copyrighted,
20
+ :published_date,
21
+ :last_updated_date,
22
+ :detail_url,
23
+ # 人物ID, 姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
24
+ :person_id,
25
+ :person_family_name,
26
+ :person_first_name,
27
+ :person_family_name_reading,
28
+ :person_first_name_reading,
29
+ :person_family_name_reading_collation,
30
+ :person_first_name_reading_collation,
31
+ :person_family_name_romaji,
32
+ :person_first_name_romaji,
33
+ :person_type,
34
+ :person_birthday,
35
+ :person_date_of_death,
36
+ :person_copyrighted,
37
+ # 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
38
+ :original_book_name1,
39
+ :original_book_publisher_name1,
40
+ :original_book_first_published_date1,
41
+ :used_version_for_registration1,
42
+ :used_version_for_proofreading1,
43
+ :base_of_original_book_name1,
44
+ :base_of_original_book_publisher_name1,
45
+ :base_of_original_book_first_published_date1,
46
+ # 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
47
+ :original_book_name2,
48
+ :original_book_publisher_name2,
49
+ :original_book_first_published_date2,
50
+ :used_version_for_registration2,
51
+ :used_version_for_proofreading2,
52
+ :base_of_original_book_name2,
53
+ :base_of_original_book_publisher_name2,
54
+ :base_of_original_book_first_published_date2,
55
+ # 入力者,校正者,
56
+ :registered_person_name,
57
+ :proofreader_name,
58
+ # テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
59
+ :text_file_url,
60
+ :last_text_file_updated_date,
61
+ :text_file_character_encoding,
62
+ :text_file_character_set,
63
+ :text_file_updating_count,
64
+ # XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
65
+ :html_file_url,
66
+ :last_html_file_updated_date,
67
+ :html_file_character_encoding,
68
+ :html_file_character_set,
69
+ :html_file_updating_count
70
+ )
71
+
72
+ class Book
73
+ attr_writer :cache_path
74
+
75
+ def initialize(*args)
76
+ super
77
+ @text = nil
78
+ @html = nil
79
+ @cache_path = nil
80
+ end
81
+
82
+ alias_method :copyrighted?, :copyrighted
83
+ alias_method :person_copyrighted?, :person_copyrighted
84
+
85
+ def text
86
+ return @text unless @text.nil?
87
+ return @text if text_file_url.nil? || text_file_url.empty?
88
+
89
+ # when the URL is not a zip file, the user needs to open the web page in a browser and download the file manually
90
+ # e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
91
+ return @text unless text_file_url.end_with?('.zip')
92
+
93
+ downloader = Downloader.new(text_file_url)
94
+ downloader.download(text_file_output_path)
95
+
96
+ @text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
97
+ input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
98
+ end
99
+
100
+ @text
101
+ end
102
+
103
+ def html
104
+ return @html unless @html.nil?
105
+ return @html if html_file_url.nil? || html_file_url.empty?
106
+
107
+ downloader = Downloader.new(html_file_url)
108
+ downloader.download(html_file_output_path)
109
+ @html = File.read(html_file_output_path).encode(Encoding::UTF_8,
110
+ normalize_encoding(html_file_character_encoding))
111
+
112
+ @html
113
+ end
114
+
115
+ private
116
+
117
+ def text_file_output_path
118
+ cache_base_dir + text_file_name
119
+ end
120
+
121
+ def html_file_output_path
122
+ cache_base_dir + html_file_name
123
+ end
124
+
125
+ def text_file_name
126
+ text_file_url.split('/').last
127
+ end
128
+
129
+ def html_file_name
130
+ html_file_url.split('/').last
131
+ end
132
+
133
+ def cache_base_dir
134
+ @cache_path.base_dir + title_id + person_id
135
+ end
136
+
137
+ def normalize_encoding(encoding)
138
+ case encoding
139
+ when 'ShiftJIS'
140
+ Encoding::Shift_JIS
141
+ when 'UTF-8'
142
+ Encoding::UTF_8
143
+ else
144
+ encoding
145
+ end
146
+ end
147
+ end
148
+
149
+ def initialize
150
+ super()
151
+
152
+ @metadata.id = 'aozora-bunko'
153
+ @metadata.name = 'Aozora Bunko'
154
+ @metadata.url = 'https://www.aozora.gr.jp/'
155
+ @metadata.licenses = ['CC-BY-2.1-JP']
156
+ @metadata.description = <<~DESCRIPTION
157
+ Aozora Bunko is an activity to collect free electronic books that anyone can access
158
+ on the Internet like a library. The copyrighted works and the works that are said to be
159
+ "free to read" are available after being digitized in text and XHTML (some HTML) formats.
160
+ DESCRIPTION
161
+ end
162
+
163
+ def each
164
+ return to_enum(__method__) unless block_given?
165
+
166
+ open_data do |csv_file_stream|
167
+ text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has Byte Order Mark
168
+
169
+ CSV.parse(text, headers: true) do |row|
170
+ %w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
171
+ row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
172
+ end
173
+ book = Book.new(*row.fields)
174
+ book.cache_path = cache_path
175
+
176
+ yield(book)
177
+ end
178
+ end
179
+ end
180
+
181
+ private
182
+
183
+ def open_data(&block)
184
+ data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
185
+ data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
186
+ download(data_path, data_url)
187
+ ZipExtractor.new(data_path).extract_first_file do |input|
188
+ block.call(input)
189
+ end
190
+ end
191
+
192
+ def normalize_boolean(column_value)
193
+ column_value == 'あり'
194
+ end
195
+ end
196
+ end
@@ -0,0 +1,28 @@
1
+ module Datasets
2
+ class CachePath
3
+ def initialize(id)
4
+ @id = id
5
+ end
6
+
7
+ def base_dir
8
+ Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
9
+ end
10
+
11
+ def remove
12
+ FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
13
+ end
14
+
15
+ private
16
+
17
+ def system_cache_dir
18
+ case RUBY_PLATFORM
19
+ when /mswin/, /mingw/
20
+ ENV['LOCALAPPDATA'] || '~/AppData/Local'
21
+ when /darwin/
22
+ '~/Library/Caches'
23
+ else
24
+ ENV['XDG_CACHE_HOME'] || '~/.cache'
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,60 @@
1
+ require "csv"
2
+ require_relative 'zip-extractor'
3
+
4
+ module Datasets
5
+ class CaliforniaHousing < Dataset
6
+ Record = Struct.new(:median_house_value,
7
+ :median_income,
8
+ :housing_median_age,
9
+ :total_rooms,
10
+ :total_bedrooms,
11
+ :population,
12
+ :households,
13
+ :latitude,
14
+ :longitude)
15
+
16
+ def initialize
17
+ super()
18
+ @metadata.id = "california-housing"
19
+ @metadata.name = "California Housing"
20
+ @metadata.url = "http://lib.stat.cmu.edu/datasets/"
21
+ @metadata.licenses = ["CC0"]
22
+ @metadata.description = <<-DESCRIPTION
23
+ Housing information from the 1990 census used in
24
+ Pace, R. Kelley and Ronald Barry,
25
+ "Sparse Spatial Autoregressions",
26
+ Statistics and Probability Letters, 33 (1997) 291-297.
27
+ Available from http://lib.stat.cmu.edu/datasets/.
28
+ DESCRIPTION
29
+ end
30
+
31
+ def each
32
+ return to_enum(__method__) unless block_given?
33
+
34
+ data_path = cache_dir_path + "houses.zip"
35
+ data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
36
+ file_name = "cadata.txt"
37
+ download(data_path, data_url)
38
+ open_data(data_path, file_name) do |input|
39
+ data = ""
40
+ input.each_line do |line|
41
+ next unless line.start_with?(" ")
42
+ data << line.lstrip.gsub(/ +/, ",")
43
+ end
44
+ options = {
45
+ converters: [:numeric],
46
+ }
47
+ CSV.parse(data, **options) do |row|
48
+ yield(Record.new(*row))
49
+ end
50
+ end
51
+ end
52
+
53
+ private
54
+ def open_data(data_path, file_name)
55
+ ZipExtractor.new(data_path).extract_first_file do |input|
56
+ yield input
57
+ end
58
+ end
59
+ end
60
+ end
@@ -50,10 +50,8 @@ module Datasets
50
50
  return to_enum(__method__) unless block_given?
51
51
 
52
52
  data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
53
- unless data_path.exist?
54
- data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
55
- download(data_path, data_url)
56
- end
53
+ data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
54
+ download(data_path, data_url)
57
55
 
58
56
  parse_data(data_path, &block)
59
57
  end
@@ -42,10 +42,8 @@ module Datasets
42
42
  private
43
43
  def open_data
44
44
  data_path = cache_dir_path + "plurals.xml"
45
- unless data_path.exist?
46
- download(data_path, @metadata.url)
47
- end
48
- ::File.open(data_path) do |input|
45
+ download(data_path, @metadata.url)
46
+ data_path.open do |input|
49
47
  yield(input)
50
48
  end
51
49
  end
@@ -140,6 +140,7 @@ module Datasets
140
140
  @metadata.id = "communities"
141
141
  @metadata.name = "Communities"
142
142
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
143
+ @metadata.licenses = ["CC-BY-4.0"]
143
144
  @metadata.description = lambda do
144
145
  read_names
145
146
  end
@@ -177,10 +178,8 @@ module Datasets
177
178
 
178
179
  def open_data
179
180
  data_path = cache_dir_path + "communities.data"
180
- unless data_path.exist?
181
- data_url = "#{base_url}/communities.data"
182
- download(data_path, data_url)
183
- end
181
+ data_url = "#{base_url}/communities.data"
182
+ download(data_path, data_url)
184
183
  CSV.open(data_path) do |csv|
185
184
  yield(csv)
186
185
  end
@@ -188,10 +187,8 @@ module Datasets
188
187
 
189
188
  def read_names
190
189
  names_path = cache_dir_path + "communities.names"
191
- unless names_path.exist?
192
- names_url = "#{base_url}/communities.names"
193
- download(names_path, names_url)
194
- end
190
+ names_url = "#{base_url}/communities.names"
191
+ download(names_path, names_url)
195
192
  names_path.read
196
193
  end
197
194
  end