red-datasets 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8d18fa976f1b368a6a3f9cc85dc7a58a1785fd02901157672484f2a7d8b1fa88
-  data.tar.gz: c91d651a0d8de6722ee759ce29545f5f382d1e9f060c7e4ee5a0fcd557be4d21
+  metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
+  data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
 SHA512:
-  metadata.gz: 0ff7694dd27e4293206de81fc2a7b5ccccb886579ed73eb7f97d390472692ce310993e2ece741cf85f5fbe265f1deb2a7ea326590383b4bdf0d3f77f10b1bbc1
-  data.tar.gz: 38ac6aa12d3e33ab0c26c0750273b60386d90fd4d916776a0d561c3f25a79fa2d7d216ac465842207cd65f62e2fcbd348389e65f905583187fe23c30908d92dc
+  metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
+  data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
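These are the standard RubyGems package checksums for the metadata.gz and data.tar.gz archives inside the .gem file. As a rough illustration using only the Ruby standard library (the file path is hypothetical; the new SHA256 value is taken from the diff above), a local artifact could be verified like this:

    require "digest"

    # Sketch: compare a local metadata.gz against the SHA256 recorded
    # in checksums.yaml. Assumes the file sits in the current directory.
    expected = "0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa"
    actual = Digest::SHA256.file("metadata.gz").hexdigest
    raise "checksum mismatch" unless actual == expected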
data/README.md CHANGED
@@ -1,6 +1,5 @@
 # Red Datasets
 
-[![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
 [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
 
 ## Description
@@ -17,15 +16,30 @@ You can use datasets easily because you can access each dataset with multiple wa
 
 ## Available datasets
 
-TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
-
 * Adult Dataset
+* Aozora Bunko
+* California Housing
 * CIFAR-10 Dataset
 * CIFAR-100 Dataset
+* CLDR language plural rules
+* Communities and crime
+* Diamonds Dataset
+* E-Stat Japan
 * Fashion-MNIST
+* Fuel Economy Dataset
+* Geolonia Japanese Addresses
+* Hepatitis
 * Iris Dataset
+* Libsvm
 * MNIST database
+* Mushroom
+* Penguins
 * The Penn Treebank Project
+* PMJT - Pre-Modern Japanese Text dataset list
+* Postal Codes in Japan
+* Rdatasets
+* Seaborn
+* Sudachi Synonym Dictionary
 * Wikipedia
 * Wine Dataset
 
@@ -135,6 +149,12 @@ end
 
 * [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
 
+## How to develop Red Datasets
+1. Fork https://github.com/red-data-tools/red-datasets
+2. Create a feature branch from master
+3. Develop in the feature branch
+4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
+
 ## License
 
 The MIT license. See `LICENSE.txt` for details.
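All of the datasets listed in the README share one enumeration interface: each dataset class yields record structs from #each. A minimal usage sketch (Iris is used here because its record fields are documented in the gem's README):

    require "datasets"

    # Every dataset class is Enumerable and yields record structs.
    iris = Datasets::Iris.new
    iris.each do |record|
      p [record.sepal_length, record.label]
    end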
data/Rakefile CHANGED
@@ -13,9 +13,64 @@ end
 helper.install
 spec = helper.gemspec
 
+task default: :test
+
 desc "Run tests"
 task :test do
   ruby("test/run-test.rb")
 end
 
-task default: :test
+desc "Generate an artifact for GitHub Pages"
+task :pages do
+  pages_dir = "_site"
+  rm_rf(pages_dir)
+  mkdir_p(pages_dir)
+
+  require "cgi/util"
+  require_relative "lib/datasets/lazy"
+  File.open("#{pages_dir}/index.html", "w") do |index_html|
+    index_html.puts(<<-HTML)
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="UTF-8">
+    <title>Red Datasets</title>
+    <style>
+      table {
+        margin-left: 20vw;
+        min-width: 50%;
+      }
+      th {
+        font-size: 30px;
+        padding: 20px;
+      }
+      td {
+        border-bottom: 1px solid #D9DCE0;
+        padding: 20px;
+        font-weight: bold;
+      }
+    </style>
+  </head>
+  <body>
+    <section>
+      <h1>Red Datasets</h1>
+      <table>
+        <thead>
+          <tr><th>Available datasets</th></tr>
+        </thead>
+        <tbody>
+    HTML
+    Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
+      index_html.puts(<<-HTML)
+          <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
+      HTML
+    end
+    index_html.puts(<<-HTML)
+        </tbody>
+      </table>
+    </section>
+  </body>
+</html>
+    HTML
+  end
+end
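The :pages task above is also the first in-tree consumer of the new datasets/lazy entry point added in 0.1.6. A rough usage sketch, assuming only what the Rakefile itself relies on (the Datasets::LAZY_LOADER registry and on-demand loading of dataset constants):

    require "datasets/lazy"

    # Dataset constants are registered without loading their files yet.
    p Datasets::LAZY_LOADER.constant_names.sort.first(3)

    # Referencing a constant loads just that dataset's file on demand.
    iris = Datasets::Iris.new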
data/doc/text/news.md CHANGED
@@ -1,5 +1,107 @@
 # News
 
+## 0.1.6 - 2023-05-24
+
+### Improvements
+
+* Added support for lazy loading by `require "datasets/lazy"`.
+
+* `Datasets::NagoyaUniversityConversationCorpus`: Added.
+  [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
+  [Patch by matsuura]
+
+* `Datasets::Wikipedia`: Added support for downloading in the background.
+
+### Thanks
+
+* matsuura
+
+## 0.1.5 - 2022-09-22
+
+### Improvements
+
+* `Datasets::PMJTDatasetList`: Added.
+  [GitHub#107][Patch by okadak]
+
+* `Datasets::AozoraBunko`: Added.
+  [GitHub#108][Patch by Masa]
+
+* Added how to develop to the README.
+  [GitHub#117][Patch by abcdefg-1234567]
+
+* `Datasets::FuelEconomy`: Added.
+  [GitHub#114][Patch by Benson Muite]
+
+* `Datasets::Geolonia`: Added.
+  [GitHub#118][Patch by abcdefg-1234567]
+
+* `Datasets::Diamonds`: Added.
+  [GitHub#110][Patch by Benson Muite]
+
+* `Datasets::ITACorpus`: Added.
+  [GitHub#119][Patch by abcdefg-1234567]
+
+* `Datasets::KuzushijiMNIST`: Added.
+  [GitHub#125][Patch by abcdefg-1234567]
+
+* Updated the list of datasets in the README.
+  [GitHub#129][Patch by Benson Muite]
+
+* `Datasets::CaliforniaHousing`: Added.
+  [GitHub#123][Patch by Benson Muite]
+
+* Added support for Ruby 3.1.
+  [GitHub#130][Patch by Benson Muite]
+
+* `Datasets::AFINN`: Added.
+  [GitHub#120][Patch by Benson Muite]
+
+* `Datasets::LivedoorNews`: Added.
+  [GitHub#127][Patch by abcdefg-1234567]
+
+* `Datasets::SeabornDataList`: Added.
+  [GitHub#134][Patch by Hirokazu SUZUKI]
+
+* `Datasets::WikipediaKyotoJapaneseEnglish`: Added.
+  [GitHub#135][Patch by abcdefg-1234567]
+
+* Renamed `Datasets::Rdatasets` to `Datasets::Rdataset`.
+  [GitHub#148][Patch by Hirokazu SUZUKI]
+
+* Removed support for Ruby 2.6.
+
+* Added missing license information.
+
+* `Datasets::QuoraDuplicateQuestionPair`: Added.
+  [GitHub#149][Patch by otegami]
+
+### Fixes
+
+* Fixed the key from nil to :index in `Datasets::SeabornData`.
+  [GitHub#133][Patch by Hirokazu SUZUKI]
+
+* Fixed `Datasets::Rdatasets#each` to convert "NA" to nil.
+  [GitHub#139][Patch by Hirokazu SUZUKI]
+
+* Fixed `Datasets::Rdatasets#each` for data that mixes numeric and string values.
+  [GitHub#140][Patch by Hirokazu SUZUKI]
+
+### Thanks
+
+* okadak
+
+* Masa
+
+* Benson Muite
+
+* abcdefg-1234567
+
+* Hirokazu SUZUKI
+
+* Sutou Kouhei
+
+* otegami
+
 ## 0.1.4 - 2021-07-13
 
 ### Improvements
data/lib/datasets/adult.rb CHANGED
@@ -31,7 +31,8 @@ module Datasets
       @type = type
       @metadata.id = "adult-#{@type}"
       @metadata.name = "Adult: #{@type}"
-      @metadata.url = "http://archive.ics.uci.edu/ml/datasets/adult"
+      @metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = lambda do
         read_names
       end
@@ -58,10 +59,8 @@ module Datasets
         ext = "test"
       end
       data_path = cache_dir_path + "adult-#{ext}.csv"
-      unless data_path.exist?
-        data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
-        download(data_path, data_url)
-      end
+      data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
+      download(data_path, data_url)
 
       options = {
         converters: [:numeric, lambda {|f| f.strip}],
@@ -74,10 +73,8 @@ module Datasets
 
     def read_names
       names_path = cache_dir_path + "adult.names"
-      unless names_path.exist?
-        names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
-        download(names_path, names_url)
-      end
+      names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
+      download(names_path, names_url)
       names_path.read
     end
   end
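Throughout this release the per-dataset `unless data_path.exist?` guards (here and in cifar.rb, cldr-plurals.rb, and communities.rb below) are dropped because the existence check moved into the downloader itself. A minimal sketch of that pattern; the class name here is hypothetical, and the gem's real Datasets::Downloader (downloader.rb, +110 -30 in this diff) is more elaborate, e.g. it also gains background-download support:

    require "open-uri"
    require "pathname"

    # Sketch of why callers no longer need their own exist? guards:
    # the downloader returns early when the cache file is present.
    class SketchDownloader
      def initialize(url)
        @url = url
      end

      def download(output_path)
        output_path = Pathname(output_path)
        return if output_path.exist? # idempotent: a second call is a no-op

        output_path.dirname.mkpath
        URI.open(@url) do |input|
          output_path.open("wb") do |output|
            IO.copy_stream(input, output)
          end
        end
      end
    end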
data/lib/datasets/afinn.rb ADDED
@@ -0,0 +1,48 @@
+require "csv"
+require_relative "zip-extractor"
+
+module Datasets
+  class AFINN < Dataset
+    Record = Struct.new(:word,
+                        :valence)
+
+    def initialize
+      super()
+      @metadata.id = "afinn"
+      @metadata.name = "AFINN"
+      @metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
+      @metadata.licenses = ["ODbL-1.0"]
+      @metadata.description = lambda do
+        extract_file("AFINN/AFINN-README.txt") do |input|
+          readme = input.read
+          readme.force_encoding("UTF-8")
+          readme.
+            gsub(/^AFINN-96:.*?\n\n/m, "").
+            gsub(/^In Python.*$/m, "").
+            strip
+        end
+      end
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      extract_file("AFINN/AFINN-111.txt") do |input|
+        csv = CSV.new(input, col_sep: "\t", converters: :numeric)
+        csv.each do |row|
+          yield(Record.new(*row))
+        end
+      end
+    end
+
+    private
+    def extract_file(file_path, &block)
+      data_path = cache_dir_path + "imm6010.zip"
+      data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
+      download(data_path, data_url)
+
+      extractor = ZipExtractor.new(data_path)
+      extractor.extract_file(file_path, &block)
+    end
+  end
+end
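A short usage sketch of the class above (the valence field holds the AFINN-111 sentiment score, an integer between -5 and +5):

    require "datasets"

    # Iterate the word/valence pairs parsed from AFINN-111.txt.
    afinn = Datasets::AFINN.new
    afinn.each do |record|
      puts "#{record.word}\t#{record.valence}"
    end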
data/lib/datasets/aozora-bunko.rb ADDED
@@ -0,0 +1,196 @@
+require_relative 'dataset'
+require_relative 'zip-extractor'
+
+module Datasets
+  # Dataset for Aozora Bunko
+  class AozoraBunko < Dataset
+    Book = Struct.new(
+      # 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
+      :title_id,
+      :title,
+      :title_reading,
+      :title_reading_collation,
+      :subtitle,
+      :subtitle_reading,
+      :original_title,
+      :first_appearance,
+      :ndc_code, # 分類番号(日本十進分類法の番号)
+      :syllabary_spelling_type,
+      :copyrighted,
+      :published_date,
+      :last_updated_date,
+      :detail_url,
+      # 人物ID,姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
+      :person_id,
+      :person_family_name,
+      :person_first_name,
+      :person_family_name_reading,
+      :person_first_name_reading,
+      :person_family_name_reading_collation,
+      :person_first_name_reading_collation,
+      :person_family_name_romaji,
+      :person_first_name_romaji,
+      :person_type,
+      :person_birthday,
+      :person_date_of_death,
+      :person_copyrighted,
+      # 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
+      :original_book_name1,
+      :original_book_publisher_name1,
+      :original_book_first_published_date1,
+      :used_version_for_registration1,
+      :used_version_for_proofreading1,
+      :base_of_original_book_name1,
+      :base_of_original_book_publisher_name1,
+      :base_of_original_book_first_published_date1,
+      # 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
+      :original_book_name2,
+      :original_book_publisher_name2,
+      :original_book_first_published_date2,
+      :used_version_for_registration2,
+      :used_version_for_proofreading2,
+      :base_of_original_book_name2,
+      :base_of_original_book_publisher_name2,
+      :base_of_original_book_first_published_date2,
+      # 入力者,校正者,
+      :registered_person_name,
+      :proofreader_name,
+      # テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
+      :text_file_url,
+      :last_text_file_updated_date,
+      :text_file_character_encoding,
+      :text_file_character_set,
+      :text_file_updating_count,
+      # XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
+      :html_file_url,
+      :last_html_file_updated_date,
+      :html_file_character_encoding,
+      :html_file_character_set,
+      :html_file_updating_count
+    )
+
+    class Book
+      attr_writer :cache_path
+
+      def initialize(*args)
+        super
+        @text = nil
+        @html = nil
+        @cache_path = nil
+      end
+
+      alias_method :copyrighted?, :copyrighted
+      alias_method :person_copyrighted?, :person_copyrighted
+
+      def text
+        return @text unless @text.nil?
+        return @text if text_file_url.nil? || text_file_url.empty?
+
+        # When the URL is not a zip file, the page must be opened in a browser
+        # and the file downloaded manually.
+        # e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
+        return @text unless text_file_url.end_with?('.zip')
+
+        downloader = Downloader.new(text_file_url)
+        downloader.download(text_file_output_path)
+
+        @text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
+          input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
+        end
+
+        @text
+      end
+
+      def html
+        return @html unless @html.nil?
+        return @html if html_file_url.nil? || html_file_url.empty?
+
+        downloader = Downloader.new(html_file_url)
+        downloader.download(html_file_output_path)
+        @html = File.read(html_file_output_path).encode(Encoding::UTF_8,
+                                                        normalize_encoding(html_file_character_encoding))
+
+        @html
+      end
+
+      private
+
+      def text_file_output_path
+        cache_base_dir + text_file_name
+      end
+
+      def html_file_output_path
+        cache_base_dir + html_file_name
+      end
+
+      def text_file_name
+        text_file_url.split('/').last
+      end
+
+      def html_file_name
+        html_file_url.split('/').last
+      end
+
+      def cache_base_dir
+        @cache_path.base_dir + title_id + person_id
+      end
+
+      def normalize_encoding(encoding)
+        case encoding
+        when 'ShiftJIS'
+          Encoding::Shift_JIS
+        when 'UTF-8'
+          Encoding::UTF_8
+        else
+          encoding
+        end
+      end
+    end
+
+    def initialize
+      super()
+
+      @metadata.id = 'aozora-bunko'
+      @metadata.name = 'Aozora Bunko'
+      @metadata.url = 'https://www.aozora.gr.jp/'
+      @metadata.licenses = 'CC-BY-2.1-JP'
+      @metadata.description = <<~DESCRIPTION
+        Aozora Bunko is an activity to collect free electronic books that anyone can access
+        on the Internet like a library. The copyrighted works and the works that are said to be
+        "free to read" are available after being digitized in text and XHTML (some HTML) formats.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |csv_file_stream|
+        text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has a Byte Order Mark
+
+        CSV.parse(text, headers: true) do |row|
+          %w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
+            row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
+          end
+          book = Book.new(*row.fields)
+          book.cache_path = cache_path
+
+          yield(book)
+        end
+      end
+    end
+
+    private
+
+    def open_data(&block)
+      data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
+      data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
+      download(data_path, data_url)
+      ZipExtractor.new(data_path).extract_first_file do |input|
+        block.call(input)
+      end
+    end
+
+    def normalize_boolean(column_value)
+      column_value == 'あり'
+    end
+  end
+end
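A brief usage sketch of the class above; the accessors follow the Book struct definition, and calling text triggers the download-and-extract path shown in Book#text:

    require "datasets"

    books = Datasets::AozoraBunko.new
    book = books.first            # Dataset is Enumerable
    book.title                    # work title from the index CSV
    book.person_copyrighted?      # normalized from the あり ("present") flag
    book.text                     # downloads, caches, and decodes the full text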
data/lib/datasets/cache-path.rb ADDED
@@ -0,0 +1,28 @@
+module Datasets
+  class CachePath
+    def initialize(id)
+      @id = id
+    end
+
+    def base_dir
+      Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
+    end
+
+    def remove
+      FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
+    end
+
+    private
+
+    def system_cache_dir
+      case RUBY_PLATFORM
+      when /mswin/, /mingw/
+        ENV['LOCALAPPDATA'] || '~/AppData/Local'
+      when /darwin/
+        '~/Library/Caches'
+      else
+        ENV['XDG_CACHE_HOME'] || '~/.cache'
+      end
+    end
+  end
+end
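For illustration, the locations this resolves to per platform (a sketch; the exact paths depend on the environment variables read above):

    require "datasets"

    cache_path = Datasets::CachePath.new("aozora-bunko")
    cache_path.base_dir
    # Linux:   ~/.cache/red-datasets/aozora-bunko (or under $XDG_CACHE_HOME)
    # macOS:   ~/Library/Caches/red-datasets/aozora-bunko
    # Windows: %LOCALAPPDATA%/red-datasets/aozora-bunko
    cache_path.remove  # deletes this dataset's cache directory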
data/lib/datasets/california-housing.rb ADDED
@@ -0,0 +1,60 @@
+require "csv"
+require_relative 'zip-extractor'
+
+module Datasets
+  class CaliforniaHousing < Dataset
+    Record = Struct.new(:median_house_value,
+                        :median_income,
+                        :housing_median_age,
+                        :total_rooms,
+                        :total_bedrooms,
+                        :population,
+                        :households,
+                        :latitude,
+                        :longitude)
+
+    def initialize
+      super()
+      @metadata.id = "california-housing"
+      @metadata.name = "California Housing"
+      @metadata.url = "http://lib.stat.cmu.edu/datasets/"
+      @metadata.licenses = ["CCO"]
+      @metadata.description = <<-DESCRIPTION
+      Housing information from the 1990 census used in
+      Pace, R. Kelley and Ronald Barry,
+      "Sparse Spatial Autoregressions",
+      Statistics and Probability Letters, 33 (1997) 291-297.
+      Available from http://lib.stat.cmu.edu/datasets/.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      data_path = cache_dir_path + "houses.zip"
+      data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
+      file_name = "cadata.txt"
+      download(data_path, data_url)
+      open_data(data_path, file_name) do |input|
+        data = ""
+        input.each_line do |line|
+          next unless line.start_with?(" ")
+          data << line.lstrip.gsub(/ +/, ",")
+        end
+        options = {
+          converters: [:numeric],
+        }
+        CSV.parse(data, **options) do |row|
+          yield(Record.new(*row))
+        end
+      end
+    end
+
+    private
+    def open_data(data_path, file_name)
+      ZipExtractor.new(data_path).extract_first_file do |input|
+        yield input
+      end
+    end
+  end
+end
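A short usage sketch of the record fields defined in the struct above:

    require "datasets"

    housing = Datasets::CaliforniaHousing.new
    housing.each do |record|
      p [record.median_house_value, record.median_income, record.population]
    end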
data/lib/datasets/cifar.rb CHANGED
@@ -50,10 +50,8 @@ module Datasets
       return to_enum(__method__) unless block_given?
 
       data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
-      unless data_path.exist?
-        data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
-        download(data_path, data_url)
-      end
+      data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
+      download(data_path, data_url)
 
       parse_data(data_path, &block)
     end
data/lib/datasets/cldr-plurals.rb CHANGED
@@ -42,10 +42,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "plurals.xml"
-      unless data_path.exist?
-        download(data_path, @metadata.url)
-      end
-      ::File.open(data_path) do |input|
+      download(data_path, @metadata.url)
+      data_path.open do |input|
         yield(input)
       end
     end
data/lib/datasets/communities.rb CHANGED
@@ -140,6 +140,7 @@ module Datasets
       @metadata.id = "communities"
       @metadata.name = "Communities"
       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = lambda do
         read_names
       end
@@ -177,10 +178,8 @@
 
     def open_data
       data_path = cache_dir_path + "communities.data"
-      unless data_path.exist?
-        data_url = "#{base_url}/communities.data"
-        download(data_path, data_url)
-      end
+      data_url = "#{base_url}/communities.data"
+      download(data_path, data_url)
       CSV.open(data_path) do |csv|
         yield(csv)
       end
@@ -188,10 +187,8 @@
 
     def read_names
       names_path = cache_dir_path + "communities.names"
-      unless names_path.exist?
-        names_url = "#{base_url}/communities.names"
-        download(names_path, names_url)
-      end
+      names_url = "#{base_url}/communities.names"
+      download(names_path, names_url)
       names_path.read
     end
   end