red-datasets 0.1.4 → 0.1.5

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -2
  3. data/doc/text/news.md +86 -0
  4. data/lib/datasets/adult.rb +6 -9
  5. data/lib/datasets/afinn.rb +48 -0
  6. data/lib/datasets/aozora-bunko.rb +196 -0
  7. data/lib/datasets/cache-path.rb +28 -0
  8. data/lib/datasets/california-housing.rb +60 -0
  9. data/lib/datasets/cifar.rb +2 -4
  10. data/lib/datasets/cldr-plurals.rb +2 -4
  11. data/lib/datasets/communities.rb +5 -8
  12. data/lib/datasets/dataset.rb +8 -12
  13. data/lib/datasets/diamonds.rb +26 -0
  14. data/lib/datasets/downloader.rb +6 -1
  15. data/lib/datasets/e-stat-japan.rb +2 -1
  16. data/lib/datasets/fashion-mnist.rb +4 -0
  17. data/lib/datasets/fuel-economy.rb +35 -0
  18. data/lib/datasets/geolonia.rb +67 -0
  19. data/lib/datasets/ggplot2-dataset.rb +79 -0
  20. data/lib/datasets/hepatitis.rb +5 -8
  21. data/lib/datasets/iris.rb +5 -8
  22. data/lib/datasets/ita-corpus.rb +57 -0
  23. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  24. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  25. data/lib/datasets/libsvm.rb +3 -4
  26. data/lib/datasets/license.rb +26 -0
  27. data/lib/datasets/livedoor-news.rb +80 -0
  28. data/lib/datasets/metadata.rb +14 -0
  29. data/lib/datasets/mnist.rb +7 -7
  30. data/lib/datasets/mushroom.rb +5 -8
  31. data/lib/datasets/penguins.rb +4 -8
  32. data/lib/datasets/penn-treebank.rb +2 -4
  33. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  34. data/lib/datasets/postal-code-japan.rb +2 -6
  35. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  36. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  37. data/lib/datasets/seaborn.rb +90 -0
  38. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  39. data/lib/datasets/version.rb +1 -1
  40. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  41. data/lib/datasets/wikipedia.rb +4 -5
  42. data/lib/datasets/wine.rb +6 -9
  43. data/lib/datasets/zip-extractor.rb +36 -0
  44. data/lib/datasets.rb +14 -2
  45. data/red-datasets.gemspec +1 -1
  46. data/test/helper.rb +21 -0
  47. data/test/test-afinn.rb +60 -0
  48. data/test/test-aozora-bunko.rb +190 -0
  49. data/test/test-california-housing.rb +56 -0
  50. data/test/test-cldr-plurals.rb +1 -1
  51. data/test/test-dataset.rb +15 -7
  52. data/test/test-diamonds.rb +71 -0
  53. data/test/test-fuel-economy.rb +75 -0
  54. data/test/test-geolonia.rb +64 -0
  55. data/test/test-ita-corpus.rb +69 -0
  56. data/test/test-kuzushiji-mnist.rb +137 -0
  57. data/test/test-license.rb +24 -0
  58. data/test/test-livedoor-news.rb +351 -0
  59. data/test/test-metadata.rb +36 -0
  60. data/test/test-penguins.rb +1 -1
  61. data/test/test-pmjt-dataset-list.rb +50 -0
  62. data/test/test-quora-duplicate-question-pair.rb +33 -0
  63. data/test/test-rdataset.rb +246 -0
  64. data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
  65. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  66. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  67. metadata +58 -14
  68. data/lib/datasets/seaborn-data.rb +0 -49
  69. data/test/test-rdatasets.rb +0 -136
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8d18fa976f1b368a6a3f9cc85dc7a58a1785fd02901157672484f2a7d8b1fa88
-  data.tar.gz: c91d651a0d8de6722ee759ce29545f5f382d1e9f060c7e4ee5a0fcd557be4d21
+  metadata.gz: c1cfd18b589e4624178d9010ef68a100bb6e2573ccf18a9f96168af786523578
+  data.tar.gz: 67eddd22e10bf78c0b2cf10b18de289368d473d7b5ddf2a557cc2264834e32b0
 SHA512:
-  metadata.gz: 0ff7694dd27e4293206de81fc2a7b5ccccb886579ed73eb7f97d390472692ce310993e2ece741cf85f5fbe265f1deb2a7ea326590383b4bdf0d3f77f10b1bbc1
-  data.tar.gz: 38ac6aa12d3e33ab0c26c0750273b60386d90fd4d916776a0d561c3f25a79fa2d7d216ac465842207cd65f62e2fcbd348389e65f905583187fe23c30908d92dc
+  metadata.gz: 111243d3a1d3d758196bb71301ccb0f34beb1f5bec7c5c14b15f7c96fd6bdde924e30d90d3ace9e9258074411c9f7e7b4ef6bd9338dc5c11349534b2392f6f81
+  data.tar.gz: 9a9b426c753bd7e6cc12d452d61b90c2422fcad3b3c353a552c5c05a7c7fd53c3d4ac9cec2e33af1537d9e76e04f1df3d6d9b4baf043528fdde2ab4f9f203e9f
data/README.md CHANGED
@@ -17,15 +17,30 @@ You can use datasets easily because you can access each dataset with multiple wa
 
 ## Available datasets
 
-TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
-
 * Adult Dataset
+* Aozora Bunko
+* California Housing
 * CIFAR-10 Dataset
 * CIFAR-100 Dataset
+* CLDR language plural rules
+* Communities and crime
+* Diamonds Dataset
+* E-Stat Japan
 * Fashion-MNIST
+* Fuel Economy Dataset
+* Geolonia Japanese Addresses
+* Hepatitis
 * Iris Dataset
+* Libsvm
 * MNIST database
+* Mushroom
+* Penguins
 * The Penn Treebank Project
+* PMJT - Pre-Modern Japanese Text dataset list
+* Postal Codes in Japan
+* Rdatasets
+* Seaborn
+* Sudachi Synonym Dictionary
 * Wikipedia
 * Wine Dataset
 
@@ -135,6 +150,12 @@ end
 
 * [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
 
+## How to develop Red Datasets
+1. Fork https://github.com/red-data-tools/red-datasets
+2. Create a feature branch from master
+3. Develop in the feature branch
+4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
+
 ## License
 
 The MIT license. See `LICENSE.txt` for details.
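Every dataset added to this list is consumed the same way. A minimal sketch, assuming the standard Red Datasets `#each` interface that the library code below also implements; `Datasets::Penguins` is just an illustrative pick from the list:

```ruby
require "datasets"

# Minimal sketch: each dataset class yields one Struct-based record per row.
penguins = Datasets::Penguins.new
penguins.each do |record|
  p record.to_h # record members differ per dataset
  break         # stop after the first record
end
```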
data/doc/text/news.md CHANGED
@@ -1,5 +1,91 @@
 # News
 
+## 0.1.5 - 2022-09-22
+
+### Improvements
+
+  * `Datasets::PMJTDatasetList`: Added.
+    [GitHub#107][Patch by okadak]
+
+  * `Datasets::AozoraBunko`: Added.
+    [GitHub#108][Patch by Masa]
+
+  * Added how to develop to README.
+    [GitHub#117][Patch by abcdefg-1234567]
+
+  * `Datasets::FuelEconomy`: Added.
+    [GitHub#114][Patch by Benson Muite]
+
+  * `Datasets::Geolonia`: Added.
+    [GitHub#118][Patch by abcdefg-1234567]
+
+  * `Datasets::Diamonds`: Added.
+    [GitHub#110][Patch by Benson Muite]
+
+  * `Datasets::ITACorpus`: Added.
+    [GitHub#119][Patch by abcdefg-1234567]
+
+  * `Datasets::KuzushijiMNIST`: Added.
+    [GitHub#125][Patch by abcdefg-1234567]
+
+  * Updated list of datasets in README.
+    [GitHub#129][Patch by Benson Muite]
+
+  * `Datasets::CaliforniaHousing`: Added.
+    [GitHub#123][Patch by Benson Muite]
+
+  * Added support for Ruby 3.1.
+    [GitHub#130][Patch by Benson Muite]
+
+  * `Datasets::AFINN`: Added.
+    [GitHub#120][Patch by Benson Muite]
+
+  * `Datasets::LivedoorNews`: Added.
+    [GitHub#127][Patch by abcdefg-1234567]
+
+  * `Datasets::SeabornDataList`: Added.
+    [GitHub#134][Patch by Hirokazu SUZUKI]
+
+  * `Datasets::WikipediaKyotoJapaneseEnglish`: Added.
+    [GitHub#135][Patch by abcdefg-1234567]
+
+  * Renamed `Datasets::Rdatasets` to `Datasets::Rdataset`.
+    [GitHub#148][Patch by Hirokazu SUZUKI]
+
+  * Removed support for Ruby 2.6.
+
+  * Added missing license information.
+
+  * `Datasets::QuoraDuplicateQuestionPair`: Added.
+    [GitHub#149][Patch by otegami]
+
+### Fixes
+
+  * Fixed `Datasets::SeabornData` to use :index instead of nil as a key.
+    [GitHub#133][Patch by Hirokazu SUZUKI]
+
+  * Fixed `Datasets::Rdatasets#each` to convert "NA" to nil.
+    [GitHub#139][Patch by Hirokazu SUZUKI]
+
+  * Fixed `Datasets::Rdatasets#each` for data that mixes numeric and string values.
+    [GitHub#140][Patch by Hirokazu SUZUKI]
+
+### Thanks
+
+  * okadak
+
+  * Masa
+
+  * Benson Muite
+
+  * abcdefg-1234567
+
+  * Hirokazu SUZUKI
+
+  * Sutou Kouhei
+
+  * otegami
+
 ## 0.1.4 - 2021-07-13
 
 ### Improvements
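The Rdatasets → Rdataset rename and the "NA" fix above are both visible from user code. A minimal sketch, assuming the constructor keeps the package-name/dataset-name signature of the old `Rdatasets` class; "datasets"/"airquality" are illustrative arguments only:

```ruby
require "datasets"

# Minimal sketch of the renamed class (assumed signature, see lead-in).
airquality = Datasets::Rdataset.new("datasets", "airquality")
airquality.each do |record|
  # Since GitHub#139, "NA" cells arrive as nil instead of the string "NA".
  p record
  break
end
```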
data/lib/datasets/adult.rb CHANGED
@@ -31,7 +31,8 @@ module Datasets
       @type = type
       @metadata.id = "adult-#{@type}"
       @metadata.name = "Adult: #{@type}"
-      @metadata.url = "http://archive.ics.uci.edu/ml/datasets/adult"
+      @metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = lambda do
         read_names
       end
@@ -58,10 +59,8 @@ module Datasets
         ext = "test"
       end
       data_path = cache_dir_path + "adult-#{ext}.csv"
-      unless data_path.exist?
-        data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
-        download(data_path, data_url)
-      end
+      data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
+      download(data_path, data_url)
 
       options = {
         converters: [:numeric, lambda {|f| f.strip}],
@@ -74,10 +73,8 @@ module Datasets
 
     def read_names
       names_path = cache_dir_path + "adult.names"
-      unless names_path.exist?
-        names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
-        download(names_path, names_url)
-      end
+      names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
+      download(names_path, names_url)
       names_path.read
     end
   end
data/lib/datasets/afinn.rb ADDED
@@ -0,0 +1,48 @@
+require "csv"
+require_relative "zip-extractor"
+
+module Datasets
+  class AFINN < Dataset
+    Record = Struct.new(:word,
+                        :valence)
+
+    def initialize
+      super()
+      @metadata.id = "afinn"
+      @metadata.name = "AFINN"
+      @metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
+      @metadata.licenses = ["ODbL-1.0"]
+      @metadata.description = lambda do
+        extract_file("AFINN/AFINN-README.txt") do |input|
+          readme = input.read
+          readme.force_encoding("UTF-8")
+          readme.
+            gsub(/^AFINN-96:.*?\n\n/m, "").
+            gsub(/^In Python.*$/m, "").
+            strip
+        end
+      end
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      extract_file("AFINN/AFINN-111.txt") do |input|
+        csv = CSV.new(input, col_sep: "\t", converters: :numeric)
+        csv.each do |row|
+          yield(Record.new(*row))
+        end
+      end
+    end
+
+    private
+    def extract_file(file_path, &block)
+      data_path = cache_dir_path + "imm6010.zip"
+      data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
+      download(data_path, data_url)
+
+      extractor = ZipExtractor.new(data_path)
+      extractor.extract_file(file_path, &block)
+    end
+  end
+end
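Usage follows directly from the class above: `#each` yields `word`/`valence` records parsed from the tab-separated AFINN-111.txt inside the downloaded archive. A minimal sketch:

```ruby
require "datasets"

afinn = Datasets::AFINN.new
afinn.each do |record|
  # valence is already an Integer thanks to converters: :numeric
  puts "#{record.word}: #{record.valence}"
  break
end
```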
data/lib/datasets/aozora-bunko.rb ADDED
@@ -0,0 +1,196 @@
+require_relative 'dataset'
+require_relative 'zip-extractor'
+
+module Datasets
+  # Dataset for AozoraBunko
+  class AozoraBunko < Dataset
+    Book = Struct.new(
+      # 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
+      :title_id,
+      :title,
+      :title_reading,
+      :title_reading_collation,
+      :subtitle,
+      :subtitle_reading,
+      :original_title,
+      :first_appearance,
+      :ndc_code, # 分類番号(日本十進分類法の番号)
+      :syllabary_spelling_type,
+      :copyrighted,
+      :published_date,
+      :last_updated_date,
+      :detail_url,
+      # 人物ID,姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
+      :person_id,
+      :person_family_name,
+      :person_first_name,
+      :person_family_name_reading,
+      :person_first_name_reading,
+      :person_family_name_reading_collation,
+      :person_first_name_reading_collation,
+      :person_family_name_romaji,
+      :person_first_name_romaji,
+      :person_type,
+      :person_birthday,
+      :person_date_of_death,
+      :person_copyrighted,
+      # 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
+      :original_book_name1,
+      :original_book_publisher_name1,
+      :original_book_first_published_date1,
+      :used_version_for_registration1,
+      :used_version_for_proofreading1,
+      :base_of_original_book_name1,
+      :base_of_original_book_publisher_name1,
+      :base_of_original_book_first_published_date1,
+      # 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
+      :original_book_name2,
+      :original_book_publisher_name2,
+      :original_book_first_published_date2,
+      :used_version_for_registration2,
+      :used_version_for_proofreading2,
+      :base_of_original_book_name2,
+      :base_of_original_book_publisher_name2,
+      :base_of_original_book_first_published_date2,
+      # 入力者,校正者,
+      :registered_person_name,
+      :proofreader_name,
+      # テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
+      :text_file_url,
+      :last_text_file_updated_date,
+      :text_file_character_encoding,
+      :text_file_character_set,
+      :text_file_updating_count,
+      # XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
+      :html_file_url,
+      :last_html_file_updated_date,
+      :html_file_character_encoding,
+      :html_file_character_set,
+      :html_file_updating_count
+    )
+
+    class Book
+      attr_writer :cache_path
+
+      def initialize(*args)
+        super
+        @text = nil
+        @html = nil
+        @cache_path = nil
+      end
+
+      alias_method :copyrighted?, :copyrighted
+      alias_method :person_copyrighted?, :person_copyrighted
+
+      def text
+        return @text unless @text.nil?
+        return @text if text_file_url.nil? || text_file_url.empty?
+
+        # When the URL is not a zip file, the page must be opened in a browser
+        # and downloaded manually.
+        # e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
+        return @text unless text_file_url.end_with?('.zip')
+
+        downloader = Downloader.new(text_file_url)
+        downloader.download(text_file_output_path)
+
+        @text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
+          input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
+        end
+
+        @text
+      end
+
+      def html
+        return @html unless @html.nil?
+        return @html if html_file_url.nil? || html_file_url.empty?
+
+        downloader = Downloader.new(html_file_url)
+        downloader.download(html_file_output_path)
+        @html = File.read(html_file_output_path).encode(Encoding::UTF_8,
+                                                        normalize_encoding(html_file_character_encoding))
+
+        @html
+      end
+
+      private
+
+      def text_file_output_path
+        cache_base_dir + text_file_name
+      end
+
+      def html_file_output_path
+        cache_base_dir + html_file_name
+      end
+
+      def text_file_name
+        text_file_url.split('/').last
+      end
+
+      def html_file_name
+        html_file_url.split('/').last
+      end
+
+      def cache_base_dir
+        @cache_path.base_dir + title_id + person_id
+      end
+
+      def normalize_encoding(encoding)
+        case encoding
+        when 'ShiftJIS'
+          Encoding::Shift_JIS
+        when 'UTF-8'
+          Encoding::UTF_8
+        else
+          encoding
+        end
+      end
+    end
+
+    def initialize
+      super()
+
+      @metadata.id = 'aozora-bunko'
+      @metadata.name = 'Aozora Bunko'
+      @metadata.url = 'https://www.aozora.gr.jp/'
+      @metadata.licenses = 'CC-BY-2.1-JP'
+      @metadata.description = <<~DESCRIPTION
+        Aozora Bunko is an activity to collect free electronic books that anyone can access
+        on the Internet like a library. The copyrighted works and the works that are said to be
+        "free to read" are available after being digitized in text and XHTML (some HTML) formats.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |csv_file_stream|
+        text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has Byte Order Mark
+
+        CSV.parse(text, headers: true) do |row|
+          %w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
+            row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
+          end
+          book = Book.new(*row.fields)
+          book.cache_path = cache_path
+
+          yield(book)
+        end
+      end
+    end
+
+    private
+
+    def open_data(&block)
+      data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
+      data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
+      download(data_path, data_url)
+      ZipExtractor.new(data_path).extract_first_file do |input|
+        block.call(input)
+      end
+    end
+
+    def normalize_boolean(column_value)
+      column_value == 'あり'
+    end
+  end
+end
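Each yielded `Book` fetches its body lazily: `#text` downloads and decodes the zipped text file on first access (returning nil when only a non-zip URL is available), and the two copyright flags are normalized to booleans. A minimal sketch:

```ruby
require "datasets"

aozora = Datasets::AozoraBunko.new
aozora.each do |book|
  next if book.copyrighted? # 作品著作権フラグ, normalized to true/false
  puts book.title
  text = book.text          # downloads and caches on first call; may be nil
  puts text[0, 100] if text
  break
end
```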
data/lib/datasets/cache-path.rb ADDED
@@ -0,0 +1,28 @@
+module Datasets
+  class CachePath
+    def initialize(id)
+      @id = id
+    end
+
+    def base_dir
+      Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
+    end
+
+    def remove
+      FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
+    end
+
+    private
+
+    def system_cache_dir
+      case RUBY_PLATFORM
+      when /mswin/, /mingw/
+        ENV['LOCALAPPDATA'] || '~/AppData/Local'
+      when /darwin/
+        '~/Library/Caches'
+      else
+        ENV['XDG_CACHE_HOME'] || '~/.cache'
+      end
+    end
+  end
+end
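`CachePath` extracts the platform-specific cache-directory logic that used to live in `Dataset#cache_dir_path` (see the dataset.rb diff below). A minimal sketch of the resulting paths; the "iris" id is illustrative only:

```ruby
require "datasets"

cache_path = Datasets::CachePath.new("iris")
# Linux (XDG_CACHE_HOME unset): ~/.cache/red-datasets/iris
# macOS:                        ~/Library/Caches/red-datasets/iris
# Windows:                      %LOCALAPPDATA%/red-datasets/iris
puts cache_path.base_dir # expanded to an absolute Pathname
```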
data/lib/datasets/california-housing.rb ADDED
@@ -0,0 +1,60 @@
+require "csv"
+require_relative 'zip-extractor'
+
+module Datasets
+  class CaliforniaHousing < Dataset
+    Record = Struct.new(:median_house_value,
+                        :median_income,
+                        :housing_median_age,
+                        :total_rooms,
+                        :total_bedrooms,
+                        :population,
+                        :households,
+                        :latitude,
+                        :longitude)
+
+    def initialize
+      super()
+      @metadata.id = "california-housing"
+      @metadata.name = "California Housing"
+      @metadata.url = "http://lib.stat.cmu.edu/datasets/"
+      @metadata.licenses = ["CC0"]
+      @metadata.description = <<-DESCRIPTION
+        Housing information from the 1990 census used in
+        Pace, R. Kelley and Ronald Barry,
+        "Sparse Spatial Autoregressions",
+        Statistics and Probability Letters, 33 (1997) 291-297.
+        Available from http://lib.stat.cmu.edu/datasets/.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      data_path = cache_dir_path + "houses.zip"
+      data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
+      file_name = "cadata.txt"
+      download(data_path, data_url)
+      open_data(data_path, file_name) do |input|
+        data = ""
+        input.each_line do |line|
+          next unless line.start_with?(" ")
+          data << line.lstrip.gsub(/ +/, ",")
+        end
+        options = {
+          converters: [:numeric],
+        }
+        CSV.parse(data, **options) do |row|
+          yield(Record.new(*row))
+        end
+      end
+    end
+
+    private
+    def open_data(data_path, file_name)
+      ZipExtractor.new(data_path).extract_first_file do |input|
+        yield input
+      end
+    end
+  end
+end
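The parser above rewrites the whitespace-aligned cadata.txt into CSV on the fly, so consumers only see plain numeric records. A minimal sketch:

```ruby
require "datasets"

housing = Datasets::CaliforniaHousing.new
housing.each do |record|
  # All nine fields are numeric after the :numeric converter runs.
  p [record.median_house_value, record.median_income, record.households]
  break
end
```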
data/lib/datasets/cifar.rb CHANGED
@@ -50,10 +50,8 @@ module Datasets
       return to_enum(__method__) unless block_given?
 
       data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
-      unless data_path.exist?
-        data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
-        download(data_path, data_url)
-      end
+      data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
+      download(data_path, data_url)
 
       parse_data(data_path, &block)
     end
data/lib/datasets/cldr-plurals.rb CHANGED
@@ -42,10 +42,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "plurals.xml"
-      unless data_path.exist?
-        download(data_path, @metadata.url)
-      end
-      ::File.open(data_path) do |input|
+      download(data_path, @metadata.url)
+      data_path.open do |input|
        yield(input)
      end
    end
data/lib/datasets/communities.rb CHANGED
@@ -140,6 +140,7 @@ module Datasets
       @metadata.id = "communities"
       @metadata.name = "Communities"
       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = lambda do
         read_names
       end
@@ -177,10 +178,8 @@ module Datasets
 
     def open_data
       data_path = cache_dir_path + "communities.data"
-      unless data_path.exist?
-        data_url = "#{base_url}/communities.data"
-        download(data_path, data_url)
-      end
+      data_url = "#{base_url}/communities.data"
+      download(data_path, data_url)
       CSV.open(data_path) do |csv|
         yield(csv)
       end
@@ -188,10 +187,8 @@ module Datasets
 
     def read_names
       names_path = cache_dir_path + "communities.names"
-      unless names_path.exist?
-        names_url = "#{base_url}/communities.names"
-        download(names_path, names_url)
-      end
+      names_url = "#{base_url}/communities.names"
+      download(names_path, names_url)
       names_path.read
     end
   end
data/lib/datasets/dataset.rb CHANGED
@@ -1,5 +1,6 @@
 require "pathname"
 
+require_relative "cache-path"
 require_relative "downloader"
 require_relative "error"
 require_relative "metadata"
@@ -19,22 +20,17 @@ module Datasets
     end
 
     def clear_cache!
-      if cache_dir_path.exist?
-        FileUtils.rmtree(cache_dir_path.to_s, secure: true)
-      end
+      cache_path.remove
     end
 
     private
+
     def cache_dir_path
-      case RUBY_PLATFORM
-      when /mswin/, /mingw/
-        base_dir = ENV["LOCALAPPDATA"] || "~/AppData/Local"
-      when /darwin/
-        base_dir = "~/Library/Caches"
-      else
-        base_dir = ENV["XDG_CACHE_HOME"] || "~/.cache"
-      end
-      Pathname(base_dir).expand_path + "red-datasets" + metadata.id
+      cache_path.base_dir
+    end
+
+    def cache_path
+      @cache_path ||= CachePath.new(@metadata.id)
     end
 
     def download(output_path, url)
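With this change, cache management is fully delegated: `cache_dir_path` and `clear_cache!` are now thin wrappers over the `CachePath` shown earlier. A minimal sketch of the user-facing effect; `Datasets::Iris` is illustrative only:

```ruby
require "datasets"

dataset = Datasets::Iris.new
dataset.each { |record| break } # populates the per-dataset cache directory
dataset.clear_cache!            # now implemented as CachePath#remove
```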
data/lib/datasets/diamonds.rb ADDED
@@ -0,0 +1,26 @@
+require_relative "ggplot2-dataset"
+
+module Datasets
+  class Diamonds < Ggplot2Dataset
+    Record = Struct.new(:carat,
+                        :cut,
+                        :color,
+                        :clarity,
+                        :depth,
+                        :table,
+                        :price,
+                        :x,
+                        :y,
+                        :z)
+
+    def initialize()
+      super("diamonds")
+      @metadata.id = "diamonds"
+      @metadata.name = "Diamonds"
+      @metadata.licenses = ["CC0-1.0"]
+    end
+
+    COLUMN_NAME_MAPPING = {
+    }
+  end
+end
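`Diamonds` only declares its record layout and license; downloading and parsing come from the shared `Ggplot2Dataset` base class added in this release. A minimal sketch:

```ruby
require "datasets"

diamonds = Datasets::Diamonds.new
diamonds.each do |record|
  p [record.carat, record.cut, record.price]
  break
end
```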
data/lib/datasets/downloader.rb CHANGED
@@ -23,9 +23,14 @@ module Datasets
     end
 
     def download(output_path)
+      return if output_path.exist?
+
       output_path.parent.mkpath
 
-      headers = {"User-Agent" => "Red Datasets/#{VERSION}"}
+      headers = {
+        "Accept-Encoding" => "identity",
+        "User-Agent" => "Red Datasets/#{VERSION}",
+      }
       start = nil
       partial_output_path = Pathname.new("#{output_path}.partial")
       if partial_output_path.exist?
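This early return is why the `unless data_path.exist?` guards disappear throughout this release (adult.rb, cifar.rb, cldr-plurals.rb, communities.rb, e-stat-japan.rb): callers now call `download` unconditionally, and `Downloader` itself skips the work when the file is already cached. A minimal sketch; the path and URL are illustrative only:

```ruby
require "datasets"
require "pathname"

path = Pathname("tmp/adult.names").expand_path
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"

downloader = Datasets::Downloader.new(url)
downloader.download(path) # fetches and writes the file
downloader.download(path) # returns immediately: path already exists
```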
data/lib/datasets/e-stat-japan.rb CHANGED
@@ -74,6 +74,7 @@ module Datasets
       @metadata.id = "e-stat-japan-#{@api_version}"
       @metadata.name = "e-Stat API #{@api_version}"
       @metadata.url = @base_url
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = "e-Stat API #{@api_version}"
 
       @id = id
@@ -214,7 +215,7 @@ module Datasets
       # even if an error happens despite its error mapping.
       # So we can't avoid caching the response retrieved from the API.
       # ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
-      download(@data_path, @url.to_s) unless @data_path.exist?
+      download(@data_path, @url.to_s)
     end
 
     def index_data
data/lib/datasets/fashion-mnist.rb CHANGED
@@ -8,5 +8,9 @@ module Datasets
     def dataset_name
       "Fashion-MNIST"
     end
+
+    def licenses
+      ["MIT"]
+    end
   end
 end