red-datasets 0.1.4 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
@@ -1,5 +1,6 @@
1
1
  require "pathname"
2
2
 
3
+ require_relative "cache-path"
3
4
  require_relative "downloader"
4
5
  require_relative "error"
5
6
  require_relative "metadata"
@@ -19,38 +20,72 @@ module Datasets
19
20
  end
20
21
 
21
22
  def clear_cache!
22
- if cache_dir_path.exist?
23
- FileUtils.rmtree(cache_dir_path.to_s, secure: true)
24
- end
23
+ cache_path.remove
25
24
  end
26
25
 
27
26
  private
27
+
28
28
  def cache_dir_path
29
- case RUBY_PLATFORM
30
- when /mswin/, /mingw/
31
- base_dir = ENV["LOCALAPPDATA"] || "~/AppData/Local"
32
- when /darwin/
33
- base_dir = "~/Library/Caches"
34
- else
35
- base_dir = ENV["XDG_CACHE_HOME"] || "~/.cache"
36
- end
37
- Pathname(base_dir).expand_path + "red-datasets" + metadata.id
29
+ cache_path.base_dir
38
30
  end
39
31
 
40
- def download(output_path, url)
32
+ def cache_path
33
+ @cache_path ||= CachePath.new(@metadata.id)
34
+ end
35
+
36
+ def download(output_path, url, &block)
41
37
  downloader = Downloader.new(url)
42
- downloader.download(output_path)
38
+ downloader.download(output_path, &block)
43
39
  end
44
40
 
45
- def extract_bz2(path)
46
- input, output = IO.pipe
47
- pid = spawn("bzcat", path.to_s, {:out => output})
48
- begin
49
- output.close
50
- yield(input)
51
- ensure
52
- input.close
53
- Process.waitpid(pid)
41
+ def extract_bz2(bz2)
42
+ case bz2
43
+ when Pathname, String
44
+ IO.pipe do |input, output|
45
+ pid = spawn("bzcat", bz2.to_s, {out: output})
46
+ begin
47
+ output.close
48
+ yield(input)
49
+ ensure
50
+ input.close
51
+ Process.waitpid(pid)
52
+ end
53
+ end
54
+ else
55
+ IO.pipe do |bz2_input, bz2_output|
56
+ IO.pipe do |plain_input, plain_output|
57
+ bz2_stop = false
58
+ bz2_thread = Thread.new do
59
+ begin
60
+ bz2.each do |chunk|
61
+ bz2_output.write(chunk)
62
+ bz2_output.flush
63
+ break if bz2_stop
64
+ end
65
+ rescue => error
66
+ message = "Failed to read bzcat input: " +
67
+ "#{error.class}: #{error.message}"
68
+ $stderr.puts(message)
69
+ ensure
70
+ bz2_output.close
71
+ end
72
+ end
73
+ begin
74
+ pid = spawn("bzcat", {in: bz2_input, out: plain_output})
75
+ begin
76
+ bz2_input.close
77
+ plain_output.close
78
+ yield(plain_input)
79
+ ensure
80
+ plain_input.close
81
+ Process.waitpid(pid)
82
+ end
83
+ ensure
84
+ bz2_stop = true
85
+ bz2_thread.join
86
+ end
87
+ end
88
+ end
54
89
  end
55
90
  end
56
91
  end
@@ -0,0 +1,26 @@
1
+ require_relative "ggplot2-dataset"
2
+
3
+ module Datasets
4
+ class Diamonds < Ggplot2Dataset
5
+ Record = Struct.new(:carat,
6
+ :cut,
7
+ :color,
8
+ :clarity,
9
+ :depth,
10
+ :table,
11
+ :price,
12
+ :x,
13
+ :y,
14
+ :z)
15
+
16
+ def initialize()
17
+ super("diamonds")
18
+ @metadata.id = "diamonds"
19
+ @metadata.name = "Diamonds"
20
+ @metadata.licenses = ["CC0-1.0"]
21
+ end
22
+
23
+ COLUMN_NAME_MAPPING = {
24
+ }
25
+ end
26
+ end
@@ -22,45 +22,115 @@ module Datasets
22
22
  end
23
23
  end
24
24
 
25
- def download(output_path)
26
- output_path.parent.mkpath
25
+ def download(output_path, &block)
26
+ if output_path.exist?
27
+ yield_chunks(output_path, &block) if block_given?
28
+ return
29
+ end
27
30
 
28
- headers = {"User-Agent" => "Red Datasets/#{VERSION}"}
29
- start = nil
30
31
  partial_output_path = Pathname.new("#{output_path}.partial")
31
- if partial_output_path.exist?
32
- start = partial_output_path.size
33
- headers["Range"] = "bytes=#{start}-"
34
- end
32
+ synchronize(output_path, partial_output_path) do
33
+ output_path.parent.mkpath
35
34
 
36
- start_http(@url, headers) do |response|
37
- if response.is_a?(Net::HTTPPartialContent)
38
- mode = "ab"
39
- else
35
+ n_retries = 0
36
+ n_max_retries = 5
37
+ begin
38
+ headers = {
39
+ "Accept-Encoding" => "identity",
40
+ "User-Agent" => "Red Datasets/#{VERSION}",
41
+ }
40
42
  start = nil
41
- mode = "wb"
42
- end
43
+ if partial_output_path.exist?
44
+ start = partial_output_path.size
45
+ headers["Range"] = "bytes=#{start}-"
46
+ end
47
+
48
+ start_http(@url, headers) do |response|
49
+ if response.is_a?(Net::HTTPPartialContent)
50
+ mode = "ab"
51
+ else
52
+ start = nil
53
+ mode = "wb"
54
+ end
43
55
 
44
- base_name = @url.path.split("/").last
45
- size_current = 0
46
- size_max = response.content_length
47
- if start
48
- size_current += start
49
- size_max += start
56
+ base_name = @url.path.split("/").last
57
+ size_current = 0
58
+ size_max = response.content_length
59
+ if start
60
+ size_current += start
61
+ size_max += start
62
+ if block_given? and n_retries.zero?
63
+ yield_chunks(partial_output_path, &block)
64
+ end
65
+ end
66
+ progress_reporter = ProgressReporter.new(base_name, size_max)
67
+ partial_output_path.open(mode) do |output|
68
+ response.read_body do |chunk|
69
+ size_current += chunk.bytesize
70
+ progress_reporter.report(size_current)
71
+ output.write(chunk)
72
+ yield(chunk) if block_given?
73
+ end
74
+ end
75
+ end
76
+ FileUtils.mv(partial_output_path, output_path)
77
+ rescue Net::ReadTimeout => error
78
+ n_retries += 1
79
+ retry if n_retries < n_max_retries
80
+ raise
81
+ rescue TooManyRedirects => error
82
+ last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
83
+ raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
50
84
  end
51
- progress_reporter = ProgressReporter.new(base_name, size_max)
52
- partial_output_path.open(mode) do |output|
53
- response.read_body do |chunk|
54
- size_current += chunk.bytesize
55
- progress_reporter.report(size_current)
56
- output.write(chunk)
85
+ end
86
+ end
87
+
88
+ private def synchronize(output_path, partial_output_path)
89
+ begin
90
+ Process.getpgid(Process.pid)
91
+ rescue NotImplementedError
92
+ return yield
93
+ end
94
+
95
+ lock_path = Pathname("#{output_path}.lock")
96
+ loop do
97
+ lock_path.parent.mkpath
98
+ begin
99
+ lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
100
+ rescue SystemCallError
101
+ valid_lock_path = true
102
+ begin
103
+ pid = Integer(lock_path.read.chomp, 10)
104
+ rescue ArgumentError
105
+ # The process that acquired the lock will be exited before
106
+ # it stores its process ID.
107
+ valid_lock_path = (lock_path.mtime > 10)
108
+ else
109
+ begin
110
+ Process.getpgid(pid)
111
+ rescue SystemCallError
112
+ # Process that acquired the lock doesn't exist
113
+ valid_lock_path = false
114
+ end
115
+ end
116
+ if valid_lock_path
117
+ sleep(1 + rand(10))
118
+ else
119
+ lock_path.delete
57
120
  end
121
+ retry
122
+ else
123
+ begin
124
+ lock.puts(Process.pid.to_s)
125
+ lock.flush
126
+ yield
127
+ ensure
128
+ lock.close
129
+ lock_path.delete
130
+ end
131
+ break
58
132
  end
59
133
  end
60
- FileUtils.mv(partial_output_path, output_path)
61
- rescue TooManyRedirects => error
62
- last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
63
- raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
64
134
  end
65
135
 
66
136
  private def start_http(url, headers, limit = 10, &block)
@@ -94,6 +164,16 @@ module Datasets
94
164
  end
95
165
  end
96
166
 
167
+ private def yield_chunks(path)
168
+ path.open("rb") do |output|
169
+ chunk_size = 1024 * 1024
170
+ chunk = ""
171
+ while output.read(chunk_size, chunk)
172
+ yield(chunk)
173
+ end
174
+ end
175
+ end
176
+
97
177
  class ProgressReporter
98
178
  def initialize(base_name, size_max)
99
179
  @base_name = base_name
@@ -74,6 +74,7 @@ module Datasets
74
74
  @metadata.id = "e-stat-japan-#{@api_version}"
75
75
  @metadata.name = "e-Stat API #{@api_version}"
76
76
  @metadata.url = @base_url
77
+ @metadata.licenses = ["CC-BY-4.0"]
77
78
  @metadata.description = "e-Stat API #{@api_version}"
78
79
 
79
80
  @id = id
@@ -214,7 +215,7 @@ module Datasets
214
215
  # even if error happens despite its error mapping.
215
216
  # So we can't avoid caching retrieved response from the api.
216
217
  # ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
217
- download(@data_path, @url.to_s) unless @data_path.exist?
218
+ download(@data_path, @url.to_s)
218
219
  end
219
220
 
220
221
  def index_data
@@ -8,5 +8,9 @@ module Datasets
8
8
  def dataset_name
9
9
  "Fashion-MNIST"
10
10
  end
11
+
12
+ def licenses
13
+ ["MIT"]
14
+ end
11
15
  end
12
16
  end
@@ -0,0 +1,35 @@
1
+ require_relative "ggplot2-dataset"
2
+
3
+ module Datasets
4
+ class FuelEconomy < Ggplot2Dataset
5
+ Record = Struct.new(:manufacturer,
6
+ :model,
7
+ :displacement,
8
+ :year,
9
+ :n_cylinders,
10
+ :transmission,
11
+ :drive_train,
12
+ :city_mpg,
13
+ :highway_mpg,
14
+ :fuel,
15
+ :type)
16
+
17
+ def initialize
18
+ super("mpg")
19
+ @metadata.id = "fuel-economy"
20
+ @metadata.name = "Fuel economy"
21
+ @metadata.licenses = ["CC0-1.0"]
22
+ end
23
+
24
+ COLUMN_NAME_MAPPING = {
25
+ "displ" => "displacement",
26
+ "cyl" => "n_cylinders",
27
+ "trans" => "transmissions",
28
+ "drv" => "drive_train",
29
+ "cty" => "city_mpg",
30
+ "hwy" => "highway_mpg",
31
+ "fl" => "fuel",
32
+ "class" => "type",
33
+ }
34
+ end
35
+ end
@@ -0,0 +1,67 @@
1
require 'csv'

require_relative 'dataset'

module Datasets
  # Japanese address data published by Geolonia
  # (https://github.com/geolonia/japanese-addresses).
  class Geolonia < Dataset
    # One row of latest.csv: prefecture/municipality/street names in
    # kanji, kana and romaji, plus coordinates.
    Record = Struct.new(:prefecture_code,
                        :prefecture_name,
                        :prefecture_kana,
                        :prefecture_romaji,
                        :municipality_code,
                        :municipality_name,
                        :municipality_kana,
                        :municipality_romaji,
                        :street_name,
                        :street_kana,
                        :street_romaji,
                        :alias,
                        :latitude,
                        :longitude)

    def initialize
      super
      @metadata.id = 'geolonia'
      @metadata.name = 'Geolonia'
      @metadata.url = 'https://github.com/geolonia/japanese-addresses'
      @metadata.licenses = ["CC-BY-4.0"]
      # Description is fetched lazily from the project README.
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields one Record per address row; returns an Enumerator without a block.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        # The first line is the header row; skip it.
        csv.readline
        csv.each do |row|
          yield(Record.new(*row))
        end
      end
    end

    private
    def download_base_url
      "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
    end

    # Downloads (if needed) and opens the CSV, yielding the CSV object.
    def open_data
      csv_path = cache_dir_path + 'latest.csv'
      csv_url = "#{download_base_url}/data/latest.csv"
      download(csv_path, csv_url)
      CSV.open(csv_path) do |csv|
        yield(csv)
      end
    end

    # Returns the README text up to (excluding) the "## API" section.
    def fetch_readme
      base_name = "README.md"
      readme_path = cache_dir_path + base_name
      download(readme_path, "#{download_base_url}/#{base_name}")
      readme_path.read.split(/^## API/, 2)[0].strip
    end
  end
end
@@ -0,0 +1,79 @@
1
module Datasets
  # Base class for datasets bundled with ggplot2. Subclasses pass the
  # ggplot2 dataset name to #initialize and must define a Record struct
  # and a COLUMN_NAME_MAPPING hash (ggplot2 doc column name -> Record
  # attribute name).
  class Ggplot2Dataset < Dataset
    def initialize(ggplot2_dataset_name)
      super()
      @ggplot2_dataset_name = ggplot2_dataset_name
      @metadata.url =
        "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
      # Description is extracted lazily from ggplot2's R/data.R roxygen docs.
      @metadata.description = lambda do
        fetch_description
      end
    end

    # Yields one Record per CSV row; returns an Enumerator without a block.
    def each
      return to_enum(__method__) unless block_given?

      csv_base_name = "#{@ggplot2_dataset_name}.csv"
      csv_path = cache_dir_path + csv_base_name
      download(csv_path, "#{download_base_url}/data-raw/#{csv_base_name}")
      CSV.open(csv_path, headers: :first_row, converters: :all) do |csv|
        record_class = self.class::Record
        csv.each do |row|
          yield record_class.new(*row.fields)
        end
      end
    end

    private
    def download_base_url
      "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
    end

    # Scans ggplot2's R/data.R for the roxygen comment block that
    # precedes this dataset's name and returns it parsed to plain text.
    def fetch_description
      data_r_base_name = "data.R"
      data_r_path = cache_dir_path + data_r_base_name
      download(data_r_path, "#{download_base_url}/R/#{data_r_base_name}")
      descriptions = {}
      comment = ""
      File.open(data_r_path) do |data_r|
        data_r.each_line do |line|
          case line.chomp
          when /\A#'/
            comment_content = Regexp.last_match.post_match
            # Drop the single character (the space) that follows "#'".
            unless comment_content.empty?
              comment_content = comment_content[1..-1]
            end
            comment << comment_content
            comment << "\n"
          when /\A"(.+)"\z/
            # A quoted dataset name ends the preceding comment block.
            name = Regexp.last_match[1]
            descriptions[name] = parse_roxygen(comment.rstrip)
            comment = ""
          end
        end
        descriptions[@ggplot2_dataset_name]
      end
    end

    # Converts a roxygen comment to plain text: unwraps \url{},
    # strips @format, and flattens \describe{\item{...}{...}} lists
    # into "* name: description" bullets using COLUMN_NAME_MAPPING.
    def parse_roxygen(roxygen)
      column_name_mapping = self.class::COLUMN_NAME_MAPPING
      roxygen
        .gsub(/\\url\{(.*?)\}/, "\\1")
        .gsub(/^@format /, "")
        .gsub(/\\describe\{(.*)\}/m) do
          items = $1
          items.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
            column_name = $1
            description = $2
            column_name = column_name_mapping[column_name] || column_name
            description = description
              .gsub(/\\\$/, "$")
            "* #{column_name}: #{description}"
          end
        end
    end
  end
end
@@ -163,6 +163,7 @@ module Datasets
163
163
  @metadata.id = "hepatitis"
164
164
  @metadata.name = "Hepatitis"
165
165
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
166
+ @metadata.licenses = ["CC-BY-4.0"]
166
167
  @metadata.description = lambda do
167
168
  read_names
168
169
  end
@@ -186,10 +187,8 @@ module Datasets
186
187
 
187
188
  def open_data
188
189
  data_path = cache_dir_path + "hepatitis.csv"
189
- unless data_path.exist?
190
- data_url = "#{base_url}/hepatitis.data"
191
- download(data_path, data_url)
192
- end
190
+ data_url = "#{base_url}/hepatitis.data"
191
+ download(data_path, data_url)
193
192
  CSV.open(data_path) do |csv|
194
193
  yield(csv)
195
194
  end
@@ -197,10 +196,8 @@ module Datasets
197
196
 
198
197
  def read_names
199
198
  names_path = cache_dir_path + "hepatitis.names"
200
- unless names_path.exist?
201
- names_url = "#{base_url}/hepatitis.names"
202
- download(names_path, names_url)
203
- end
199
+ names_url = "#{base_url}/hepatitis.names"
200
+ download(names_path, names_url)
204
201
  names_path.read
205
202
  end
206
203
  end
data/lib/datasets/iris.rb CHANGED
@@ -15,6 +15,7 @@ module Datasets
15
15
  @metadata.id = "iris"
16
16
  @metadata.name = "Iris"
17
17
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
18
+ @metadata.licenses = ["CC-BY-4.0"]
18
19
  @metadata.description = lambda do
19
20
  read_names
20
21
  end
@@ -35,10 +36,8 @@ module Datasets
35
36
  private
36
37
  def open_data
37
38
  data_path = cache_dir_path + "iris.csv"
38
- unless data_path.exist?
39
- data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40
- download(data_path, data_url)
41
- end
39
+ data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40
+ download(data_path, data_url)
42
41
  CSV.open(data_path, converters: [:numeric]) do |csv|
43
42
  yield(csv)
44
43
  end
@@ -46,10 +45,8 @@ module Datasets
46
45
 
47
46
  def read_names
48
47
  names_path = cache_dir_path + "iris.names"
49
- unless names_path.exist?
50
- names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
51
- download(names_path, names_url)
52
- end
48
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
49
+ download(names_path, names_url)
53
50
  names_path.read
54
51
  end
55
52
  end
@@ -0,0 +1,57 @@
1
require_relative 'dataset'

module Datasets
  # ITA corpus: Japanese sentences for speech recording
  # (https://github.com/mmorise/ita-corpus).
  class ITACorpus < Dataset
    # One transcript line: an utterance id and its sentence.
    Record = Struct.new(:id,
                        :sentence)

    # type: :emotion (default) or :recitation — selects the transcript file.
    def initialize(type: :emotion)
      unless [:emotion, :recitation].include?(type)
        raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = 'ita-corpus'
      @metadata.name = 'ITA-corpus'
      @metadata.url = 'https://github.com/mmorise/ita-corpus'
      @metadata.licenses = ['Unlicense']
      # Description is fetched lazily from the project README.
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields one Record per transcript line; returns an Enumerator
    # without a block.
    def each(&block)
      return to_enum(__method__) unless block_given?

      base_name = "#{@type}_transcript_utf8.txt"
      data_path = cache_dir_path + base_name
      download(data_path, "#{download_base_url}/#{base_name}")

      parse_data(data_path, &block)
    end

    private
    # Returns the README text up to (excluding) the file-layout section.
    def fetch_readme
      readme_base_name = "README.md"
      readme_path = cache_dir_path + readme_base_name
      download(readme_path, "#{download_base_url}/#{readme_base_name}")
      readme_path.read.split(/^## ファイル構成/, 2)[0].strip
    end

    def download_base_url
      "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
    end

    # Each line is "id:sentence"; split on the first colon only.
    def parse_data(data_path)
      File.open(data_path) do |f|
        f.each_line(chomp: true) do |line|
          id, sentence = line.split(':', 2)
          yield(Record.new(id, sentence))
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
require_relative 'mnist'

module Datasets
  # Kuzushiji-MNIST: a drop-in replacement for MNIST made of cursive
  # Japanese (kuzushiji) characters, distributed by CODH. Reuses the
  # MNIST loader; only the download location and naming differ.
  class KuzushijiMNIST < MNIST
    BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"

    private
    # Human-readable name used in metadata.
    def dataset_name
      "Kuzushiji-MNIST"
    end

    def licenses
      ["CC-BY-SA-4.0"]
    end
  end
end