red-datasets 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
@@ -1,5 +1,6 @@
1
1
  require "pathname"
2
2
 
3
+ require_relative "cache-path"
3
4
  require_relative "downloader"
4
5
  require_relative "error"
5
6
  require_relative "metadata"
@@ -19,38 +20,72 @@ module Datasets
19
20
  end
20
21
 
21
22
  def clear_cache!
22
- if cache_dir_path.exist?
23
- FileUtils.rmtree(cache_dir_path.to_s, secure: true)
24
- end
23
+ cache_path.remove
25
24
  end
26
25
 
27
26
  private
27
+
28
28
  def cache_dir_path
29
- case RUBY_PLATFORM
30
- when /mswin/, /mingw/
31
- base_dir = ENV["LOCALAPPDATA"] || "~/AppData/Local"
32
- when /darwin/
33
- base_dir = "~/Library/Caches"
34
- else
35
- base_dir = ENV["XDG_CACHE_HOME"] || "~/.cache"
36
- end
37
- Pathname(base_dir).expand_path + "red-datasets" + metadata.id
29
+ cache_path.base_dir
38
30
  end
39
31
 
40
- def download(output_path, url)
32
# Returns the CachePath built from this dataset's metadata ID. The
# instance is created on first use and memoized in @cache_path.
def cache_path
  @cache_path ||= CachePath.new(@metadata.id)
end
35
+
36
# Downloads +url+ to +output_path+ with a Downloader. A given block is
# forwarded unchanged to Downloader#download.
def download(output_path, url, &block)
  Downloader.new(url).download(output_path, &block)
end
44
40
 
45
- def extract_bz2(path)
46
- input, output = IO.pipe
47
- pid = spawn("bzcat", path.to_s, {:out => output})
48
- begin
49
- output.close
50
- yield(input)
51
- ensure
52
- input.close
53
- Process.waitpid(pid)
41
# Decompresses bzip2 data with the external "bzcat" command and yields an
# IO from which the decompressed bytes can be read.
#
# +bz2+ is either a Pathname/String naming a compressed file, or any
# object whose #each yields compressed chunks. In the latter case the
# chunks are written to bzcat's standard input from a background thread.
def extract_bz2(bz2)
  if bz2.is_a?(Pathname) || bz2.is_a?(String)
    IO.pipe do |reader, writer|
      bzcat_pid = spawn("bzcat", bz2.to_s, {out: writer})
      begin
        # The child holds its own copy of the write end; close ours so
        # the reader can reach EOF once bzcat exits.
        writer.close
        yield(reader)
      ensure
        reader.close
        Process.waitpid(bzcat_pid)
      end
    end
  else
    IO.pipe do |compressed_reader, compressed_writer|
      IO.pipe do |plain_reader, plain_writer|
        stop_feeding = false
        feeder = Thread.new do
          begin
            bz2.each do |chunk|
              compressed_writer.write(chunk)
              compressed_writer.flush
              break if stop_feeding
            end
          rescue => error
            # Failures while feeding bzcat are reported, not raised.
            $stderr.puts("Failed to read bzcat input: " +
                         "#{error.class}: #{error.message}")
          ensure
            compressed_writer.close
          end
        end
        begin
          bzcat_pid = spawn("bzcat",
                            {in: compressed_reader, out: plain_writer})
          begin
            # Only the child needs these ends from here on.
            compressed_reader.close
            plain_writer.close
            yield(plain_reader)
          ensure
            plain_reader.close
            Process.waitpid(bzcat_pid)
          end
        ensure
          # Ask the feeder to stop after its current chunk, then wait.
          stop_feeding = true
          feeder.join
        end
      end
    end
  end
end
56
91
  end
@@ -0,0 +1,26 @@
1
+ require_relative "ggplot2-dataset"
2
+
3
module Datasets
  # The ggplot2 "diamonds" dataset.
  class Diamonds < Ggplot2Dataset
    # One row of the dataset.
    Record = Struct.new(
      :carat, :cut, :color, :clarity, :depth,
      :table, :price, :x, :y, :z
    )

    # Raw column name => Record member name. Empty: no column of this
    # dataset is renamed.
    COLUMN_NAME_MAPPING = {
    }

    def initialize
      super("diamonds")
      @metadata.id = "diamonds"
      @metadata.name = "Diamonds"
      @metadata.licenses = ["CC0-1.0"]
    end
  end
end
@@ -22,45 +22,115 @@ module Datasets
22
22
  end
23
23
  end
24
24
 
25
- def download(output_path)
26
- output_path.parent.mkpath
25
# Downloads @url to +output_path+.
#
# Data is first written to "<output_path>.partial" and moved into place
# when the transfer finishes. If a partial file already exists, a Range
# request is sent so that the transfer can resume.
#
# When a block is given, it receives the content chunk by chunk: from the
# existing file if +output_path+ already exists, otherwise the already
# downloaded part (on the first attempt of a resumed transfer) followed
# by each chunk read from the response.
#
# Net::ReadTimeout is retried until 5 attempts have failed, then
# re-raised. TooManyRedirects is re-raised with the original URL added to
# the message.
def download(output_path, &block)
  if output_path.exist?
    yield_chunks(output_path, &block) if block_given?
    return
  end

  partial_output_path = Pathname.new("#{output_path}.partial")
  synchronize(output_path, partial_output_path) do
    output_path.parent.mkpath

    n_retries = 0
    n_max_retries = 5
    begin
      headers = {
        "Accept-Encoding" => "identity",
        "User-Agent" => "Red Datasets/#{VERSION}",
      }
      start = nil
      if partial_output_path.exist?
        start = partial_output_path.size
        headers["Range"] = "bytes=#{start}-"
      end

      start_http(@url, headers) do |response|
        if response.is_a?(Net::HTTPPartialContent)
          mode = "ab"
        else
          # The server ignored the Range request: start over.
          start = nil
          mode = "wb"
        end

        base_name = @url.path.split("/").last
        size_current = 0
        # nil when the response has no Content-Length header.
        size_max = response.content_length
        if start
          size_current += start
          # Guard against nil so that a resumed response without
          # Content-Length doesn't raise NoMethodError.
          size_max += start if size_max
          if block_given? && n_retries.zero?
            yield_chunks(partial_output_path, &block)
          end
        end
        progress_reporter = ProgressReporter.new(base_name, size_max)
        partial_output_path.open(mode) do |output|
          response.read_body do |chunk|
            size_current += chunk.bytesize
            progress_reporter.report(size_current)
            output.write(chunk)
            yield(chunk) if block_given?
          end
        end
      end
      FileUtils.mv(partial_output_path, output_path)
    rescue Net::ReadTimeout
      n_retries += 1
      retry if n_retries < n_max_retries
      raise
    rescue TooManyRedirects => error
      last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
      raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
    end
  end
end
87
+
88
# Runs the given block while holding "<output_path>.lock", a lock file
# created exclusively and holding the owner's process ID.
#
# If the lock file already exists, its owner is checked:
# * the recorded process no longer exists: the lock is stale and removed;
# * no process ID is recorded yet: the lock is treated as valid only
#   while the file is at most 10 seconds old, because its owner may have
#   exited before writing the ID;
# * otherwise: sleep 1-10 seconds and try again.
#
# Where Process.getpgid is not implemented, the block is run without any
# locking. +partial_output_path+ is accepted but not used here.
private def synchronize(output_path, partial_output_path)
  begin
    Process.getpgid(Process.pid)
  rescue NotImplementedError
    return yield
  end

  stale_lock_seconds = 10
  lock_path = Pathname("#{output_path}.lock")
  loop do
    lock_path.parent.mkpath
    begin
      lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
    rescue SystemCallError
      valid_lock_path = true
      begin
        pid = Integer(lock_path.read.chomp, 10)
      rescue ArgumentError
        # The process that acquired the lock may have exited before it
        # stored its process ID. Compare the lock's age, not the raw
        # mtime: a Time can't be compared with an Integer.
        lock_age = Time.now - lock_path.mtime
        valid_lock_path = (lock_age <= stale_lock_seconds)
      else
        begin
          Process.getpgid(pid)
        rescue SystemCallError
          # Process that acquired the lock doesn't exist
          valid_lock_path = false
        end
      end
      if valid_lock_path
        sleep(1 + rand(10))
      else
        lock_path.delete
      end
      retry
    else
      begin
        lock.puts(Process.pid.to_s)
        lock.flush
        yield
      ensure
        lock.close
        lock_path.delete
      end
      break
    end
  end
end
65
135
 
66
136
  private def start_http(url, headers, limit = 10, &block)
@@ -94,6 +164,16 @@ module Datasets
94
164
  end
95
165
  end
96
166
 
167
# Reads the file at +path+ in binary mode and yields it in chunks of up
# to 1 MiB.
#
# Each chunk is a separate String, so callers may keep the yielded chunks
# (a shared read buffer would be overwritten by the next read and emptied
# at end of file).
private def yield_chunks(path)
  chunk_size = 1024 * 1024
  path.open("rb") do |input|
    while (chunk = input.read(chunk_size))
      yield(chunk)
    end
  end
end
176
+
97
177
  class ProgressReporter
98
178
  def initialize(base_name, size_max)
99
179
  @base_name = base_name
@@ -74,6 +74,7 @@ module Datasets
74
74
  @metadata.id = "e-stat-japan-#{@api_version}"
75
75
  @metadata.name = "e-Stat API #{@api_version}"
76
76
  @metadata.url = @base_url
77
+ @metadata.licenses = ["CC-BY-4.0"]
77
78
  @metadata.description = "e-Stat API #{@api_version}"
78
79
 
79
80
  @id = id
@@ -214,7 +215,7 @@ module Datasets
214
215
  # even if error happens despite its error mapping.
215
216
  # So we can't avoid caching retrieved response from the api.
216
217
  # ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
217
- download(@data_path, @url.to_s) unless @data_path.exist?
218
+ download(@data_path, @url.to_s)
218
219
  end
219
220
 
220
221
  def index_data
@@ -8,5 +8,9 @@ module Datasets
8
8
  def dataset_name
9
9
  "Fashion-MNIST"
10
10
  end
11
+
12
# Returns the license identifiers that apply to this dataset.
def licenses
  %w[MIT]
end
11
15
  end
12
16
  end
@@ -0,0 +1,35 @@
1
+ require_relative "ggplot2-dataset"
2
+
3
module Datasets
  # The ggplot2 "mpg" (fuel economy) dataset.
  class FuelEconomy < Ggplot2Dataset
    # One row of the dataset. Several members use longer names than the
    # raw columns; see COLUMN_NAME_MAPPING.
    Record = Struct.new(:manufacturer,
                        :model,
                        :displacement,
                        :year,
                        :n_cylinders,
                        :transmission,
                        :drive_train,
                        :city_mpg,
                        :highway_mpg,
                        :fuel,
                        :type)

    def initialize
      super("mpg")
      @metadata.id = "fuel-economy"
      @metadata.name = "Fuel economy"
      @metadata.licenses = ["CC0-1.0"]
    end

    # Raw column name => Record member name. Ggplot2Dataset#parse_roxygen
    # uses it to rename the columns listed in the description, so every
    # value must be the name of a Record member.
    COLUMN_NAME_MAPPING = {
      "displ" => "displacement",
      "cyl" => "n_cylinders",
      "trans" => "transmission",
      "drv" => "drive_train",
      "cty" => "city_mpg",
      "hwy" => "highway_mpg",
      "fl" => "fuel",
      "class" => "type",
    }
  end
end
@@ -0,0 +1,67 @@
1
+ require 'csv'
2
+
3
+ require_relative 'dataset'
4
+
5
module Datasets
  # Address data from the geolonia/japanese-addresses repository, read
  # from its data/latest.csv file.
  class Geolonia < Dataset
    # One CSV row; the row's fields are assigned to the members in order.
    Record = Struct.new(
      :prefecture_code,
      :prefecture_name,
      :prefecture_kana,
      :prefecture_romaji,
      :municipality_code,
      :municipality_name,
      :municipality_kana,
      :municipality_romaji,
      :street_name,
      :street_kana,
      :street_romaji,
      :alias,
      :latitude,
      :longitude
    )

    def initialize
      super
      @metadata.id = "geolonia"
      @metadata.name = "Geolonia"
      @metadata.url = "https://github.com/geolonia/japanese-addresses"
      @metadata.licenses = ["CC-BY-4.0"]
      # Evaluated lazily: the README is fetched only when it is needed.
      @metadata.description = -> { fetch_readme }
    end

    # Yields a Record per data row, or returns an Enumerator when no
    # block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        # The first row is read and discarded (presumably the header).
        csv.readline
        csv.each { |row| yield(Record.new(*row)) }
      end
    end

    private

    def download_base_url
      "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
    end

    # Downloads latest.csv into the cache directory and yields it opened
    # as a CSV.
    def open_data
      csv_path = cache_dir_path + "latest.csv"
      download(csv_path, "#{download_base_url}/data/latest.csv")
      CSV.open(csv_path) { |csv| yield(csv) }
    end

    # Returns the README text that precedes its "## API" heading.
    def fetch_readme
      base_name = "README.md"
      local_path = cache_dir_path + base_name
      download(local_path, "#{download_base_url}/#{base_name}")
      intro = local_path.read.split(/^## API/, 2).first
      intro.strip
    end
  end
end
@@ -0,0 +1,79 @@
1
module Datasets
  # Base class for datasets taken from the tidyverse/ggplot2 repository.
  #
  # A subclass passes the ggplot2 dataset name to #initialize and defines
  # two constants: Record (a Struct for one CSV row) and
  # COLUMN_NAME_MAPPING (raw column name => name shown in the
  # description).
  class Ggplot2Dataset < Dataset
    def initialize(ggplot2_dataset_name)
      super()
      @ggplot2_dataset_name = ggplot2_dataset_name
      @metadata.url =
        "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
      # Evaluated lazily: data.R is fetched only when it is needed.
      @metadata.description = -> { fetch_description }
    end

    # Yields one Record per CSV row, or returns an Enumerator when no
    # block is given.
    def each
      return to_enum(__method__) unless block_given?

      csv_name = "#{@ggplot2_dataset_name}.csv"
      csv_path = cache_dir_path + csv_name
      download(csv_path, "#{download_base_url}/data-raw/#{csv_name}")
      CSV.open(csv_path, headers: :first_row, converters: :all) do |csv|
        record_class = self.class::Record
        csv.each { |row| yield record_class.new(*row.fields) }
      end
    end

    private

    def download_base_url
      "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
    end

    # Downloads R/data.R and returns the parsed description of this
    # dataset (nil when data.R has no entry for it).
    #
    # Consecutive "#'" comment lines are collected; a line consisting of
    # a double-quoted name assigns the collected comment to that name.
    def fetch_description
      script_name = "data.R"
      script_path = cache_dir_path + script_name
      download(script_path, "#{download_base_url}/R/#{script_name}")
      documented = {}
      pending = ""
      File.open(script_path) do |script|
        script.each_line do |raw_line|
          line = raw_line.chomp
          if (match = /\A#'/.match(line))
            text = match.post_match
            # Drop the single character that follows "#'".
            text = text[1..-1] unless text.empty?
            pending << text << "\n"
          elsif (match = /\A"(.+)"\z/.match(line))
            documented[match[1]] = parse_roxygen(pending.rstrip)
            pending = ""
          end
        end
        documented[@ggplot2_dataset_name]
      end
    end

    # Converts a roxygen comment to plain text: \url{} wrappers and
    # "@format " prefixes are removed, and each \item{name}{text} inside
    # \describe{} becomes a "* name: text" line with the name translated
    # through COLUMN_NAME_MAPPING.
    def parse_roxygen(roxygen)
      renames = self.class::COLUMN_NAME_MAPPING
      without_urls = roxygen.gsub(/\\url\{(.*?)\}/, "\\1")
      without_format = without_urls.gsub(/^@format /, "")
      without_format.gsub(/\\describe\{(.*)\}/m) do
        items = Regexp.last_match(1)
        items.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
          raw_name = Regexp.last_match(1)
          text = Regexp.last_match(2).gsub(/\\\$/, "$")
          "* #{renames[raw_name] || raw_name}: #{text}"
        end
      end
    end
  end
end
@@ -163,6 +163,7 @@ module Datasets
163
163
  @metadata.id = "hepatitis"
164
164
  @metadata.name = "Hepatitis"
165
165
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
166
+ @metadata.licenses = ["CC-BY-4.0"]
166
167
  @metadata.description = lambda do
167
168
  read_names
168
169
  end
@@ -186,10 +187,8 @@ module Datasets
186
187
 
187
188
  def open_data
188
189
  data_path = cache_dir_path + "hepatitis.csv"
189
- unless data_path.exist?
190
- data_url = "#{base_url}/hepatitis.data"
191
- download(data_path, data_url)
192
- end
190
+ data_url = "#{base_url}/hepatitis.data"
191
+ download(data_path, data_url)
193
192
  CSV.open(data_path) do |csv|
194
193
  yield(csv)
195
194
  end
@@ -197,10 +196,8 @@ module Datasets
197
196
 
198
197
  def read_names
199
198
  names_path = cache_dir_path + "hepatitis.names"
200
- unless names_path.exist?
201
- names_url = "#{base_url}/hepatitis.names"
202
- download(names_path, names_url)
203
- end
199
+ names_url = "#{base_url}/hepatitis.names"
200
+ download(names_path, names_url)
204
201
  names_path.read
205
202
  end
206
203
  end
data/lib/datasets/iris.rb CHANGED
@@ -15,6 +15,7 @@ module Datasets
15
15
  @metadata.id = "iris"
16
16
  @metadata.name = "Iris"
17
17
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
18
+ @metadata.licenses = ["CC-BY-4.0"]
18
19
  @metadata.description = lambda do
19
20
  read_names
20
21
  end
@@ -35,10 +36,8 @@ module Datasets
35
36
  private
36
37
  def open_data
37
38
  data_path = cache_dir_path + "iris.csv"
38
- unless data_path.exist?
39
- data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40
- download(data_path, data_url)
41
- end
39
+ data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40
+ download(data_path, data_url)
42
41
  CSV.open(data_path, converters: [:numeric]) do |csv|
43
42
  yield(csv)
44
43
  end
@@ -46,10 +45,8 @@ module Datasets
46
45
 
47
46
  def read_names
48
47
  names_path = cache_dir_path + "iris.names"
49
- unless names_path.exist?
50
- names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
51
- download(names_path, names_url)
52
- end
48
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
49
+ download(names_path, names_url)
53
50
  names_path.read
54
51
  end
55
52
  end
@@ -0,0 +1,57 @@
1
+ require_relative 'dataset'
2
+
3
module Datasets
  # Sentences of the ITA corpus (mmorise/ita-corpus), available as the
  # :emotion or the :recitation transcript.
  class ITACorpus < Dataset
    # One transcript line, split at its first ":".
    Record = Struct.new(:id, :sentence)

    # type selects the transcript: :emotion (default) or :recitation.
    # Any other value raises ArgumentError.
    def initialize(type: :emotion)
      unless %i[emotion recitation].include?(type)
        raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = "ita-corpus"
      @metadata.name = "ITA-corpus"
      @metadata.url = "https://github.com/mmorise/ita-corpus"
      @metadata.licenses = ["Unlicense"]
      # Evaluated lazily: the README is fetched only when it is needed.
      @metadata.description = -> { fetch_readme }
    end

    # Yields a Record per transcript line, or returns an Enumerator when
    # no block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      transcript_name = "#{@type}_transcript_utf8.txt"
      transcript_path = cache_dir_path + transcript_name
      download(transcript_path, "#{download_base_url}/#{transcript_name}")

      parse_data(transcript_path, &block)
    end

    private

    # Returns the README text that precedes its "## ファイル構成" heading.
    def fetch_readme
      base_name = "README.md"
      local_path = cache_dir_path + base_name
      download(local_path, "#{download_base_url}/#{base_name}")
      intro = local_path.read.split(/^## ファイル構成/, 2).first
      intro.strip
    end

    def download_base_url
      "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
    end

    # Reads data_path line by line and yields a Record for each line.
    def parse_data(data_path)
      File.open(data_path) do |transcript|
        transcript.each_line(chomp: true) do |line|
          # At most two parts; missing parts become nil members.
          yield(Record.new(*line.split(":", 2)))
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require_relative 'mnist'
2
+
3
module Datasets
  # The Kuzushiji-MNIST dataset: an MNIST subclass that supplies its own
  # base URL, name, and license.
  class KuzushijiMNIST < MNIST
    BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"

    private

    # Returns the display name of this dataset.
    def dataset_name
      "Kuzushiji-MNIST"
    end

    # Returns the license identifiers that apply to this dataset.
    def licenses
      %w[CC-BY-SA-4.0]
    end
  end
end