red-datasets 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: c1cfd18b589e4624178d9010ef68a100bb6e2573ccf18a9f96168af786523578
- data.tar.gz: 67eddd22e10bf78c0b2cf10b18de289368d473d7b5ddf2a557cc2264834e32b0
+ metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
+ data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
  SHA512:
- metadata.gz: 111243d3a1d3d758196bb71301ccb0f34beb1f5bec7c5c14b15f7c96fd6bdde924e30d90d3ace9e9258074411c9f7e7b4ef6bd9338dc5c11349534b2392f6f81
- data.tar.gz: 9a9b426c753bd7e6cc12d452d61b90c2422fcad3b3c353a552c5c05a7c7fd53c3d4ac9cec2e33af1537d9e76e04f1df3d6d9b4baf043528fdde2ab4f9f203e9f
+ metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
+ data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
data/README.md CHANGED
@@ -1,6 +1,5 @@
  # Red Datasets
 
- [![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
  [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
 
  ## Description
data/Rakefile CHANGED
@@ -13,9 +13,64 @@ end
  helper.install
  spec = helper.gemspec
 
+ task default: :test
+
  desc "Run tests"
  task :test do
    ruby("test/run-test.rb")
  end
 
- task default: :test
+ desc "Generate an artifact for GitHub Pages"
+ task :pages do
+   pages_dir = "_site"
+   rm_rf(pages_dir)
+   mkdir_p(pages_dir)
+
+   require "cgi/util"
+   require_relative "lib/datasets/lazy"
+   File.open("#{pages_dir}/index.html", "w") do |index_html|
+     index_html.puts(<<-HTML)
+ <!DOCTYPE html>
+ <html>
+   <head>
+     <meta charset="UTF-8">
+     <title>Red Datasets</title>
+     <style>
+       table {
+         margin-left: 20vw;
+         min-width: 50%;
+       }
+       th {
+         font-size: 30px;
+         padding: 20px;
+       }
+       td {
+         border-bottom: 1px solid #D9DCE0;
+         padding: 20px;
+         font-weight: bold;
+       }
+     </style>
+   </head>
+   <body>
+     <section>
+       <h1>Red Datasets</h1>
+       <table>
+         <thead>
+           <tr><th>Available datasets</th></tr>
+         </thead>
+         <tbody>
+     HTML
+     Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
+       index_html.puts(<<-HTML)
+           <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
+       HTML
+     end
+     index_html.puts(<<-HTML)
+         </tbody>
+       </table>
+     </section>
+   </body>
+ </html>
+     HTML
+   end
+ end
data/doc/text/news.md CHANGED
@@ -1,5 +1,21 @@
  # News
 
+ ## 0.1.6 - 2023-05-24
+
+ ### Improvements
+
+ * Added support for lazy loading by `require "datasets/lazy"`.
+
+ * `Datasets::NagoyaUniversityConversationCorpus`: Added.
+   [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
+   [Patch by matsuura]
+
+ * `Datasets::Wikipedia`: Added support for downloading in the background.
+
+ ### Thanks
+
+ * matsuura
+
  ## 0.1.5 - 2022-09-22
 
  ### Improvements
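The lazy-loading entry is the headline change of this release: `require "datasets/lazy"` only registers constant names, and `Datasets.const_missing` requires the matching dataset file on first reference. A minimal usage sketch (assumes the gem is installed):

```ruby
require "datasets/lazy"

# No dataset file has been required yet. Referencing the constant
# triggers Datasets.const_missing, which requires "datasets/iris"
# and then returns the real class.
iris = Datasets::Iris.new
iris.each.first
```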
data/lib/datasets/dataset.rb CHANGED
@@ -33,20 +33,59 @@ module Datasets
        @cache_path ||= CachePath.new(@metadata.id)
      end
 
-     def download(output_path, url)
+     def download(output_path, url, &block)
        downloader = Downloader.new(url)
-       downloader.download(output_path)
+       downloader.download(output_path, &block)
      end
 
-     def extract_bz2(path)
-       input, output = IO.pipe
-       pid = spawn("bzcat", path.to_s, {:out => output})
-       begin
-         output.close
-         yield(input)
-       ensure
-         input.close
-         Process.waitpid(pid)
+     def extract_bz2(bz2)
+       case bz2
+       when Pathname, String
+         IO.pipe do |input, output|
+           pid = spawn("bzcat", bz2.to_s, {out: output})
+           begin
+             output.close
+             yield(input)
+           ensure
+             input.close
+             Process.waitpid(pid)
+           end
+         end
+       else
+         IO.pipe do |bz2_input, bz2_output|
+           IO.pipe do |plain_input, plain_output|
+             bz2_stop = false
+             bz2_thread = Thread.new do
+               begin
+                 bz2.each do |chunk|
+                   bz2_output.write(chunk)
+                   bz2_output.flush
+                   break if bz2_stop
+                 end
+               rescue => error
+                 message = "Failed to read bzcat input: " +
+                           "#{error.class}: #{error.message}"
+                 $stderr.puts(message)
+               ensure
+                 bz2_output.close
+               end
+             end
+             begin
+               pid = spawn("bzcat", {in: bz2_input, out: plain_output})
+               begin
+                 bz2_input.close
+                 plain_output.close
+                 yield(plain_input)
+               ensure
+                 plain_input.close
+                 Process.waitpid(pid)
+               end
+             ensure
+               bz2_stop = true
+               bz2_thread.join
+             end
+           end
+         end
        end
      end
    end
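For context, the `else` branch above streams an enumerable of compressed chunks into an external `bzcat` process through one pipe while handing the caller the readable end of a second pipe. A standalone sketch of that pattern, assuming `bzcat` is on `PATH` (`decompress_chunks` is an illustrative name, not part of the gem):

```ruby
# Feed in-memory bzip2 chunks to bzcat and yield a readable IO of the
# decompressed stream.
def decompress_chunks(chunks)
  IO.pipe do |bz2_input, bz2_output|
    IO.pipe do |plain_input, plain_output|
      writer = Thread.new do
        begin
          chunks.each { |chunk| bz2_output.write(chunk) }
        ensure
          bz2_output.close # EOF lets bzcat finish
        end
      end
      pid = spawn("bzcat", {in: bz2_input, out: plain_output})
      bz2_input.close
      plain_output.close
      yield(plain_input) # caller reads decompressed data here
      Process.waitpid(pid)
      writer.join
    end
  end
end
```

The gem's version additionally sets a stop flag and reports writer errors, so a failed chunk source does not hang the decompression pipeline.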
data/lib/datasets/downloader.rb CHANGED
@@ -22,50 +22,115 @@ module Datasets
        end
      end
 
-     def download(output_path)
-       return if output_path.exist?
-
-       output_path.parent.mkpath
+     def download(output_path, &block)
+       if output_path.exist?
+         yield_chunks(output_path, &block) if block_given?
+         return
+       end
 
-       headers = {
-         "Accept-Encoding" => "identity",
-         "User-Agent" => "Red Datasets/#{VERSION}",
-       }
-       start = nil
        partial_output_path = Pathname.new("#{output_path}.partial")
-       if partial_output_path.exist?
-         start = partial_output_path.size
-         headers["Range"] = "bytes=#{start}-"
-       end
+       synchronize(output_path, partial_output_path) do
+         output_path.parent.mkpath
 
-       start_http(@url, headers) do |response|
-         if response.is_a?(Net::HTTPPartialContent)
-           mode = "ab"
-         else
+         n_retries = 0
+         n_max_retries = 5
+         begin
+           headers = {
+             "Accept-Encoding" => "identity",
+             "User-Agent" => "Red Datasets/#{VERSION}",
+           }
           start = nil
-           mode = "wb"
-         end
+           if partial_output_path.exist?
+             start = partial_output_path.size
+             headers["Range"] = "bytes=#{start}-"
+           end
+
+           start_http(@url, headers) do |response|
+             if response.is_a?(Net::HTTPPartialContent)
+               mode = "ab"
+             else
+               start = nil
+               mode = "wb"
+             end
 
-         base_name = @url.path.split("/").last
-         size_current = 0
-         size_max = response.content_length
-         if start
-           size_current += start
-           size_max += start
+             base_name = @url.path.split("/").last
+             size_current = 0
+             size_max = response.content_length
+             if start
+               size_current += start
+               size_max += start
+               if block_given? and n_retries.zero?
+                 yield_chunks(partial_output_path, &block)
+               end
+             end
+             progress_reporter = ProgressReporter.new(base_name, size_max)
+             partial_output_path.open(mode) do |output|
+               response.read_body do |chunk|
+                 size_current += chunk.bytesize
+                 progress_reporter.report(size_current)
+                 output.write(chunk)
+                 yield(chunk) if block_given?
+               end
+             end
+           end
+           FileUtils.mv(partial_output_path, output_path)
+         rescue Net::ReadTimeout => error
+           n_retries += 1
+           retry if n_retries < n_max_retries
+           raise
+         rescue TooManyRedirects => error
+           last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
+           raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
          end
-         progress_reporter = ProgressReporter.new(base_name, size_max)
-         partial_output_path.open(mode) do |output|
-           response.read_body do |chunk|
-             size_current += chunk.bytesize
-             progress_reporter.report(size_current)
-             output.write(chunk)
+       end
+     end
+
+     private def synchronize(output_path, partial_output_path)
+       begin
+         Process.getpgid(Process.pid)
+       rescue NotImplementedError
+         return yield
+       end
+
+       lock_path = Pathname("#{output_path}.lock")
+       loop do
+         lock_path.parent.mkpath
+         begin
+           lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
+         rescue SystemCallError
+           valid_lock_path = true
+           begin
+             pid = Integer(lock_path.read.chomp, 10)
+           rescue ArgumentError
+             # The process that acquired the lock may have exited
+             # before it stored its process ID.
+             valid_lock_path = (Time.now - lock_path.mtime < 10)
+           else
+             begin
+               Process.getpgid(pid)
+             rescue SystemCallError
+               # The process that acquired the lock no longer exists.
+               valid_lock_path = false
+             end
+           end
+           if valid_lock_path
+             sleep(1 + rand(10))
+           else
+             lock_path.delete
            end
+           retry
+         else
+           begin
+             lock.puts(Process.pid.to_s)
+             lock.flush
+             yield
+           ensure
+             lock.close
+             lock_path.delete
+           end
+           break
          end
        end
-       FileUtils.mv(partial_output_path, output_path)
-     rescue TooManyRedirects => error
-       last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
-       raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
      end
 
      private def start_http(url, headers, limit = 10, &block)
@@ -99,6 +164,16 @@ module Datasets
        end
      end
 
+     private def yield_chunks(path)
+       path.open("rb") do |input|
+         chunk_size = 1024 * 1024
+         chunk = ""
+         while input.read(chunk_size, chunk)
+           yield(chunk)
+         end
+       end
+     end
+
      class ProgressReporter
        def initialize(base_name, size_max)
          @base_name = base_name
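The block form of `Downloader#download` is what the rest of this release builds on: chunks are yielded while the transfer is in flight, an interrupted partial file is replayed through `yield_chunks` before resuming, and a fully cached file short-circuits to `yield_chunks` alone. A hedged usage sketch (the URL is illustrative):

```ruby
require "pathname"
require "datasets/downloader"

downloader = Datasets::Downloader.new("https://example.com/large.csv")
bytes = 0
downloader.download(Pathname("/tmp/large.csv")) do |chunk|
  bytes += chunk.bytesize # consume each chunk during the download
end
puts "streamed #{bytes} bytes"
```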
data/lib/datasets/lazy.rb ADDED
@@ -0,0 +1,90 @@
+ require_relative "version"
+
+ module Datasets
+   class LazyLoader
+     def initialize
+       @constants = {}
+     end
+
+     def exist?(constant_name)
+       @constants.key?(constant_name)
+     end
+
+     def load(constant_name)
+       feature = @constants[constant_name]
+       raise LoadError, "unknown dataset: #{constant_name}" unless feature
+       require feature
+     end
+
+     def load_all
+       @constants.each_value do |feature|
+         require feature
+       end
+     end
+
+     def register(constant_name, feature)
+       @constants[constant_name] = feature
+     end
+
+     def constant_names
+       @constants.keys
+     end
+   end
+
+   LAZY_LOADER = LazyLoader.new
+
+   class << self
+     def const_missing(name)
+       if LAZY_LOADER.exist?(name)
+         LAZY_LOADER.load(name)
+         const_get(name)
+       else
+         super
+       end
+     end
+   end
+
+   LAZY_LOADER.register(:Adult, "datasets/adult")
+   LAZY_LOADER.register(:AFINN, "datasets/afinn")
+   LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
+   LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
+   LAZY_LOADER.register(:CIFAR, "datasets/cifar")
+   LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
+   LAZY_LOADER.register(:Communities, "datasets/communities")
+   LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
+   LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
+   LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
+   LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
+   LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
+   LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+   LAZY_LOADER.register(:Iris, "datasets/iris")
+   LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+   LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
+   LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
+   LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
+   LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
+   LAZY_LOADER.register(:MNIST, "datasets/mnist")
+   LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+   LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                        "datasets/nagoya-university-conversation-corpus")
+   LAZY_LOADER.register(:Penguins, "datasets/penguins")
+   LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
+   LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
+   LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
+   LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
+                        "datasets/quora-duplicate-question-pair")
+   LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
+   # For backward compatibility
+   LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
+   LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
+   # For backward compatibility
+   LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
+   LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
+   LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
+   LAZY_LOADER.register(:SudachiSynonymDictionary,
+                        "datasets/sudachi-synonym-dictionary")
+   LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
+   LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
+                        "datasets/wikipedia-kyoto-japanese-english")
+   LAZY_LOADER.register(:Wine, "datasets/wine")
+ end
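The registry also doubles as an index of every available dataset, which is exactly what the Rakefile's `:pages` task iterates. For example:

```ruby
require "datasets/lazy"

# Lists every dataset without loading any of them.
Datasets::LAZY_LOADER.constant_names.sort.each do |name|
  puts "Datasets::#{name}"
end
```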
data/lib/datasets/nagoya-university-conversation-corpus.rb ADDED
@@ -0,0 +1,109 @@
+ require_relative 'dataset'
+ require_relative 'zip-extractor'
+
+ module Datasets
+   class NagoyaUniversityConversationCorpus < Dataset
+     Data = Struct.new(
+       :name,
+       :date,
+       :place,
+       :participants,
+       :relationships,
+       :note,
+       :sentences
+     )
+
+     Participant = Struct.new(
+       :id,
+       :attribute,
+       :birthplace,
+       :residence
+     )
+
+     Sentence = Struct.new(:participant_id, :content) do
+       def end?
+         participant_id.nil? and content.nil?
+       end
+     end
+
+     def initialize
+       super()
+       @metadata.id = 'nagoya-university-conversation-corpus'
+       @metadata.name = 'Nagoya University Conversation Corpus'
+       @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+       @metadata.licenses = ['CC-BY-NC-ND-4.0']
+       @metadata.description = <<~DESCRIPTION
+         The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+         total about 100 hours of chatting among native speakers of Japanese,
+         which is converted into text.
+       DESCRIPTION
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input_stream|
+         yield(parse_file(input_stream))
+       end
+     end
+
+     private
+
+     def open_data
+       data_path = cache_dir_path + 'nucc.zip'
+       data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+       download(data_path, data_url)
+
+       extractor = ZipExtractor.new(data_path)
+       extractor.extract_files do |input_stream|
+         yield(input_stream)
+       end
+     end
+
+     def parse_file(input_stream)
+       data = Data.new
+       participants = []
+       sentences = []
+
+       input_stream.each do |input|
+         input.each_line(chomp: true) do |line|
+           line.force_encoding('utf-8')
+           if line.start_with?('@データ')
+             data.name = line[4..]
+           elsif line.start_with?('@収集年月日')
+             # mixed cases with and without ':'
+             data.date = line[6..].delete_prefix(':')
+           elsif line.start_with?('@場所')
+             data.place = line[4..]
+           elsif line.start_with?('@参加者の関係')
+             data.relationships = line.split(':', 2)[1]
+           elsif line.start_with?('@参加者')
+             participant = Participant.new
+             participant.id, profiles = line[4..].split(':', 2)
+             participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+             participants << participant
+           elsif line.start_with?('%com')
+             data.note = line.split(':', 2)[1]
+           elsif line == '@END'
+             sentence = Sentence.new
+             sentence.participant_id = nil
+             sentence.content = nil
+
+             sentences << sentence
+           else
+             sentence = Sentence.new
+             sentence.participant_id, sentence.content = line.split(':', 2)
+
+             sentences << sentence
+           end
+         end
+       end
+
+       data.participants = participants
+       data.sentences = sentences
+
+       data
+     end
+   end
+ end
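A short usage sketch of the new class (the upstream zip is downloaded and cached on first use, so network access is assumed):

```ruby
require "datasets/nagoya-university-conversation-corpus"

corpus = Datasets::NagoyaUniversityConversationCorpus.new
conversation = corpus.each.first
puts conversation.name # e.g. "1(約35分)"
conversation.sentences.each do |sentence|
  break if sentence.end? # sentinel record emitted at "@END"
  puts "#{sentence.participant_id}: #{sentence.content}"
end
```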
@@ -1,3 +1,5 @@
+ require "csv"
+
  require_relative "dataset"
 
  module Datasets
data/lib/datasets/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Datasets
-   VERSION = "0.1.5"
+   VERSION = "0.1.6"
  end
data/lib/datasets/wikipedia.rb CHANGED
@@ -53,13 +53,22 @@ module Datasets
      end
 
      private
+     def base_name
+       "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
+     end
+
+     def data_path
+       cache_dir_path + base_name
+     end
+
      def open_data(&block)
-       base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
-       data_path = cache_dir_path + base_name
        data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
-       download(data_path, data_url)
-
-       extract_bz2(data_path, &block)
+       bz2 = Enumerator.new do |yielder|
+         download(data_path, data_url) do |bz2_chunk|
+           yielder << bz2_chunk
+         end
+       end
+       extract_bz2(bz2, &block)
      end
 
      def type_in_path
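With `download` yielding chunks and `extract_bz2` accepting an enumerable, `open_data` now parses the dump while it is still being fetched instead of waiting for the whole `.xml.bz2` file first. A hedged sketch:

```ruby
require "datasets/wikipedia"

wikipedia = Datasets::Wikipedia.new(language: :en, type: :articles)
# The first page is available while the multi-gigabyte dump is still
# downloading in the background.
first_page = wikipedia.each.first
puts first_page.title # => "AccessibleComputing" (per the updated test)
```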
data/lib/datasets/zip-extractor.rb CHANGED
@@ -32,5 +32,17 @@ module Datasets
        end
        nil
      end
+
+     def extract_files
+       Zip::File.open(@path) do |zip_file|
+         zip_file.each do |entry|
+           next unless entry.file?
+
+           entry.get_input_stream do |input|
+             yield(input)
+           end
+         end
+       end
+     end
    end
  end
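`extract_files` complements the existing first-file helper by yielding an input stream for every file entry in the archive; the Nagoya corpus loader above iterates it once per conversation file. A minimal sketch (`example.zip` is hypothetical):

```ruby
require "datasets/zip-extractor"

extractor = Datasets::ZipExtractor.new("example.zip")
extractor.extract_files do |input|
  # Yielded once per file entry; directories are skipped.
  puts input.read.bytesize
end
```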
data/lib/datasets.rb CHANGED
@@ -1,34 +1,2 @@
- require_relative "datasets/version"
-
- require_relative "datasets/adult"
- require_relative "datasets/afinn"
- require_relative "datasets/aozora-bunko"
- require_relative "datasets/california-housing"
- require_relative "datasets/cifar"
- require_relative "datasets/cldr-plurals"
- require_relative "datasets/communities"
- require_relative "datasets/diamonds"
- require_relative "datasets/e-stat-japan"
- require_relative "datasets/fashion-mnist"
- require_relative "datasets/fuel-economy"
- require_relative "datasets/geolonia"
- require_relative "datasets/hepatitis"
- require_relative "datasets/iris"
- require_relative "datasets/ita-corpus"
- require_relative "datasets/kuzushiji-mnist"
- require_relative "datasets/libsvm"
- require_relative "datasets/libsvm-dataset-list"
- require_relative "datasets/livedoor-news"
- require_relative "datasets/mnist"
- require_relative "datasets/mushroom"
- require_relative "datasets/penguins"
- require_relative "datasets/penn-treebank"
- require_relative "datasets/pmjt-dataset-list"
- require_relative "datasets/postal-code-japan"
- require_relative "datasets/quora-duplicate-question-pair"
- require_relative "datasets/rdataset"
- require_relative "datasets/seaborn"
- require_relative "datasets/sudachi-synonym-dictionary"
- require_relative "datasets/wikipedia"
- require_relative "datasets/wikipedia-kyoto-japanese-english"
- require_relative "datasets/wine"
+ require_relative "datasets/lazy"
+ Datasets::LAZY_LOADER.load_all
data/test/test-geolonia.rb CHANGED
@@ -6,7 +6,7 @@ class GeoloniaTest < Test::Unit::TestCase
    test('#each') do
      records = @dataset.each.to_a
      assert_equal([
-                    277191,
+                    277616,
                     {
                       :prefecture_code => "01",
                       :prefecture_name => "北海道",
@@ -28,16 +28,16 @@ class GeoloniaTest < Test::Unit::TestCase
                       :prefecture_name => "沖縄県",
                       :prefecture_kana => "オキナワケン",
                       :prefecture_romaji => "OKINAWA KEN",
-                      :municipality_code => "47325",
-                      :municipality_name => "中頭郡嘉手納町",
-                      :municipality_kana => "ナカガミグンカデナチョウ",
-                      :municipality_romaji => "NAKAGAMI GUN KADENA CHO",
-                      :street_name => "字兼久",
+                      :municipality_code => "47382",
+                      :municipality_name => "八重山郡与那国町",
+                      :municipality_kana => "ヤエヤマグンヨナグニチョウ",
+                      :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
+                      :street_name => "字与那国",
                       :street_kana => nil,
                       :street_romaji => nil,
-                      :alias => "下原",
-                      :latitude => "26.351841",
-                      :longitude => "127.744975",
+                      :alias => nil,
+                      :latitude => "24.455925",
+                      :longitude => "122.987678",
                     },
                   ],
                   [
@@ -55,6 +55,7 @@ class GeoloniaTest < Test::Unit::TestCase
                    "## 住所データ仕様",
                    "### ファイルフォーマット",
                    "### 列",
+                   "### ソート順",
                  ],
                  description.scan(/^#.*$/),
                  description)
data/test/test-nagoya-university-conversation-corpus.rb ADDED
@@ -0,0 +1,132 @@
+ class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
+   def setup
+     @dataset = Datasets::NagoyaUniversityConversationCorpus.new
+   end
+
+   sub_test_case("each") do
+     test("#sentences") do
+       records = @dataset.each.to_a
+       first_sentences = records[0].sentences
+       last_sentences = records[-1].sentences
+       assert_equal([
+                      856,
+                      {
+                        participant_id: 'F107',
+                        content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
+                      },
+                      {
+                        participant_id: nil,
+                        content: nil
+                      },
+                      603,
+                      {
+                        participant_id: 'F007',
+                        content: 'それでは話を始めまーす。'
+                      },
+                      {
+                        participant_id: nil,
+                        content: nil
+                      }
+                    ],
+                    [
+                      first_sentences.size,
+                      first_sentences[0].to_h,
+                      first_sentences[-1].to_h,
+                      last_sentences.size,
+                      last_sentences[0].to_h,
+                      last_sentences[-1].to_h,
+                    ])
+     end
+
+     test("#participants") do
+       records = @dataset.each.to_a
+       first_participants = records[0].participants
+       last_participants = records[-1].participants
+       assert_equal([
+                      4,
+                      {
+                        id: 'F107',
+                        attribute: '女性30代後半',
+                        birthplace: '愛知県幡豆郡出身',
+                        residence: '愛知県幡豆郡在住'
+                      },
+                      {
+                        id: 'F128',
+                        attribute: '女性20代前半',
+                        birthplace: '愛知県西尾市出身',
+                        residence: '西尾市在住'
+                      },
+                      2,
+                      {
+                        id: 'F007',
+                        attribute: '女性50代後半',
+                        birthplace: '東京都出身',
+                        residence: '東京都国分寺市在住'
+                      },
+                      {
+                        id: 'F003',
+                        attribute: '女性80代後半',
+                        birthplace: '栃木県宇都宮市出身',
+                        residence: '国分寺市在住'
+                      }
+                    ],
+                    [
+                      first_participants.size,
+                      first_participants[0].to_h,
+                      first_participants[-1].to_h,
+                      last_participants.size,
+                      last_participants[0].to_h,
+                      last_participants[-1].to_h
+                    ])
+     end
+
+     test("others") do
+       records = @dataset.each.to_a
+       assert_equal([
+                      129,
+                      [
+                        '1(約35分)',
+                        '2001年10月16日',
+                        'ファミリーレストラン',
+                        '英会話教室の友人',
+                        nil
+                      ],
+                      [
+                        '129(36分)',
+                        '2003年2月16日',
+                        '二人の自宅',
+                        '母と娘',
+                        'F007は東京に38年、F003は東京に60年居住。'
+                      ]
+                    ],
+                    [
+                      records.size,
+                      [
+                        records[0].name,
+                        records[0].date,
+                        records[0].place,
+                        records[0].relationships,
+                        records[0].note
+                      ],
+                      [
+                        records[-1].name,
+                        records[-1].date,
+                        records[-1].place,
+                        records[-1].relationships,
+                        records[-1].note
+                      ]
+                    ])
+     end
+   end
+
+   sub_test_case("#metadata") do
+     test("#description") do
+       description = @dataset.metadata.description
+       assert_equal(<<~DESCRIPTION, description)
+         The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+         total about 100 hours of chatting among native speakers of Japanese,
+         which is converted into text.
+       DESCRIPTION
+     end
+   end
+ end
data/test/test-rdataset.rb CHANGED
@@ -8,7 +8,7 @@ class RdatasetTest < Test::Unit::TestCase
    test("with package_name") do
      records = @dataset.filter(package: "datasets").to_a
      assert_equal([
-                    84,
+                    102,
                     {
                       package: "datasets",
                       dataset: "ability.cov",
@@ -48,7 +48,7 @@ class RdatasetTest < Test::Unit::TestCase
    test("without package_name") do
      records = @dataset.each.to_a
      assert_equal([
-                    1892,
+                    2142,
                     {
                       package: "AER",
                       dataset: "Affairs",
data/test/test-seaborn.rb CHANGED
@@ -14,6 +14,7 @@ class SeabornTest < Test::Unit::TestCase
                     {dataset: "car_crashes"},
                     {dataset: "diamonds"},
                     {dataset: "dots"},
+                    {dataset: "dowjones"},
                     {dataset: "exercise"},
                     {dataset: "flights"},
                     {dataset: "fmri"},
data/test/test-sudachi-synonym-dictionary.rb CHANGED
@@ -6,7 +6,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
    test('#each') do
      records = @dataset.each.to_a
      assert_equal([
-                    65182,
+                    65206,
                     {
                       group_id: "000001",
                       is_noun: true,
@@ -19,7 +19,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                       notation: "曖昧",
                     },
                     {
-                      group_id: "024909",
+                      group_id: "024916",
                       is_noun: true,
                       expansion_type: :expanded,
                       lexeme_id: 1,
@@ -27,7 +27,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                       acronym_type: :alphabet,
                       variant_type: :typical,
                       categories: ["ビジネス"],
-                      notation: "BPO",
+                      notation: "SCM",
                     },
                   ],
                   [
data/test/test-wikipedia.rb CHANGED
@@ -1,100 +1,54 @@
  class WikipediaTest < Test::Unit::TestCase
-   sub_test_case("ja") do
+   sub_test_case("en") do
      sub_test_case("articles") do
-       include Helper::Sandbox
-
        def setup
-         setup_sandbox
-         @dataset = Datasets::Wikipedia.new(language: :ja,
+         @dataset = Datasets::Wikipedia.new(language: :en,
                                             type: :articles)
-         def @dataset.cache_dir_path
-           @cache_dir_path
-         end
-         def @dataset.cache_dir_path=(path)
-           @cache_dir_path = path
-         end
-         @dataset.cache_dir_path = @tmp_dir
-       end
-
-       def teardown
-         teardown_sandbox
        end
 
        test("#each") do
-         def @dataset.download(output_path, url)
-           xml_path = output_path.sub_ext("")
-           xml_path.open("w") do |xml_file|
-             xml_file.puts(<<-XML)
- <mediawiki
-     xmlns="http://www.mediawiki.org/xml/export-0.10/"
-     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-     xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
-     version="0.10" xml:lang="ja">
-   <siteinfo>
-     <sitename>Wikipedia</sitename>
-   </siteinfo>
-   <page>
-     <title>タイトル</title>
-     <ns>4</ns>
-     <id>1</id>
-     <restrictions>sysop</restrictions>
-     <revision>
-       <id>3</id>
-       <parentid>2</parentid>
-       <timestamp>2004-04-30T14:46:00Z</timestamp>
-       <contributor>
-         <username>user</username>
-         <id>10</id>
-       </contributor>
-       <minor />
-       <comment>コメント</comment>
-       <model>wikitext</model>
-       <format>text/x-wiki</format>
-       <text xml:space="preserve">テキスト</text>
-       <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
-     </revision>
-   </page>
- </mediawiki>
-             XML
-           end
-           unless system("bzip2", xml_path.to_s)
-             raise "failed to run bzip2"
-           end
-         end
-
-         contributor = Datasets::Wikipedia::Contributor.new("user", 10)
+         contributor = Datasets::Wikipedia::Contributor.new("Elli", 20842734)
          revision = Datasets::Wikipedia::Revision.new
-         revision.id = 3
-         revision.parent_id = 2
-         revision.timestamp = Time.iso8601("2004-04-30T14:46:00Z")
+         revision.id = 1002250816
+         revision.parent_id = 854851586
+         revision.timestamp = Time.iso8601("2021-01-23T15:15:01Z")
          revision.contributor = contributor
-         revision.comment = "コメント"
+         revision.comment = "shel"
          revision.model = "wikitext"
          revision.format = "text/x-wiki"
-         revision.text = "テキスト"
-         revision.sha1 = "a9674b19f8c56f785c91a555d0a144522bb318e6"
+         revision.text = <<-TEXT.chomp
+ #REDIRECT [[Computer accessibility]]
+
+ {{rcat shell|
+ {{R from move}}
+ {{R from CamelCase}}
+ {{R unprintworthy}}
+ }}
+         TEXT
+         revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
          page = Datasets::Wikipedia::Page.new
-         page.title = "タイトル"
-         page.namespace = 4
-         page.id = 1
-         page.restrictions = ["sysop"]
+         page.title = "AccessibleComputing"
+         page.namespace = 0
+         page.id = 10
+         page.restrictions = nil
+         page.redirect = "Computer accessibility"
          page.revision = revision
          assert_equal(page, @dataset.each.first)
        end
 
        sub_test_case("#metadata") do
          test("#id") do
-           assert_equal("wikipedia-ja-articles",
+           assert_equal("wikipedia-en-articles",
                         @dataset.metadata.id)
          end
 
          test("#name") do
-           assert_equal("Wikipedia articles (ja)",
+           assert_equal("Wikipedia articles (en)",
                         @dataset.metadata.name)
          end
 
         test("#description") do
-           assert_equal("Wikipedia articles in ja",
+           assert_equal("Wikipedia articles in en",
                        @dataset.metadata.description)
          end
        end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: red-datasets
  version: !ruby/object:Gem::Version
-   version: 0.1.5
+   version: 0.1.6
  platform: ruby
  authors:
  - tomisuker
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2022-09-23 00:00:00.000000000 Z
+ date: 2023-05-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: csv
@@ -163,6 +163,7 @@ files:
  - lib/datasets/iris.rb
  - lib/datasets/ita-corpus.rb
  - lib/datasets/kuzushiji-mnist.rb
+ - lib/datasets/lazy.rb
  - lib/datasets/libsvm-dataset-list.rb
  - lib/datasets/libsvm.rb
  - lib/datasets/license.rb
@@ -170,6 +171,7 @@ files:
  - lib/datasets/metadata.rb
  - lib/datasets/mnist.rb
  - lib/datasets/mushroom.rb
+ - lib/datasets/nagoya-university-conversation-corpus.rb
  - lib/datasets/penguins.rb
  - lib/datasets/penn-treebank.rb
  - lib/datasets/pmjt-dataset-list.rb
@@ -214,6 +216,7 @@ files:
  - test/test-metadata.rb
  - test/test-mnist.rb
  - test/test-mushroom.rb
+ - test/test-nagoya-university-conversation-corpus.rb
  - test/test-penguins.rb
  - test/test-penn-treebank.rb
  - test/test-pmjt-dataset-list.rb
@@ -245,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
    - !ruby/object:Gem::Version
      version: '0'
  requirements: []
- rubygems_version: 3.2.32
+ rubygems_version: 3.5.0.dev
  signing_key:
  specification_version: 4
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
@@ -278,6 +281,7 @@ test_files:
  - test/test-metadata.rb
  - test/test-mnist.rb
  - test/test-mushroom.rb
+ - test/test-nagoya-university-conversation-corpus.rb
  - test/test-penguins.rb
  - test/test-penn-treebank.rb
  - test/test-pmjt-dataset-list.rb