red-datasets 0.1.5 → 0.1.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: c1cfd18b589e4624178d9010ef68a100bb6e2573ccf18a9f96168af786523578
-   data.tar.gz: 67eddd22e10bf78c0b2cf10b18de289368d473d7b5ddf2a557cc2264834e32b0
+   metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
+   data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
  SHA512:
-   metadata.gz: 111243d3a1d3d758196bb71301ccb0f34beb1f5bec7c5c14b15f7c96fd6bdde924e30d90d3ace9e9258074411c9f7e7b4ef6bd9338dc5c11349534b2392f6f81
-   data.tar.gz: 9a9b426c753bd7e6cc12d452d61b90c2422fcad3b3c353a552c5c05a7c7fd53c3d4ac9cec2e33af1537d9e76e04f1df3d6d9b4baf043528fdde2ab4f9f203e9f
+   metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
+   data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
data/README.md CHANGED
@@ -1,6 +1,5 @@
  # Red Datasets
 
- [![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
  [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
 
  ## Description
data/Rakefile CHANGED
@@ -13,9 +13,64 @@ end
  helper.install
  spec = helper.gemspec
 
+ task default: :test
+
  desc "Run tests"
  task :test do
    ruby("test/run-test.rb")
  end
 
- task default: :test
+ desc "Generate an artifact for GitHub Pages"
+ task :pages do
+   pages_dir = "_site"
+   rm_rf(pages_dir)
+   mkdir_p(pages_dir)
+
+   require "cgi/util"
+   require_relative "lib/datasets/lazy"
+   File.open("#{pages_dir}/index.html", "w") do |index_html|
+     index_html.puts(<<-HTML)
+ <!DOCTYPE html>
+ <html>
+   <head>
+     <meta charset="UTF-8">
+     <title>Red Datasets</title>
+     <style>
+       table {
+         margin-left: 20vw;
+         min-width: 50%;
+       }
+       th {
+         font-size: 30px;
+         padding: 20px;
+       }
+       td {
+         border-bottom: 1px solid #D9DCE0;
+         padding: 20px;
+         font-weight: bold;
+       }
+     </style>
+   </head>
+   <body>
+     <section>
+       <h1>Red Datasets</h1>
+       <table>
+         <thead>
+           <tr><th>Available datasets</th></tr>
+         </thead>
+         <tbody>
+     HTML
+     Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
+       index_html.puts(<<-HTML)
+           <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
+       HTML
+     end
+     index_html.puts(<<-HTML)
+         </tbody>
+       </table>
+     </section>
+   </body>
+ </html>
+     HTML
+   end
+ end
data/doc/text/news.md CHANGED
@@ -1,5 +1,21 @@
  # News
 
+ ## 0.1.6 - 2023-05-24
+
+ ### Improvements
+
+   * Added support for lazy loading via `require "datasets/lazy"`.
+
+   * `Datasets::NagoyaUniversityConversationCorpus`: Added.
+     [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
+     [Patch by matsuura]
+
+   * `Datasets::Wikipedia`: Added support for downloading in the background.
+
+ ### Thanks
+
+   * matsuura
+
  ## 0.1.5 - 2022-09-22
 
  ### Improvements
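
The lazy loading entry above can be exercised as follows. This is a minimal usage sketch, with `Datasets::Iris` standing in for any registered dataset class: requiring `datasets/lazy` only registers constant names, and the first reference to a constant goes through `const_missing`, which `require`s the matching file (see `lib/datasets/lazy.rb` below).

```ruby
# Load only the registry; no dataset implementation is required yet.
require "datasets/lazy"

# Referencing the constant triggers Datasets.const_missing, which
# requires "datasets/iris" and then resolves the constant normally.
iris = Datasets::Iris.new
iris.each do |record|
  p record
  break
end
```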
data/lib/datasets/dataset.rb CHANGED
@@ -33,20 +33,59 @@ module Datasets
        @cache_path ||= CachePath.new(@metadata.id)
      end
 
-     def download(output_path, url)
+     def download(output_path, url, &block)
        downloader = Downloader.new(url)
-       downloader.download(output_path)
+       downloader.download(output_path, &block)
      end
 
-     def extract_bz2(path)
-       input, output = IO.pipe
-       pid = spawn("bzcat", path.to_s, {:out => output})
-       begin
-         output.close
-         yield(input)
-       ensure
-         input.close
-         Process.waitpid(pid)
+     def extract_bz2(bz2)
+       case bz2
+       when Pathname, String
+         IO.pipe do |input, output|
+           pid = spawn("bzcat", bz2.to_s, {out: output})
+           begin
+             output.close
+             yield(input)
+           ensure
+             input.close
+             Process.waitpid(pid)
+           end
+         end
+       else
+         IO.pipe do |bz2_input, bz2_output|
+           IO.pipe do |plain_input, plain_output|
+             bz2_stop = false
+             bz2_thread = Thread.new do
+               begin
+                 bz2.each do |chunk|
+                   bz2_output.write(chunk)
+                   bz2_output.flush
+                   break if bz2_stop
+                 end
+               rescue => error
+                 message = "Failed to read bzcat input: " +
+                           "#{error.class}: #{error.message}"
+                 $stderr.puts(message)
+               ensure
+                 bz2_output.close
+               end
+             end
+             begin
+               pid = spawn("bzcat", {in: bz2_input, out: plain_output})
+               begin
+                 bz2_input.close
+                 plain_output.close
+                 yield(plain_input)
+               ensure
+                 plain_input.close
+                 Process.waitpid(pid)
+               end
+             ensure
+               bz2_stop = true
+               bz2_thread.join
+             end
+           end
+         end
        end
      end
    end
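
The non-file branch of the new `#extract_bz2` pipes an arbitrary chunk-yielding object into `bzcat` through a feeder thread. Below is a standalone sketch of the same technique, not the library's API: `Open3` stands in for the `spawn`/`IO.pipe` plumbing, and `stream_bzcat` is a hypothetical helper name.

```ruby
require "open3"

# Decompress a stream of bz2-encoded chunks by piping them into
# bzcat: a feeder thread writes chunks to bzcat's stdin while the
# caller reads decompressed output from bzcat's stdout.
def stream_bzcat(bz2_chunks)
  Open3.popen2("bzcat") do |stdin, stdout, wait_thread|
    feeder = Thread.new do
      begin
        bz2_chunks.each do |chunk|
          stdin.write(chunk)
        end
      ensure
        stdin.close  # signal EOF so bzcat can finish
      end
    end
    yield(stdout)      # caller reads decompressed data here
    feeder.join
    wait_thread.value  # wait for bzcat to exit
  end
end
```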
data/lib/datasets/downloader.rb CHANGED
@@ -22,50 +22,115 @@ module Datasets
        end
      end
 
-     def download(output_path)
-       return if output_path.exist?
-
-       output_path.parent.mkpath
+     def download(output_path, &block)
+       if output_path.exist?
+         yield_chunks(output_path, &block) if block_given?
+         return
+       end
 
-       headers = {
-         "Accept-Encoding" => "identity",
-         "User-Agent" => "Red Datasets/#{VERSION}",
-       }
-       start = nil
        partial_output_path = Pathname.new("#{output_path}.partial")
-       if partial_output_path.exist?
-         start = partial_output_path.size
-         headers["Range"] = "bytes=#{start}-"
-       end
+       synchronize(output_path, partial_output_path) do
+         output_path.parent.mkpath
 
-       start_http(@url, headers) do |response|
-         if response.is_a?(Net::HTTPPartialContent)
-           mode = "ab"
-         else
+         n_retries = 0
+         n_max_retries = 5
+         begin
+           headers = {
+             "Accept-Encoding" => "identity",
+             "User-Agent" => "Red Datasets/#{VERSION}",
+           }
            start = nil
-           mode = "wb"
-         end
+           if partial_output_path.exist?
+             start = partial_output_path.size
+             headers["Range"] = "bytes=#{start}-"
+           end
+
+           start_http(@url, headers) do |response|
+             if response.is_a?(Net::HTTPPartialContent)
+               mode = "ab"
+             else
+               start = nil
+               mode = "wb"
+             end
 
-         base_name = @url.path.split("/").last
-         size_current = 0
-         size_max = response.content_length
-         if start
-           size_current += start
-           size_max += start
+             base_name = @url.path.split("/").last
+             size_current = 0
+             size_max = response.content_length
+             if start
+               size_current += start
+               size_max += start
+               if block_given? and n_retries.zero?
+                 yield_chunks(partial_output_path, &block)
+               end
+             end
+             progress_reporter = ProgressReporter.new(base_name, size_max)
+             partial_output_path.open(mode) do |output|
+               response.read_body do |chunk|
+                 size_current += chunk.bytesize
+                 progress_reporter.report(size_current)
+                 output.write(chunk)
+                 yield(chunk) if block_given?
+               end
+             end
+           end
+           FileUtils.mv(partial_output_path, output_path)
+         rescue Net::ReadTimeout
+           n_retries += 1
+           retry if n_retries < n_max_retries
+           raise
+         rescue TooManyRedirects => error
+           last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
+           raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
          end
-         progress_reporter = ProgressReporter.new(base_name, size_max)
-         partial_output_path.open(mode) do |output|
-           response.read_body do |chunk|
-             size_current += chunk.bytesize
-             progress_reporter.report(size_current)
-             output.write(chunk)
+       end
+     end
+
+     private def synchronize(output_path, partial_output_path)
+       begin
+         Process.getpgid(Process.pid)
+       rescue NotImplementedError
+         # No usable process IDs on this platform; skip locking.
+         return yield
+       end
+
+       lock_path = Pathname("#{output_path}.lock")
+       loop do
+         lock_path.parent.mkpath
+         begin
+           lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
+         rescue SystemCallError
+           valid_lock_path = true
+           begin
+             pid = Integer(lock_path.read.chomp, 10)
+           rescue ArgumentError
+             # The process that acquired the lock may have exited before
+             # storing its process ID. Treat the lock as stale unless it
+             # was modified recently.
+             valid_lock_path = ((Time.now - lock_path.mtime) < 10)
+           else
+             begin
+               Process.getpgid(pid)
+             rescue SystemCallError
+               # The process that acquired the lock no longer exists.
+               valid_lock_path = false
+             end
+           end
+           if valid_lock_path
+             sleep(1 + rand(10))
+           else
+             lock_path.delete
            end
+           retry
+         else
+           begin
+             lock.puts(Process.pid.to_s)
+             lock.flush
+             yield
+           ensure
+             lock.close
+             lock_path.delete
+           end
+           break
          end
        end
-       FileUtils.mv(partial_output_path, output_path)
-     rescue TooManyRedirects => error
-       last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
-       raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
      end
 
      private def start_http(url, headers, limit = 10, &block)
@@ -99,6 +164,16 @@ module Datasets
        end
      end
 
+     private def yield_chunks(path)
+       path.open("rb") do |input|
+         chunk_size = 1024 * 1024
+         chunk = ""
+         while input.read(chunk_size, chunk)
+           yield(chunk)
+         end
+       end
+     end
+
      class ProgressReporter
        def initialize(base_name, size_max)
          @base_name = base_name
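
A hedged sketch of the new block form of `Downloader#download`, the mechanism behind `Dataset#download(output_path, url, &block)` above. The URL and output path are illustrative, and the `require` path is an assumption based on this file's location:

```ruby
require "pathname"
require "datasets/downloader"  # assumed feature path for this file

downloader = Datasets::Downloader.new("https://example.com/data.csv")
output_path = Pathname("data.csv")

# Chunks are yielded while the response body is being written to disk;
# if the file is already cached, yield_chunks replays it instead.
downloader.download(output_path) do |chunk|
  $stderr.print(".")  # crude per-chunk progress indicator
end
```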
data/lib/datasets/lazy.rb ADDED
@@ -0,0 +1,90 @@
+ require_relative "version"
+
+ module Datasets
+   class LazyLoader
+     def initialize
+       @constants = {}
+     end
+
+     def exist?(constant_name)
+       @constants.key?(constant_name)
+     end
+
+     def load(constant_name)
+       feature = @constants[constant_name]
+       raise LoadError, "unknown dataset: #{constant_name}" unless feature
+       require feature
+     end
+
+     def load_all
+       @constants.each_value do |feature|
+         require feature
+       end
+     end
+
+     def register(constant_name, feature)
+       @constants[constant_name] = feature
+     end
+
+     def constant_names
+       @constants.keys
+     end
+   end
+
+   LAZY_LOADER = LazyLoader.new
+
+   class << self
+     def const_missing(name)
+       if LAZY_LOADER.exist?(name)
+         LAZY_LOADER.load(name)
+         const_get(name)
+       else
+         super
+       end
+     end
+   end
+
+   LAZY_LOADER.register(:Adult, "datasets/adult")
+   LAZY_LOADER.register(:AFINN, "datasets/afinn")
+   LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
+   LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
+   LAZY_LOADER.register(:CIFAR, "datasets/cifar")
+   LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
+   LAZY_LOADER.register(:Communities, "datasets/communities")
+   LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
+   LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
+   LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
+   LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
+   LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
+   LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+   LAZY_LOADER.register(:Iris, "datasets/iris")
+   LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+   LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
+   LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
+   LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
+   LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
+   LAZY_LOADER.register(:MNIST, "datasets/mnist")
+   LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+   LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                        "datasets/nagoya-university-conversation-corpus")
+   LAZY_LOADER.register(:Penguins, "datasets/penguins")
+   LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
+   LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
+   LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
+   LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
+                        "datasets/quora-duplicate-question-pair")
+   LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
+   # For backward compatibility
+   LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
+   LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
+   # For backward compatibility
+   LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
+   LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
+   LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
+   LAZY_LOADER.register(:SudachiSynonymDictionary,
+                        "datasets/sudachi-synonym-dictionary")
+   LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
+   LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
+                        "datasets/wikipedia-kyoto-japanese-english")
+   LAZY_LOADER.register(:Wine, "datasets/wine")
+ end
data/lib/datasets/nagoya-university-conversation-corpus.rb ADDED
@@ -0,0 +1,109 @@
+ require_relative 'dataset'
+ require_relative 'zip-extractor'
+
+ module Datasets
+   class NagoyaUniversityConversationCorpus < Dataset
+     Data = Struct.new(
+       :name,
+       :date,
+       :place,
+       :participants,
+       :relationships,
+       :note,
+       :sentences
+     )
+
+     Participant = Struct.new(
+       :id,
+       :attribute,
+       :birthplace,
+       :residence
+     )
+
+     Sentence = Struct.new(:participant_id, :content) do
+       def end?
+         participant_id.nil? and content.nil?
+       end
+     end
+
+     def initialize
+       super()
+       @metadata.id = 'nagoya-university-conversation-corpus'
+       @metadata.name = 'Nagoya University Conversation Corpus'
+       @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+       @metadata.licenses = ['CC-BY-NC-ND-4.0']
+       @metadata.description = <<~DESCRIPTION
+         The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+         total about 100 hours of chatting among native speakers of Japanese,
+         which is converted into text.
+       DESCRIPTION
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input_stream|
+         yield(parse_file(input_stream))
+       end
+     end
+
+     private
+
+     def open_data
+       data_path = cache_dir_path + 'nucc.zip'
+       data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+       download(data_path, data_url)
+
+       extractor = ZipExtractor.new(data_path)
+       extractor.extract_files do |input_stream|
+         yield(input_stream)
+       end
+     end
+
+     def parse_file(input_stream)
+       data = Data.new
+       participants = []
+       sentences = []
+
+       input_stream.each do |input|
+         input.each_line(chomp: true) do |line|
+           line.force_encoding('utf-8')
+           if line.start_with?('@データ')
+             data.name = line[4..]
+           elsif line.start_with?('@収集年月日')
+             # mixed cases with and without ':'
+             data.date = line[6..].delete_prefix(':')
+           elsif line.start_with?('@場所')
+             data.place = line[4..]
+           elsif line.start_with?('@参加者の関係')
+             data.relationships = line.split(':', 2)[1]
+           elsif line.start_with?('@参加者')
+             participant = Participant.new
+             participant.id, profiles = line[4..].split(':', 2)
+             participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+             participants << participant
+           elsif line.start_with?('%com')
+             data.note = line.split(':', 2)[1]
+           elsif line == '@END'
+             sentence = Sentence.new
+             sentence.participant_id = nil
+             sentence.content = nil
+
+             sentences << sentence
+           else
+             sentence = Sentence.new
+             sentence.participant_id, sentence.content = line.split(':', 2)
+
+             sentences << sentence
+           end
+         end
+       end
+
+       data.participants = participants
+       data.sentences = sentences
+
+       data
+     end
+   end
+ end
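
A short usage sketch for the new class, based only on the structs defined above (`Data`, `Participant`, `Sentence`); field names follow the accessors shown in the code:

```ruby
require "datasets/nagoya-university-conversation-corpus"

corpus = Datasets::NagoyaUniversityConversationCorpus.new
corpus.each do |conversation|
  puts conversation.name                  # e.g. "1(約35分)"
  conversation.sentences.each do |sentence|
    break if sentence.end?                # sentinel built from the '@END' marker
    puts "#{sentence.participant_id}: #{sentence.content}"
  end
  break  # just the first conversation
end
```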
@@ -1,3 +1,5 @@
+ require "csv"
+
  require_relative "dataset"
 
  module Datasets
data/lib/datasets/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Datasets
-   VERSION = "0.1.5"
+   VERSION = "0.1.6"
  end
data/lib/datasets/wikipedia.rb CHANGED
@@ -53,13 +53,22 @@ module Datasets
      end
 
      private
+     def base_name
+       "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
+     end
+
+     def data_path
+       cache_dir_path + base_name
+     end
+
      def open_data(&block)
-       base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
-       data_path = cache_dir_path + base_name
        data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
-       download(data_path, data_url)
-
-       extract_bz2(data_path, &block)
+       bz2 = Enumerator.new do |yielder|
+         download(data_path, data_url) do |bz2_chunk|
+           yielder << bz2_chunk
+         end
+       end
+       extract_bz2(bz2, &block)
      end
 
      def type_in_path
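
This is the change behind the "downloading in the background" news entry: the dump is no longer fully downloaded before decompression starts; instead an `Enumerator` feeds each downloaded chunk straight into the streaming branch of `#extract_bz2`. A minimal usage sketch, with the constructor arguments taken from the test below:

```ruby
require "datasets/wikipedia"

wikipedia = Datasets::Wikipedia.new(language: :en, type: :articles)

# Pages become available as soon as the first chunks of the dump have
# been downloaded and decompressed; there is no need to wait for the
# multi-gigabyte file to finish first.
wikipedia.each do |page|
  puts page.title
  break
end
```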
data/lib/datasets/zip-extractor.rb CHANGED
@@ -32,5 +32,17 @@ module Datasets
        end
        nil
      end
+
+     def extract_files
+       Zip::File.open(@path) do |zip_file|
+         zip_file.each do |entry|
+           next unless entry.file?
+
+           entry.get_input_stream do |input|
+             yield(input)
+           end
+         end
+       end
+     end
    end
  end
data/lib/datasets.rb CHANGED
@@ -1,34 +1,2 @@
- require_relative "datasets/version"
-
- require_relative "datasets/adult"
- require_relative "datasets/afinn"
- require_relative "datasets/aozora-bunko"
- require_relative "datasets/california-housing"
- require_relative "datasets/cifar"
- require_relative "datasets/cldr-plurals"
- require_relative "datasets/communities"
- require_relative "datasets/diamonds"
- require_relative "datasets/e-stat-japan"
- require_relative "datasets/fashion-mnist"
- require_relative "datasets/fuel-economy"
- require_relative "datasets/geolonia"
- require_relative "datasets/hepatitis"
- require_relative "datasets/iris"
- require_relative "datasets/ita-corpus"
- require_relative "datasets/kuzushiji-mnist"
- require_relative "datasets/libsvm"
- require_relative "datasets/libsvm-dataset-list"
- require_relative "datasets/livedoor-news"
- require_relative "datasets/mnist"
- require_relative "datasets/mushroom"
- require_relative "datasets/penguins"
- require_relative "datasets/penn-treebank"
- require_relative "datasets/pmjt-dataset-list"
- require_relative "datasets/postal-code-japan"
- require_relative "datasets/quora-duplicate-question-pair"
- require_relative "datasets/rdataset"
- require_relative "datasets/seaborn"
- require_relative "datasets/sudachi-synonym-dictionary"
- require_relative "datasets/wikipedia"
- require_relative "datasets/wikipedia-kyoto-japanese-english"
- require_relative "datasets/wine"
+ require_relative "datasets/lazy"
+ Datasets::LAZY_LOADER.load_all
data/test/test-geolonia.rb CHANGED
@@ -6,7 +6,7 @@ class GeoloniaTest < Test::Unit::TestCase
    test('#each') do
      records = @dataset.each.to_a
      assert_equal([
-                    277191,
+                    277616,
                     {
                       :prefecture_code => "01",
                       :prefecture_name => "北海道",
@@ -28,16 +28,16 @@ class GeoloniaTest < Test::Unit::TestCase
                       :prefecture_name => "沖縄県",
                       :prefecture_kana => "オキナワケン",
                       :prefecture_romaji => "OKINAWA KEN",
-                      :municipality_code => "47325",
-                      :municipality_name => "中頭郡嘉手納町",
-                      :municipality_kana => "ナカガミグンカデナチョウ",
-                      :municipality_romaji => "NAKAGAMI GUN KADENA CHO",
-                      :street_name => "字兼久",
+                      :municipality_code => "47382",
+                      :municipality_name => "八重山郡与那国町",
+                      :municipality_kana => "ヤエヤマグンヨナグニチョウ",
+                      :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
+                      :street_name => "字与那国",
                       :street_kana => nil,
                       :street_romaji => nil,
-                      :alias => "下原",
-                      :latitude => "26.351841",
-                      :longitude => "127.744975",
+                      :alias => nil,
+                      :latitude => "24.455925",
+                      :longitude => "122.987678",
                     },
                   ],
                   [
@@ -55,6 +55,7 @@ class GeoloniaTest < Test::Unit::TestCase
                     "## 住所データ仕様",
                     "### ファイルフォーマット",
                     "### 列",
+                    "### ソート順",
                   ],
                   description.scan(/^#.*$/),
                   description)
data/test/test-nagoya-university-conversation-corpus.rb ADDED
@@ -0,0 +1,132 @@
+ class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
+   def setup
+     @dataset = Datasets::NagoyaUniversityConversationCorpus.new
+   end
+
+   sub_test_case("each") do
+     test("#sentences") do
+       records = @dataset.each.to_a
+       first_sentences = records[0].sentences
+       last_sentences = records[-1].sentences
+       assert_equal([
+                      856,
+                      {
+                        participant_id: 'F107',
+                        content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
+                      },
+                      {
+                        participant_id: nil,
+                        content: nil
+                      },
+                      603,
+                      {
+                        participant_id: 'F007',
+                        content: 'それでは話を始めまーす。'
+                      },
+                      {
+                        participant_id: nil,
+                        content: nil
+                      }
+                    ],
+                    [
+                      first_sentences.size,
+                      first_sentences[0].to_h,
+                      first_sentences[-1].to_h,
+                      last_sentences.size,
+                      last_sentences[0].to_h,
+                      last_sentences[-1].to_h,
+                    ])
+     end
+
+     test("#participants") do
+       records = @dataset.each.to_a
+       first_participants = records[0].participants
+       last_participants = records[-1].participants
+       assert_equal([
+                      4,
+                      {
+                        id: 'F107',
+                        attribute: '女性30代後半',
+                        birthplace: '愛知県幡豆郡出身',
+                        residence: '愛知県幡豆郡在住'
+                      },
+                      {
+                        id: 'F128',
+                        attribute: '女性20代前半',
+                        birthplace: '愛知県西尾市出身',
+                        residence: '西尾市在住'
+                      },
+                      2,
+                      {
+                        id: 'F007',
+                        attribute: '女性50代後半',
+                        birthplace: '東京都出身',
+                        residence: '東京都国分寺市在住'
+                      },
+                      {
+                        id: 'F003',
+                        attribute: '女性80代後半',
+                        birthplace: '栃木県宇都宮市出身',
+                        residence: '国分寺市在住'
+                      }
+                    ],
+                    [
+                      first_participants.size,
+                      first_participants[0].to_h,
+                      first_participants[-1].to_h,
+                      last_participants.size,
+                      last_participants[0].to_h,
+                      last_participants[-1].to_h
+                    ])
+     end
+
+     test("others") do
+       records = @dataset.each.to_a
+       assert_equal([
+                      129,
+                      [
+                        '1(約35分)',
+                        '2001年10月16日',
+                        'ファミリーレストラン',
+                        '英会話教室の友人',
+                        nil
+                      ],
+                      [
+                        '129(36分)',
+                        '2003年2月16日',
+                        '二人の自宅',
+                        '母と娘',
+                        'F007は東京に38年、F003は東京に60年居住。'
+                      ]
+                    ],
+                    [
+                      records.size,
+                      [
+                        records[0].name,
+                        records[0].date,
+                        records[0].place,
+                        records[0].relationships,
+                        records[0].note
+                      ],
+                      [
+                        records[-1].name,
+                        records[-1].date,
+                        records[-1].place,
+                        records[-1].relationships,
+                        records[-1].note
+                      ]
+                    ])
+     end
+   end
+
+   sub_test_case("#metadata") do
+     test("#description") do
+       description = @dataset.metadata.description
+       assert_equal(<<~DESCRIPTION, description)
+         The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+         total about 100 hours of chatting among native speakers of Japanese,
+         which is converted into text.
+       DESCRIPTION
+     end
+   end
+ end
data/test/test-rdataset.rb CHANGED
@@ -8,7 +8,7 @@ class RdatasetTest < Test::Unit::TestCase
    test("with package_name") do
      records = @dataset.filter(package: "datasets").to_a
      assert_equal([
-                    84,
+                    102,
                     {
                       package: "datasets",
                       dataset: "ability.cov",
@@ -48,7 +48,7 @@ class RdatasetTest < Test::Unit::TestCase
    test("without package_name") do
      records = @dataset.each.to_a
      assert_equal([
-                    1892,
+                    2142,
                     {
                       package: "AER",
                       dataset: "Affairs",
data/test/test-seaborn.rb CHANGED
@@ -14,6 +14,7 @@ class SeabornTest < Test::Unit::TestCase
                     {dataset: "car_crashes"},
                     {dataset: "diamonds"},
                     {dataset: "dots"},
+                    {dataset: "dowjones"},
                     {dataset: "exercise"},
                     {dataset: "flights"},
                     {dataset: "fmri"},
data/test/test-sudachi-synonym-dictionary.rb CHANGED
@@ -6,7 +6,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
    test('#each') do
      records = @dataset.each.to_a
      assert_equal([
-                    65182,
+                    65206,
                     {
                       group_id: "000001",
                       is_noun: true,
@@ -19,7 +19,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                       notation: "曖昧",
                     },
                     {
-                      group_id: "024909",
+                      group_id: "024916",
                       is_noun: true,
                       expansion_type: :expanded,
                       lexeme_id: 1,
@@ -27,7 +27,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                       acronym_type: :alphabet,
                       variant_type: :typical,
                       categories: ["ビジネス"],
-                      notation: "BPO",
+                      notation: "SCM",
                     },
                   ],
                   [
data/test/test-wikipedia.rb CHANGED
@@ -1,100 +1,54 @@
  class WikipediaTest < Test::Unit::TestCase
-   sub_test_case("ja") do
+   sub_test_case("en") do
      sub_test_case("articles") do
-       include Helper::Sandbox
-
        def setup
-         setup_sandbox
-         @dataset = Datasets::Wikipedia.new(language: :ja,
+         @dataset = Datasets::Wikipedia.new(language: :en,
                                             type: :articles)
-         def @dataset.cache_dir_path
-           @cache_dir_path
-         end
-         def @dataset.cache_dir_path=(path)
-           @cache_dir_path = path
-         end
-         @dataset.cache_dir_path = @tmp_dir
-       end
-
-       def teardown
-         teardown_sandbox
        end
 
        test("#each") do
-         def @dataset.download(output_path, url)
-           xml_path = output_path.sub_ext("")
-           xml_path.open("w") do |xml_file|
-             xml_file.puts(<<-XML)
- <mediawiki
-     xmlns="http://www.mediawiki.org/xml/export-0.10/"
-     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-     xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
-     version="0.10" xml:lang="ja">
-   <siteinfo>
-     <sitename>Wikipedia</sitename>
-   </siteinfo>
-   <page>
-     <title>タイトル</title>
-     <ns>4</ns>
-     <id>1</id>
-     <restrictions>sysop</restrictions>
-     <revision>
-       <id>3</id>
-       <parentid>2</parentid>
-       <timestamp>2004-04-30T14:46:00Z</timestamp>
-       <contributor>
-         <username>user</username>
-         <id>10</id>
-       </contributor>
-       <minor />
-       <comment>コメント</comment>
-       <model>wikitext</model>
-       <format>text/x-wiki</format>
-       <text xml:space="preserve">テキスト</text>
-       <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
-     </revision>
-   </page>
- </mediawiki>
-             XML
-           end
-           unless system("bzip2", xml_path.to_s)
-             raise "failed to run bzip2"
-           end
-         end
-
-         contributor = Datasets::Wikipedia::Contributor.new("user", 10)
+         contributor = Datasets::Wikipedia::Contributor.new("Elli", 20842734)
          revision = Datasets::Wikipedia::Revision.new
-         revision.id = 3
-         revision.parent_id = 2
-         revision.timestamp = Time.iso8601("2004-04-30T14:46:00Z")
+         revision.id = 1002250816
+         revision.parent_id = 854851586
+         revision.timestamp = Time.iso8601("2021-01-23T15:15:01Z")
          revision.contributor = contributor
-         revision.comment = "コメント"
+         revision.comment = "shel"
          revision.model = "wikitext"
         revision.format = "text/x-wiki"
-         revision.text = "テキスト"
-         revision.sha1 = "a9674b19f8c56f785c91a555d0a144522bb318e6"
+         revision.text = <<-TEXT.chomp
+ #REDIRECT [[Computer accessibility]]
+
+ {{rcat shell|
+ {{R from move}}
+ {{R from CamelCase}}
+ {{R unprintworthy}}
+ }}
+         TEXT
+         revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
          page = Datasets::Wikipedia::Page.new
-         page.title = "タイトル"
-         page.namespace = 4
-         page.id = 1
-         page.restrictions = ["sysop"]
+         page.title = "AccessibleComputing"
+         page.namespace = 0
+         page.id = 10
+         page.restrictions = nil
+         page.redirect = "Computer accessibility"
          page.revision = revision
          assert_equal(page, @dataset.each.first)
        end
 
        sub_test_case("#metadata") do
          test("#id") do
-           assert_equal("wikipedia-ja-articles",
+           assert_equal("wikipedia-en-articles",
                         @dataset.metadata.id)
          end
 
          test("#name") do
-           assert_equal("Wikipedia articles (ja)",
+           assert_equal("Wikipedia articles (en)",
                         @dataset.metadata.name)
          end
 
          test("#description") do
-           assert_equal("Wikipedia articles in ja",
+           assert_equal("Wikipedia articles in en",
                         @dataset.metadata.description)
          end
        end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: red-datasets
  version: !ruby/object:Gem::Version
-   version: 0.1.5
+   version: 0.1.6
  platform: ruby
  authors:
  - tomisuker
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2022-09-23 00:00:00.000000000 Z
+ date: 2023-05-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: csv
@@ -163,6 +163,7 @@ files:
  - lib/datasets/iris.rb
  - lib/datasets/ita-corpus.rb
  - lib/datasets/kuzushiji-mnist.rb
+ - lib/datasets/lazy.rb
  - lib/datasets/libsvm-dataset-list.rb
  - lib/datasets/libsvm.rb
  - lib/datasets/license.rb
@@ -170,6 +171,7 @@ files:
  - lib/datasets/metadata.rb
  - lib/datasets/mnist.rb
  - lib/datasets/mushroom.rb
+ - lib/datasets/nagoya-university-conversation-corpus.rb
  - lib/datasets/penguins.rb
  - lib/datasets/penn-treebank.rb
  - lib/datasets/pmjt-dataset-list.rb
@@ -214,6 +216,7 @@ files:
  - test/test-metadata.rb
  - test/test-mnist.rb
  - test/test-mushroom.rb
+ - test/test-nagoya-university-conversation-corpus.rb
  - test/test-penguins.rb
  - test/test-penn-treebank.rb
  - test/test-pmjt-dataset-list.rb
@@ -245,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.2.32
+ rubygems_version: 3.5.0.dev
  signing_key:
  specification_version: 4
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
@@ -278,6 +281,7 @@ test_files:
  - test/test-metadata.rb
  - test/test-mnist.rb
  - test/test-mushroom.rb
+ - test/test-nagoya-university-conversation-corpus.rb
  - test/test-penguins.rb
  - test/test-penn-treebank.rb
  - test/test-pmjt-dataset-list.rb