red-datasets 0.1.5 → 0.1.7
- checksums.yaml +4 -4
- data/README.md +0 -1
- data/Rakefile +56 -1
- data/doc/text/news.md +23 -0
- data/lib/datasets/dataset.rb +50 -11
- data/lib/datasets/downloader.rb +110 -35
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +2 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +14 -5
- data/lib/datasets/zip-extractor.rb +12 -0
- data/lib/datasets.rb +2 -34
- data/test/test-geolonia.rb +10 -9
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-rdataset.rb +2 -2
- data/test/test-seaborn.rb +1 -0
- data/test/test-sudachi-synonym-dictionary.rb +3 -3
- data/test/test-wikipedia.rb +25 -71
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0231a4f9da16ad1b2cb562a360468b3450fec684e2bcf0ca500195499e8f7397
+  data.tar.gz: c2938b6d72fea58413a743ccad111fc8c95699d9b417e64941ca1681128a1706
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2ea402beb78be117e28ca906490526a28a8d1e1430181a89432953d26eb0b0a8ea70d0c582d438c5e6b668b3a48eb60db336ef8174fa4dd1a52e20c19f5b4d9b
+  data.tar.gz: 38b59c46e875ae61ab0794f2f9f474969b7e4e08e06078ca55ce4d1930d9538647e6b0152c1c48e8f0d45318ed2f71801dc99152d0799e06459937fb3d29978d
data/README.md
CHANGED
@@ -1,6 +1,5 @@
 # Red Datasets
 
-[![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
 [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
 
 ## Description
data/Rakefile
CHANGED
@@ -13,9 +13,64 @@ end
 helper.install
 spec = helper.gemspec
 
+task default: :test
+
 desc "Run tests"
 task :test do
   ruby("test/run-test.rb")
 end
 
-
+desc "Generate an artifact for GitHub Pages"
+task :pages do
+  pages_dir = "_site"
+  rm_rf(pages_dir)
+  mkdir_p(pages_dir)
+
+  require "cgi/util"
+  require_relative "lib/datasets/lazy"
+  File.open("#{pages_dir}/index.html", "w") do |index_html|
+    index_html.puts(<<-HTML)
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="UTF-8">
+    <title>Red Datasets</title>
+    <style>
+      table {
+        margin-left: 20vw;
+        min-width: 50%;
+      }
+      th {
+        font-size: 30px;
+        padding: 20px;
+      }
+      td {
+        border-bottom: 1px solid #D9DCE0;
+        padding: 20px;
+        font-weight: bold;
+      }
+    </style>
+  </head>
+  <body>
+    <section>
+      <h1>Red Datasets</h1>
+      <table>
+        <thead>
+          <tr><th>Available datasets</th></tr>
+        </thead>
+        <tbody>
+    HTML
+    Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
+      index_html.puts(<<-HTML)
+          <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
+      HTML
+    end
+    index_html.puts(<<-HTML)
+        </tbody>
+      </table>
+    </section>
+  </body>
+</html>
+    HTML
+  end
+end
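For reference, the table rows the new :pages task writes can be reproduced standalone with the same escaping call. A minimal sketch (not part of the diff; assumes the gem and its lazy registry are on the load path):

require "cgi/util"
require "datasets/lazy"

# Print one table row per registered dataset, as the task does.
Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
  puts "<tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>"
end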
data/doc/text/news.md
CHANGED
@@ -1,5 +1,28 @@
 # News
 
+## 0.1.7 - 2023-05-29
+
+### Improvements
+
+  * `Datasets::NagoyaUniversityConversationCorpus`: Avoid using
+    endless range for old Ruby.
+
+## 0.1.6 - 2023-05-24
+
+### Improvements
+
+  * Added support for lazy loading by `require "datasets/lazy"`.
+
+  * `Datasets::NagoyaUniversityConversationCorpus`: Added.
+    [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
+    [Patch by matsuura]
+
+  * `Datasets::Wikipedia`: Added support for downloading in background.
+
+### Thanks
+
+  * matsuura
+
 ## 0.1.5 - 2022-09-22
 
 ### Improvements
data/lib/datasets/dataset.rb
CHANGED
@@ -33,20 +33,59 @@ module Datasets
       @cache_path ||= CachePath.new(@metadata.id)
     end
 
-    def download(output_path, url)
+    def download(output_path, url, &block)
       downloader = Downloader.new(url)
-      downloader.download(output_path)
+      downloader.download(output_path, &block)
     end
 
-    def extract_bz2(
-
+    def extract_bz2(bz2)
+      case bz2
+      when Pathname, String
+        IO.pipe do |input, output|
+          pid = spawn("bzcat", bz2.to_s, {out: output})
+          begin
+            output.close
+            yield(input)
+          ensure
+            input.close
+            Process.waitpid(pid)
+          end
+        end
+      else
+        IO.pipe do |bz2_input, bz2_output|
+          IO.pipe do |plain_input, plain_output|
+            bz2_stop = false
+            bz2_thread = Thread.new do
+              begin
+                bz2.each do |chunk|
+                  bz2_output.write(chunk)
+                  bz2_output.flush
+                  break if bz2_stop
+                end
+              rescue => error
+                message = "Failed to read bzcat input: " +
+                          "#{error.class}: #{error.message}"
+                $stderr.puts(message)
+              ensure
+                bz2_output.close
+              end
+            end
+            begin
+              pid = spawn("bzcat", {in: bz2_input, out: plain_output})
+              begin
+                bz2_input.close
+                plain_output.close
+                yield(plain_input)
+              ensure
+                plain_input.close
+                Process.waitpid(pid)
+              end
+            ensure
+              bz2_stop = true
+              bz2_thread.join
+            end
+          end
+        end
+      end
     end
   end
 end
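The rewritten extract_bz2 keeps the old path-based branch (spawn bzcat on the file) and adds an else branch for any object that yields compressed chunks via each: a writer thread feeds the chunks into a bzcat child process through one pipe, and a second pipe carries the decompressed stream back to the caller's block. The same pattern outside the gem, as a minimal sketch (assumes a bzcat executable on PATH; the input file name is hypothetical):

bz2_chunks = Enumerator.new do |yielder|
  File.open("dump.xml.bz2", "rb") do |file|   # hypothetical input file
    while (chunk = file.read(64 * 1024))
      yielder << chunk
    end
  end
end

IO.pipe do |bz2_input, bz2_output|
  IO.pipe do |plain_input, plain_output|
    writer = Thread.new do                    # feed compressed chunks in
      bz2_chunks.each { |chunk| bz2_output.write(chunk) }
      bz2_output.close
    end
    pid = spawn("bzcat", {in: bz2_input, out: plain_output})
    bz2_input.close                           # the child keeps its own copies
    plain_output.close
    puts plain_input.read[0, 200]             # head of the decompressed stream
    Process.waitpid(pid)
    writer.join
  end
end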
data/lib/datasets/downloader.rb
CHANGED
@@ -22,50 +22,115 @@ module Datasets
       end
     end
 
-    def download(output_path)
-
-      headers = {
-        "Accept-Encoding" => "identity",
-        "User-Agent" => "Red Datasets/#{VERSION}",
-      }
-      start = nil
-
-        headers["Range"] = "bytes=#{start}-"
-      end
-
-      FileUtils.mv(partial_output_path, output_path)
-    rescue TooManyRedirects => error
-      last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
-      raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
+    def download(output_path, &block)
+      if output_path.exist?
+        yield_chunks(output_path, &block) if block_given?
+        return
+      end
 
       partial_output_path = Pathname.new("#{output_path}.partial")
+      synchronize(output_path, partial_output_path) do
+        output_path.parent.mkpath
+
+        n_retries = 0
+        n_max_retries = 5
+        begin
+          headers = {
+            "Accept-Encoding" => "identity",
+            "User-Agent" => "Red Datasets/#{VERSION}",
+          }
           start = nil
+          if partial_output_path.exist?
+            start = partial_output_path.size
+            headers["Range"] = "bytes=#{start}-"
+          end
+
+          start_http(@url, headers) do |response|
+            if response.is_a?(Net::HTTPPartialContent)
+              mode = "ab"
+            else
+              start = nil
+              mode = "wb"
+            end
+
+            base_name = @url.path.split("/").last
+            size_current = 0
+            size_max = response.content_length
+            if start
+              size_current += start
+              size_max += start
+              if block_given? and n_retries.zero?
+                yield_chunks(partial_output_path, &block)
+              end
+            end
+            progress_reporter = ProgressReporter.new(base_name, size_max)
+            partial_output_path.open(mode) do |output|
+              response.read_body do |chunk|
+                size_current += chunk.bytesize
+                progress_reporter.report(size_current)
+                output.write(chunk)
+                yield(chunk) if block_given?
+              end
+            end
+          end
+          FileUtils.mv(partial_output_path, output_path)
+        rescue Net::ReadTimeout => error
+          n_retries += 1
+          retry if n_retries < n_max_retries
+          raise
+        rescue TooManyRedirects => error
+          last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
+          raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
+        end
+      end
+    end
+
+    private def synchronize(output_path, partial_output_path)
+      begin
+        Process.getpgid(Process.pid)
+      rescue NotImplementedError
+        return yield
+      end
+
+      lock_path = Pathname("#{output_path}.lock")
+      loop do
+        lock_path.parent.mkpath
+        begin
+          lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
+        rescue SystemCallError
+          valid_lock_path = true
+          begin
+            pid = Integer(lock_path.read.chomp, 10)
+          rescue ArgumentError
+            # The process that acquired the lock will be exited before
+            # it stores its process ID.
+            valid_lock_path = (lock_path.mtime > 10)
+          else
+            begin
+              Process.getpgid(pid)
+            rescue SystemCallError
+              # Process that acquired the lock doesn't exist
+              valid_lock_path = false
+            end
+          end
+          if valid_lock_path
+            sleep(1 + rand(10))
+          else
+            lock_path.delete
+          end
+          retry
+        else
+          begin
+            lock.puts(Process.pid.to_s)
+            lock.flush
+            yield
+          ensure
+            lock.close
+            lock_path.delete
+          end
+          break
+        end
+      end
     end
 
     private def start_http(url, headers, limit = 10, &block)
@@ -99,6 +164,16 @@ module Datasets
       end
     end
 
+    private def yield_chunks(path)
+      path.open("rb") do |output|
+        chunk_size = 1024 * 1024
+        chunk = ""
+        while output.read(chunk_size, chunk)
+          yield(chunk)
+        end
+      end
+    end
+
     class ProgressReporter
       def initialize(base_name, size_max)
         @base_name = base_name
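Taken together, Downloader#download now streams chunks to an optional block while writing the cache file, replays an already-cached file through yield_chunks in 1MiB reads, resumes a .partial file with a Range request, retries timed-out reads (up to five attempts in total), and serializes concurrent downloads of the same output path with the PID lock-file protocol above. A usage sketch (the URL and paths are illustrative, not from the diff):

require "pathname"
require "datasets/downloader"

downloader = Datasets::Downloader.new("https://example.com/big.csv")
downloader.download(Pathname("/tmp/red-datasets/big.csv")) do |chunk|
  # Called with each chunk as it arrives; on a later run, when the file is
  # already cached, the chunks are replayed from disk instead.
  $stderr.puts("received #{chunk.bytesize} bytes")
end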
data/lib/datasets/lazy.rb
ADDED
@@ -0,0 +1,90 @@
+require_relative "version"
+
+module Datasets
+  class LazyLoader
+    def initialize
+      @constants = {}
+    end
+
+    def exist?(constant_name)
+      @constants.key?(constant_name)
+    end
+
+    def load(constant_name)
+      feature = @constants[constant_name]
+      raise LoadError, "unknown dataset: #{constant_name}" unless feature
+      require feature
+    end
+
+    def load_all
+      @constants.each_value do |feature|
+        require feature
+      end
+    end
+
+    def register(constant_name, feature)
+      @constants[constant_name] = feature
+    end
+
+    def constant_names
+      @constants.keys
+    end
+  end
+
+  LAZY_LOADER = LazyLoader.new
+
+  class << self
+    def const_missing(name)
+      if LAZY_LOADER.exist?(name)
+        LAZY_LOADER.load(name)
+        const_get(name)
+      else
+        super
+      end
+    end
+  end
+
+  LAZY_LOADER.register(:Adult, "datasets/adult")
+  LAZY_LOADER.register(:AFINN, "datasets/afinn")
+  LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
+  LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
+  LAZY_LOADER.register(:CIFAR, "datasets/cifar")
+  LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
+  LAZY_LOADER.register(:Communities, "datasets/communities")
+  LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
+  LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
+  LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
+  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
+  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
+  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+  LAZY_LOADER.register(:Iris, "datasets/iris")
+  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
+  LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
+  LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
+  LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
+  LAZY_LOADER.register(:MNIST, "datasets/mnist")
+  LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+  LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                       "datasets/nagoya-university-conversation-corpus")
+  LAZY_LOADER.register(:Penguins, "datasets/penguins")
+  LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
+  LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
+  LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
+  LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
+                       "datasets/quora-duplicate-question-pair")
+  LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
+  LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
+  LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
+  LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
+  LAZY_LOADER.register(:SudachiSynonymDictionary,
+                       "datasets/sudachi-synonym-dictionary")
+  LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
+  LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
+                       "datasets/wikipedia-kyoto-japanese-english")
+  LAZY_LOADER.register(:Wine, "datasets/wine")
+end
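A usage sketch of the new entry point: requiring "datasets/lazy" only fills the constant table, and the first reference to a dataset constant goes through const_missing, which requires the registered feature and re-resolves the constant:

require "datasets/lazy"

p Datasets::LAZY_LOADER.constant_names.size  # registered; nothing loaded yet

iris = Datasets::Iris.new                    # const_missing requires "datasets/iris" here
iris.each do |record|
  p record.to_h
  break
end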
data/lib/datasets/nagoya-university-conversation-corpus.rb
ADDED
@@ -0,0 +1,109 @@
+require_relative 'dataset'
+require_relative 'zip-extractor'
+
+module Datasets
+  class NagoyaUniversityConversationCorpus < Dataset
+    Data = Struct.new(
+      :name,
+      :date,
+      :place,
+      :participants,
+      :relationships,
+      :note,
+      :sentences
+    )
+
+    Participant = Struct.new(
+      :id,
+      :attribute,
+      :birthplace,
+      :residence
+    )
+
+    Sentence = Struct.new(:participant_id, :content) do
+      def end?
+        participant_id.nil? and content.nil?
+      end
+    end
+
+    def initialize
+      super()
+      @metadata.id = 'nagoya-university-conversation-curpus'
+      @metadata.name = 'Nagoya University Conversation Curpus'
+      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+      @metadata.licenses = ['CC-BY-NC-ND-4.0']
+      @metadata.description = <<~DESCRIPTION
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |input_stream|
+        yield(parse_file(input_stream))
+      end
+    end
+
+    private
+
+    def open_data
+      data_path = cache_dir_path + 'nucc.zip'
+      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+      download(data_path, data_url)
+
+      extractor = ZipExtractor.new(data_path)
+      extractor.extract_files do |input_stream|
+        yield(input_stream)
+      end
+    end
+
+    def parse_file(input_stream)
+      data = Data.new
+      participants = []
+      sentences = []
+
+      input_stream.each do |input|
+        input.each_line(chomp: true) do |line|
+          line.force_encoding('utf-8')
+          if line.start_with?('@データ')
+            data.name = line[4..-1]
+          elsif line.start_with?('@収集年月日')
+            # mixed cases with and without ':'
+            data.date = line[6..-1].delete_prefix(':')
+          elsif line.start_with?('@場所')
+            data.place = line[4..-1]
+          elsif line.start_with?('@参加者の関係')
+            data.relationships = line.split(':', 2)[1]
+          elsif line.start_with?('@参加者')
+            participant = Participant.new
+            participant.id, profiles = line[4..-1].split(':', 2)
+            participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+            participants << participant
+          elsif line.start_with?('%com')
+            data.note = line.split(':', 2)[1]
+          elsif line == '@END'
+            sentence = Sentence.new
+            sentence.participant_id = nil
+            sentence.content = nil
+
+            sentences << sentence
+          else
+            sentence = Sentence.new
+            sentence.participant_id, sentence.content = line.split(':', 2)
+
+            sentences << sentence
+          end
+        end
+      end
+
+      data.participants = participants
+      data.sentences = sentences
+
+      data
+    end
+  end
+end
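A usage sketch for the new class. The first call downloads and caches nucc.zip; each record is one conversation, and the @END marker surfaces as a sentinel Sentence whose end? is true:

require "datasets/nagoya-university-conversation-corpus"

corpus = Datasets::NagoyaUniversityConversationCorpus.new
corpus.each do |conversation|
  puts conversation.name                    # e.g. "1(約35分)"
  conversation.sentences.each do |sentence|
    break if sentence.end?                  # sentinel from the @END line
    puts "#{sentence.participant_id}: #{sentence.content}"
  end
  break                                     # just the first conversation
end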
data/lib/datasets/penguins.rb
CHANGED
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -53,13 +53,22 @@ module Datasets
     end
 
     private
+    def base_name
+      "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
+    end
+
+    def data_path
+      cache_dir_path + base_name
+    end
+
     def open_data(&block)
-      base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
-      data_path = cache_dir_path + base_name
       data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
-
+      bz2 = Enumerator.new do |yielder|
+        download(data_path, data_url) do |bz2_chunk|
+          yielder << bz2_chunk
+        end
+      end
+      extract_bz2(bz2, &block)
     end
 
     def type_in_path
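Because download now yields chunks into an Enumerator that extract_bz2 pipes through bzcat, parsing can begin before the dump has finished downloading. The user-visible behavior, sketched (note the full dump is large):

require "datasets/wikipedia"

wikipedia = Datasets::Wikipedia.new(language: :en, type: :articles)
# The first page ("AccessibleComputing" per the test below) is yielded as
# soon as enough of the bz2 stream has arrived; no complete download first.
puts wikipedia.each.first.title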
data/lib/datasets/zip-extractor.rb
CHANGED
@@ -32,5 +32,17 @@ module Datasets
       end
       nil
     end
+
+    def extract_files
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+
+          entry.get_input_stream do |input|
+            yield(input)
+          end
+        end
+      end
+    end
   end
 end
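A minimal sketch of the new helper (the archive name is illustrative, and the constructor argument is assumed to be the archive path); each yielded object is the input stream of one file entry:

require "datasets/zip-extractor"

extractor = Datasets::ZipExtractor.new("nucc.zip")
extractor.extract_files do |input|
  puts input.read[0, 40]  # first bytes of each contained file
end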
data/lib/datasets.rb
CHANGED
@@ -1,34 +1,2 @@
-require_relative "datasets/
-
-require_relative "datasets/adult"
-require_relative "datasets/afinn"
-require_relative "datasets/aozora-bunko"
-require_relative "datasets/california-housing"
-require_relative "datasets/cifar"
-require_relative "datasets/cldr-plurals"
-require_relative "datasets/communities"
-require_relative "datasets/diamonds"
-require_relative "datasets/e-stat-japan"
-require_relative "datasets/fashion-mnist"
-require_relative "datasets/fuel-economy"
-require_relative "datasets/geolonia"
-require_relative "datasets/hepatitis"
-require_relative "datasets/iris"
-require_relative "datasets/ita-corpus"
-require_relative "datasets/kuzushiji-mnist"
-require_relative "datasets/libsvm"
-require_relative "datasets/libsvm-dataset-list"
-require_relative "datasets/livedoor-news"
-require_relative "datasets/mnist"
-require_relative "datasets/mushroom"
-require_relative "datasets/penguins"
-require_relative "datasets/penn-treebank"
-require_relative "datasets/pmjt-dataset-list"
-require_relative "datasets/postal-code-japan"
-require_relative "datasets/quora-duplicate-question-pair"
-require_relative "datasets/rdataset"
-require_relative "datasets/seaborn"
-require_relative "datasets/sudachi-synonym-dictionary"
-require_relative "datasets/wikipedia"
-require_relative "datasets/wikipedia-kyoto-japanese-english"
-require_relative "datasets/wine"
+require_relative "datasets/lazy"
+Datasets::LAZY_LOADER.load_all
data/test/test-geolonia.rb
CHANGED
@@ -6,7 +6,7 @@ class GeoloniaTest < Test::Unit::TestCase
   test('#each') do
     records = @dataset.each.to_a
    assert_equal([
-
+                   277616,
                    {
                      :prefecture_code => "01",
                      :prefecture_name => "北海道",
@@ -28,16 +28,16 @@ class GeoloniaTest < Test::Unit::TestCase
                      :prefecture_name => "沖縄県",
                      :prefecture_kana => "オキナワケン",
                      :prefecture_romaji => "OKINAWA KEN",
-                     :municipality_code => "
-                     :municipality_name => "
-                     :municipality_kana => "
-                     :municipality_romaji => "
-                     :street_name => "
+                     :municipality_code => "47382",
+                     :municipality_name => "八重山郡与那国町",
+                     :municipality_kana => "ヤエヤマグンヨナグニチョウ",
+                     :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
+                     :street_name => "字与那国",
                      :street_kana => nil,
                      :street_romaji => nil,
-                     :alias =>
-                     :latitude => "
-                     :longitude => "
+                     :alias => nil,
+                     :latitude => "24.455925",
+                     :longitude => "122.987678",
                    },
                  ],
                  [
@@ -55,6 +55,7 @@ class GeoloniaTest < Test::Unit::TestCase
       "## 住所データ仕様",
       "### ファイルフォーマット",
       "### 列",
+      "### ソート順",
    ],
    description.scan(/^#.*$/),
    description)
data/test/test-nagoya-university-conversation-corpus.rb
ADDED
@@ -0,0 +1,132 @@
+class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::NagoyaUniversityConversationCorpus.new
+  end
+
+  sub_test_case("each") do
+    test("#sentences") do
+      records = @dataset.each.to_a
+      first_sentences = records[0].sentences
+      last_sentences = records[-1].sentences
+      assert_equal([
+                     856,
+                     {
+                       participant_id: 'F107',
+                       content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
+                     },
+                     {
+                       participant_id: nil,
+                       content: nil
+                     },
+                     603,
+                     {
+                       participant_id: 'F007',
+                       content: 'それでは話を始めまーす。'
+                     },
+                     {
+                       participant_id: nil,
+                       content: nil
+                     }
+                   ],
+                   [
+                     first_sentences.size,
+                     first_sentences[0].to_h,
+                     first_sentences[-1].to_h,
+                     last_sentences.size,
+                     last_sentences[0].to_h,
+                     last_sentences[-1].to_h,
+                   ])
+    end
+
+    test("#participants") do
+      records = @dataset.each.to_a
+      first_participants = records[0].participants
+      last_participants = records[-1].participants
+      assert_equal([
+                     4,
+                     {
+                       id: 'F107',
+                       attribute: '女性30代後半',
+                       birthplace: '愛知県幡豆郡出身',
+                       residence: '愛知県幡豆郡在住'
+                     },
+                     {
+                       id: 'F128',
+                       attribute: '女性20代前半',
+                       birthplace: '愛知県西尾市出身',
+                       residence: '西尾市在住'
+                     },
+                     2,
+                     {
+                       id: 'F007',
+                       attribute: '女性50代後半',
+                       birthplace: '東京都出身',
+                       residence: '東京都国分寺市在住'
+                     },
+                     {
+                       id: 'F003',
+                       attribute: '女性80代後半',
+                       birthplace: '栃木県宇都宮市出身',
+                       residence: '国分寺市在住'
+                     }
+                   ],
+                   [
+                     first_participants.size,
+                     first_participants[0].to_h,
+                     first_participants[-1].to_h,
+                     last_participants.size,
+                     last_participants[0].to_h,
+                     last_participants[-1].to_h
+                   ])
+    end
+
+    test("others") do
+      records = @dataset.each.to_a
+      assert_equal([
+                     129,
+                     [
+                       '1(約35分)',
+                       '2001年10月16日',
+                       'ファミリーレストラン',
+                       '英会話教室の友人',
+                       nil
+                     ],
+                     [
+                       '129(36分)',
+                       '2003年2月16日',
+                       '二人の自宅',
+                       '母と娘',
+                       'F007は東京に38年、F003は東京に60年居住。'
+                     ]
+                   ],
+                   [
+                     records.size,
+                     [
+                       records[0].name,
+                       records[0].date,
+                       records[0].place,
+                       records[0].relationships,
+                       records[0].note
+                     ],
+                     [
+                       records[-1].name,
+                       records[-1].date,
+                       records[-1].place,
+                       records[-1].relationships,
+                       records[-1].note
+                     ]
+                   ])
+    end
+  end
+
+  sub_test_case("#metadata") do
+    test("#description") do
+      description = @dataset.metadata.description
+      assert_equal(<<~DESCRIPTION, description)
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+  end
+end
data/test/test-rdataset.rb
CHANGED
@@ -8,7 +8,7 @@ class RdatasetTest < Test::Unit::TestCase
   test("with package_name") do
     records = @dataset.filter(package: "datasets").to_a
     assert_equal([
-
+                   102,
                    {
                      package: "datasets",
                      dataset: "ability.cov",
@@ -48,7 +48,7 @@ class RdatasetTest < Test::Unit::TestCase
   test("without package_name") do
     records = @dataset.each.to_a
     assert_equal([
-
+                   2142,
                    {
                      package: "AER",
                      dataset: "Affairs",
data/test/test-seaborn.rb
CHANGED
data/test/test-sudachi-synonym-dictionary.rb
CHANGED
@@ -6,7 +6,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
   test('#each') do
     records = @dataset.each.to_a
     assert_equal([
-
+                   65206,
                    {
                      group_id: "000001",
                      is_noun: true,
@@ -19,7 +19,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                      notation: "曖昧",
                    },
                    {
-                     group_id: "
+                     group_id: "024916",
                      is_noun: true,
                      expansion_type: :expanded,
                      lexeme_id: 1,
@@ -27,7 +27,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                      acronym_type: :alphabet,
                      variant_type: :typical,
                      categories: ["ビジネス"],
-                     notation: "
+                     notation: "SCM",
                    },
                  ],
                  [
data/test/test-wikipedia.rb
CHANGED
@@ -1,100 +1,54 @@
 class WikipediaTest < Test::Unit::TestCase
-  sub_test_case("
+  sub_test_case("en") do
     sub_test_case("articles") do
-      include Helper::Sandbox
-
       def setup
-
-        @dataset = Datasets::Wikipedia.new(language: :ja,
+        @dataset = Datasets::Wikipedia.new(language: :en,
                                            type: :articles)
-        def @dataset.cache_dir_path
-          @cache_dir_path
-        end
-        def @dataset.cache_dir_path=(path)
-          @cache_dir_path = path
-        end
-        @dataset.cache_dir_path = @tmp_dir
-      end
-
-      def teardown
-        teardown_sandbox
       end
 
       test("#each") do
-
-        xml_path = output_path.sub_ext("")
-        xml_path.open("w") do |xml_file|
-          xml_file.puts(<<-XML)
-<mediawiki
-    xmlns="http://www.mediawiki.org/xml/export-0.10/"
-    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-    xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
-    version="0.10" xml:lang="ja">
-  <siteinfo>
-    <sitename>Wikipedia</sitename>
-  </siteinfo>
-  <page>
-    <title>タイトル</title>
-    <ns>4</ns>
-    <id>1</id>
-    <restrictions>sysop</restrictions>
-    <revision>
-      <id>3</id>
-      <parentid>2</parentid>
-      <timestamp>2004-04-30T14:46:00Z</timestamp>
-      <contributor>
-        <username>user</username>
-        <id>10</id>
-      </contributor>
-      <minor />
-      <comment>コメント</comment>
-      <model>wikitext</model>
-      <format>text/x-wiki</format>
-      <text xml:space="preserve">テキスト</text>
-      <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
-    </revision>
-  </page>
-</mediawiki>
-          XML
-        end
-        unless system("bzip2", xml_path.to_s)
-          raise "failed to run bzip2"
-        end
-
-        contributor = Datasets::Wikipedia::Contributor.new("user", 10)
+        contributor = Datasets::Wikipedia::Contributor.new("Elli", 20842734)
         revision = Datasets::Wikipedia::Revision.new
-        revision.id =
-        revision.parent_id =
-        revision.timestamp = Time.iso8601("
+        revision.id = 1002250816
+        revision.parent_id = 854851586
+        revision.timestamp = Time.iso8601("2021-01-23T15:15:01Z")
         revision.contributor = contributor
-        revision.comment = "
+        revision.comment = "shel"
         revision.model = "wikitext"
         revision.format = "text/x-wiki"
-        revision.text =
-
+        revision.text = <<-TEXT.chomp
+#REDIRECT [[Computer accessibility]]
+
+{{rcat shell|
+{{R from move}}
+{{R from CamelCase}}
+{{R unprintworthy}}
+}}
+        TEXT
+        revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
         page = Datasets::Wikipedia::Page.new
-        page.title = "
-        page.namespace =
-        page.id =
-        page.restrictions =
+        page.title = "AccessibleComputing"
+        page.namespace = 0
+        page.id = 10
+        page.restrictions = nil
+        page.redirect = "Computer accessibility"
         page.revision = revision
         assert_equal(page, @dataset.each.first)
      end
 
      sub_test_case("#metadata") do
        test("#id") do
-          assert_equal("wikipedia-
+          assert_equal("wikipedia-en-articles",
                        @dataset.metadata.id)
        end
 
        test("#name") do
-          assert_equal("Wikipedia articles (
+          assert_equal("Wikipedia articles (en)",
                        @dataset.metadata.name)
        end
 
        test("#description") do
-          assert_equal("Wikipedia articles in
+          assert_equal("Wikipedia articles in en",
                        @dataset.metadata.description)
        end
      end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: red-datasets
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.7
 platform: ruby
 authors:
 - tomisuker
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 
+date: 2023-05-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: csv
@@ -163,6 +163,7 @@ files:
 - lib/datasets/iris.rb
 - lib/datasets/ita-corpus.rb
 - lib/datasets/kuzushiji-mnist.rb
+- lib/datasets/lazy.rb
 - lib/datasets/libsvm-dataset-list.rb
 - lib/datasets/libsvm.rb
 - lib/datasets/license.rb
@@ -170,6 +171,7 @@ files:
 - lib/datasets/metadata.rb
 - lib/datasets/mnist.rb
 - lib/datasets/mushroom.rb
+- lib/datasets/nagoya-university-conversation-corpus.rb
 - lib/datasets/penguins.rb
 - lib/datasets/penn-treebank.rb
 - lib/datasets/pmjt-dataset-list.rb
@@ -214,6 +216,7 @@ files:
 - test/test-metadata.rb
 - test/test-mnist.rb
 - test/test-mushroom.rb
+- test/test-nagoya-university-conversation-corpus.rb
 - test/test-penguins.rb
 - test/test-penn-treebank.rb
 - test/test-pmjt-dataset-list.rb
@@ -245,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.5.0.dev
 signing_key:
 specification_version: 4
 summary: Red Datasets provides classes that provide common datasets such as iris dataset.
@@ -278,6 +281,7 @@ test_files:
 - test/test-metadata.rb
 - test/test-mnist.rb
 - test/test-mushroom.rb
+- test/test-nagoya-university-conversation-corpus.rb
 - test/test-penguins.rb
 - test/test-penn-treebank.rb
 - test/test-pmjt-dataset-list.rb