red-datasets 0.1.5 → 0.1.6
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +0 -1
- data/Rakefile +56 -1
- data/doc/text/news.md +16 -0
- data/lib/datasets/dataset.rb +50 -11
- data/lib/datasets/downloader.rb +110 -35
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +2 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +14 -5
- data/lib/datasets/zip-extractor.rb +12 -0
- data/lib/datasets.rb +2 -34
- data/test/test-geolonia.rb +10 -9
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-rdataset.rb +2 -2
- data/test/test-seaborn.rb +1 -0
- data/test/test-sudachi-synonym-dictionary.rb +3 -3
- data/test/test-wikipedia.rb +25 -71
- metadata +7 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
+  data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
+  data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
data/README.md CHANGED
@@ -1,6 +1,5 @@
 # Red Datasets

-[](https://travis-ci.org/red-data-tools/red-datasets)
 [](https://badge.fury.io/rb/red-datasets)

 ## Description
data/Rakefile CHANGED
@@ -13,9 +13,64 @@ end
 helper.install
 spec = helper.gemspec

+task default: :test
+
 desc "Run tests"
 task :test do
   ruby("test/run-test.rb")
 end

+desc "Generate an artifact for GitHub Pages"
+task :pages do
+  pages_dir = "_site"
+  rm_rf(pages_dir)
+  mkdir_p(pages_dir)
+
+  require "cgi/util"
+  require_relative "lib/datasets/lazy"
+  File.open("#{pages_dir}/index.html", "w") do |index_html|
+    index_html.puts(<<-HTML)
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="UTF-8">
+    <title>Red Datasets</title>
+    <style>
+      table {
+        margin-left: 20vw;
+        min-width: 50%;
+      }
+      th {
+        font-size: 30px;
+        padding: 20px;
+      }
+      td {
+        border-bottom: 1px solid #D9DCE0;
+        padding: 20px;
+        font-weight: bold;
+      }
+    </style>
+  </head>
+  <body>
+    <section>
+      <h1>Red Datasets</h1>
+      <table>
+        <thead>
+          <tr><th>Available datasets</th></tr>
+        </thead>
+        <tbody>
+    HTML
+    Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
+      index_html.puts(<<-HTML)
+          <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
+      HTML
+    end
+    index_html.puts(<<-HTML)
+        </tbody>
+      </table>
+    </section>
+  </body>
+</html>
+    HTML
+  end
+end
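The `:pages` task HTML-escapes each constant name before writing it into the table. A minimal sketch of that step in isolation, using `:Iris` as a stand-in for one entry from `Datasets::LAZY_LOADER.constant_names`:

```ruby
require "cgi/util"

# :Iris stands in for one entry from Datasets::LAZY_LOADER.constant_names.
constant_name = :Iris
row = "<tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>"
puts row
# => <tr><td>Datasets::Iris</td></tr>
```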
data/doc/text/news.md CHANGED
@@ -1,5 +1,21 @@
 # News

+## 0.1.6 - 2023-05-24
+
+### Improvements
+
+  * Added support for lazy loading by `require "datasets/lazy"`.
+
+  * `Datasets::NagoyaUniversityConversationCorpus`: Added.
+    [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
+    [Patch by matsuura]
+
+  * `Datasets::Wikipedia`: Added support for downloading in background.
+
+### Thanks
+
+  * matsuura
+
 ## 0.1.5 - 2022-09-22

 ### Improvements
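The lazy-loading entry above changes only when dataset files are required; a minimal sketch of the two entry points, assuming the gem is installed:

```ruby
# Eager (unchanged behavior): requires every bundled dataset file up front.
require "datasets"

# Lazy (new in 0.1.6): registers names only; a dataset file is required
# on the first reference to its constant.
# require "datasets/lazy"
# Datasets::Penguins.new  # only datasets/penguins is loaded here
```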
data/lib/datasets/dataset.rb CHANGED
@@ -33,20 +33,59 @@ module Datasets
       @cache_path ||= CachePath.new(@metadata.id)
     end

-    def download(output_path, url)
+    def download(output_path, url, &block)
       downloader = Downloader.new(url)
-      downloader.download(output_path)
+      downloader.download(output_path, &block)
     end

-    def extract_bz2(
+    def extract_bz2(bz2)
+      case bz2
+      when Pathname, String
+        IO.pipe do |input, output|
+          pid = spawn("bzcat", bz2.to_s, {out: output})
+          begin
+            output.close
+            yield(input)
+          ensure
+            input.close
+            Process.waitpid(pid)
+          end
+        end
+      else
+        IO.pipe do |bz2_input, bz2_output|
+          IO.pipe do |plain_input, plain_output|
+            bz2_stop = false
+            bz2_thread = Thread.new do
+              begin
+                bz2.each do |chunk|
+                  bz2_output.write(chunk)
+                  bz2_output.flush
+                  break if bz2_stop
+                end
+              rescue => error
+                message = "Failed to read bzcat input: " +
+                          "#{error.class}: #{error.message}"
+                $stderr.puts(message)
+              ensure
+                bz2_output.close
+              end
+            end
+            begin
+              pid = spawn("bzcat", {in: bz2_input, out: plain_output})
+              begin
+                bz2_input.close
+                plain_output.close
+                yield(plain_input)
+              ensure
+                plain_input.close
+                Process.waitpid(pid)
+              end
+            ensure
+              bz2_stop = true
+              bz2_thread.join
+            end
+          end
+        end
+      end
     end
   end
 end
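The new `extract_bz2` streams decompression through an external `bzcat` process instead of reading the whole archive into memory. A standalone sketch of the `Pathname`/`String` branch, assuming `bzcat` is on `PATH` and an `articles.xml.bz2` file exists:

```ruby
# Stream-decompress a .bz2 file through an external bzcat process.
def open_bz2(bz2_path)
  IO.pipe do |input, output|
    pid = spawn("bzcat", bz2_path.to_s, {out: output})
    begin
      output.close  # the parent only reads; bzcat holds the write end
      yield(input)
    ensure
      input.close
      Process.waitpid(pid)  # reap the child process
    end
  end
end

open_bz2("articles.xml.bz2") do |input|
  input.each_line { |line| puts line }
end
```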
data/lib/datasets/downloader.rb CHANGED
@@ -22,50 +22,115 @@ module Datasets
       end
     end

-    def download(output_path)
+    def download(output_path, &block)
+      if output_path.exist?
+        yield_chunks(output_path, &block) if block_given?
+        return
+      end

-      headers = {
-        "Accept-Encoding" => "identity",
-        "User-Agent" => "Red Datasets/#{VERSION}",
-      }
-      start = nil
       partial_output_path = Pathname.new("#{output_path}.partial")
-        headers["Range"] = "bytes=#{start}-"
-      end
+      synchronize(output_path, partial_output_path) do
+        output_path.parent.mkpath

+        n_retries = 0
+        n_max_retries = 5
+        begin
+          headers = {
+            "Accept-Encoding" => "identity",
+            "User-Agent" => "Red Datasets/#{VERSION}",
+          }
+          start = nil
+          if partial_output_path.exist?
+            start = partial_output_path.size
+            headers["Range"] = "bytes=#{start}-"
+          end
+
+          start_http(@url, headers) do |response|
+            if response.is_a?(Net::HTTPPartialContent)
+              mode = "ab"
+            else
+              start = nil
+              mode = "wb"
+            end
+
+            base_name = @url.path.split("/").last
+            size_current = 0
+            size_max = response.content_length
+            if start
+              size_current += start
+              size_max += start
+              if block_given? and n_retries.zero?
+                yield_chunks(partial_output_path, &block)
+              end
+            end
+            progress_reporter = ProgressReporter.new(base_name, size_max)
+            partial_output_path.open(mode) do |output|
+              response.read_body do |chunk|
+                size_current += chunk.bytesize
+                progress_reporter.report(size_current)
+                output.write(chunk)
+                yield(chunk) if block_given?
+              end
+            end
+          end
+          FileUtils.mv(partial_output_path, output_path)
+        rescue Net::ReadTimeout => error
+          n_retries += 1
+          retry if n_retries < n_max_retries
+          raise
+        rescue TooManyRedirects => error
+          last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
+          raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
+        end
+      end
+    end
+
+    private def synchronize(output_path, partial_output_path)
+      begin
+        Process.getpgid(Process.pid)
+      rescue NotImplementedError
+        return yield
+      end
+
+      lock_path = Pathname("#{output_path}.lock")
+      loop do
+        lock_path.parent.mkpath
+        begin
+          lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
+        rescue SystemCallError
+          valid_lock_path = true
+          begin
+            pid = Integer(lock_path.read.chomp, 10)
+          rescue ArgumentError
+            # The process that acquired the lock will be exited before
+            # it stores its process ID.
+            valid_lock_path = (lock_path.mtime > 10)
+          else
+            begin
+              Process.getpgid(pid)
+            rescue SystemCallError
+              # Process that acquired the lock doesn't exist
+              valid_lock_path = false
+            end
+          end
+          if valid_lock_path
+            sleep(1 + rand(10))
+          else
+            lock_path.delete
+          end
+          retry
+        else
+          begin
+            lock.puts(Process.pid.to_s)
+            lock.flush
+            yield
+          ensure
+            lock.close
+            lock_path.delete
+          end
+          break
+        end
+      end
-      FileUtils.mv(partial_output_path, output_path)
-    rescue TooManyRedirects => error
-      last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
-      raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
     end

     private def start_http(url, headers, limit = 10, &block)
@@ -99,6 +164,16 @@ module Datasets
       end
     end

+    private def yield_chunks(path)
+      path.open("rb") do |output|
+        chunk_size = 1024 * 1024
+        chunk = ""
+        while output.read(chunk_size, chunk)
+          yield(chunk)
+        end
+      end
+    end
+
     class ProgressReporter
       def initialize(base_name, size_max)
         @base_name = base_name
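The new `synchronize` helper makes concurrent processes downloading the same file take turns via an exclusive lock file. A minimal sketch of the core pattern, with a hypothetical `with_lock` helper that omits the stale-lock recovery shown above:

```ruby
require "pathname"

# File::CREAT | File::EXCL makes open fail if the lock file already
# exists, so exactly one process enters the critical section at a time.
def with_lock(path)
  lock_path = Pathname("#{path}.lock")
  begin
    lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
  rescue SystemCallError
    sleep(1)  # another process holds the lock; wait and retry
    retry
  end
  begin
    lock.puts(Process.pid)
    lock.flush
    yield
  ensure
    lock.close
    lock_path.delete
  end
end

with_lock("iris.csv") do
  # download or write iris.csv here; no other process runs this
  # block for the same path concurrently
end
```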
data/lib/datasets/lazy.rb ADDED
@@ -0,0 +1,90 @@
+require_relative "version"
+
+module Datasets
+  class LazyLoader
+    def initialize
+      @constants = {}
+    end
+
+    def exist?(constant_name)
+      @constants.key?(constant_name)
+    end
+
+    def load(constant_name)
+      feature = @constants[constant_name]
+      raise LoadError, "unknown dataset: #{constant_name}" unless feature
+      require feature
+    end
+
+    def load_all
+      @constants.each_value do |feature|
+        require feature
+      end
+    end
+
+    def register(constant_name, feature)
+      @constants[constant_name] = feature
+    end
+
+    def constant_names
+      @constants.keys
+    end
+  end
+
+  LAZY_LOADER = LazyLoader.new
+
+  class << self
+    def const_missing(name)
+      if LAZY_LOADER.exist?(name)
+        LAZY_LOADER.load(name)
+        const_get(name)
+      else
+        super
+      end
+    end
+  end
+
+  LAZY_LOADER.register(:Adult, "datasets/adult")
+  LAZY_LOADER.register(:AFINN, "datasets/afinn")
+  LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
+  LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
+  LAZY_LOADER.register(:CIFAR, "datasets/cifar")
+  LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
+  LAZY_LOADER.register(:Communities, "datasets/communities")
+  LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
+  LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
+  LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
+  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
+  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
+  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+  LAZY_LOADER.register(:Iris, "datasets/iris")
+  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
+  LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
+  LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
+  LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
+  LAZY_LOADER.register(:MNIST, "datasets/mnist")
+  LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+  LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                       "datasets/nagoya-university-conversation-corpus")
+  LAZY_LOADER.register(:Penguins, "datasets/penguins")
+  LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
+  LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
+  LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
+  LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
+                       "datasets/quora-duplicate-question-pair")
+  LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
+  LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
+  LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
+  LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
+  LAZY_LOADER.register(:SudachiSynonymDictionary,
+                       "datasets/sudachi-synonym-dictionary")
+  LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
+  LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
+                       "datasets/wikipedia-kyoto-japanese-english")
+  LAZY_LOADER.register(:Wine, "datasets/wine")
+end
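With this registry in place, referencing an unloaded dataset constant triggers `Datasets.const_missing`, which requires only the corresponding file. A usage sketch:

```ruby
require "datasets/lazy"

# Nothing under lib/datasets/ is required yet. Referencing
# Datasets::Iris triggers const_missing, which requires
# "datasets/iris" and then resolves the constant.
iris = Datasets::Iris.new
iris.each do |record|
  p record
  break
end
```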
data/lib/datasets/nagoya-university-conversation-corpus.rb ADDED
@@ -0,0 +1,109 @@
+require_relative 'dataset'
+require_relative 'zip-extractor'
+
+module Datasets
+  class NagoyaUniversityConversationCorpus < Dataset
+    Data = Struct.new(
+      :name,
+      :date,
+      :place,
+      :participants,
+      :relationships,
+      :note,
+      :sentences
+    )
+
+    Participant = Struct.new(
+      :id,
+      :attribute,
+      :birthplace,
+      :residence
+    )
+
+    Sentence = Struct.new(:participant_id, :content) do
+      def end?
+        participant_id.nil? and content.nil?
+      end
+    end
+
+    def initialize
+      super()
+      @metadata.id = 'nagoya-university-conversation-curpus'
+      @metadata.name = 'Nagoya University Conversation Curpus'
+      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+      @metadata.licenses = ['CC-BY-NC-ND-4.0']
+      @metadata.description = <<~DESCRIPTION
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |input_stream|
+        yield(parse_file(input_stream))
+      end
+    end
+
+    private
+
+    def open_data
+      data_path = cache_dir_path + 'nucc.zip'
+      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+      download(data_path, data_url)
+
+      extractor = ZipExtractor.new(data_path)
+      extractor.extract_files do |input_stream|
+        yield(input_stream)
+      end
+    end
+
+    def parse_file(input_stream)
+      data = Data.new
+      participants = []
+      sentences = []
+
+      input_stream.each do |input|
+        input.each_line(chomp: true) do |line|
+          line.force_encoding('utf-8')
+          if line.start_with?('@データ')
+            data.name = line[4..]
+          elsif line.start_with?('@収集年月日')
+            # mixed cases with and without ':'
+            data.date = line[6..].delete_prefix(':')
+          elsif line.start_with?('@場所')
+            data.place = line[4..]
+          elsif line.start_with?('@参加者の関係')
+            data.relationships = line.split(':', 2)[1]
+          elsif line.start_with?('@参加者')
+            participant = Participant.new
+            participant.id, profiles = line[4..].split(':', 2)
+            participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+            participants << participant
+          elsif line.start_with?('%com')
+            data.note = line.split(':', 2)[1]
+          elsif line == '@END'
+            sentence = Sentence.new
+            sentence.participant_id = nil
+            sentence.content = nil
+
+            sentences << sentence
+          else
+            sentence = Sentence.new
+            sentence.participant_id, sentence.content = line.split(':', 2)
+
+            sentences << sentence
+          end
+        end
+      end
+
+      data.participants = participants
+      data.sentences = sentences
+
+      data
+    end
+  end
+end
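A usage sketch for the new class; on first use it downloads and extracts nucc.zip, then yields one `Data` record per conversation file:

```ruby
require "datasets"

corpus = Datasets::NagoyaUniversityConversationCorpus.new
corpus.each do |conversation|
  puts conversation.name
  puts conversation.date
  conversation.sentences.each do |sentence|
    next if sentence.end?  # the record generated for each @END marker
    puts "#{sentence.participant_id}: #{sentence.content}"
  end
  break  # just the first conversation
end
```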
data/lib/datasets/penguins.rb
CHANGED
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb CHANGED
@@ -53,13 +53,22 @@ module Datasets
     end

     private
+    def base_name
+      "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
+    end
+
+    def data_path
+      cache_dir_path + base_name
+    end
+
     def open_data(&block)
-      base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
-      data_path = cache_dir_path + base_name
       data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
+      bz2 = Enumerator.new do |yielder|
+        download(data_path, data_url) do |bz2_chunk|
+          yielder << bz2_chunk
+        end
+      end
+      extract_bz2(bz2, &block)
     end

     def type_in_path
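`open_data` now wraps the chunked download in an `Enumerator`, so `extract_bz2` can consume chunks while the dump is still downloading. A minimal sketch of the same producer/consumer shape, with a hypothetical producer standing in for `download`:

```ruby
# Hypothetical producer standing in for download(data_path, data_url) { ... }.
def fetch_chunks
  3.times { |i| yield("chunk-#{i}") }
end

# Enumerator.new is lazy: fetch_chunks runs only as the consumer pulls
# chunks, so downloading and decompressing can overlap.
chunks = Enumerator.new do |yielder|
  fetch_chunks do |chunk|
    yielder << chunk
  end
end

chunks.each do |chunk|
  puts chunk  # => chunk-0, chunk-1, chunk-2
end
```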
data/lib/datasets/zip-extractor.rb CHANGED
@@ -32,5 +32,17 @@ module Datasets
       end
       nil
     end
+
+    def extract_files
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+
+          entry.get_input_stream do |input|
+            yield(input)
+          end
+        end
+      end
+    end
   end
 end
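`extract_files` iterates every file entry in the archive and yields its input stream, using the rubyzip API. A standalone sketch of the same calls, assuming an `example.zip` exists on disk:

```ruby
require "zip"  # rubyzip

Zip::File.open("example.zip") do |zip_file|
  zip_file.each do |entry|
    next unless entry.file?  # skip directory entries

    entry.get_input_stream do |input|
      puts "#{entry.name}: #{input.read.bytesize} bytes"
    end
  end
end
```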
data/lib/datasets.rb CHANGED
@@ -1,34 +1,2 @@
-require_relative "datasets/
-
-require_relative "datasets/adult"
-require_relative "datasets/afinn"
-require_relative "datasets/aozora-bunko"
-require_relative "datasets/california-housing"
-require_relative "datasets/cifar"
-require_relative "datasets/cldr-plurals"
-require_relative "datasets/communities"
-require_relative "datasets/diamonds"
-require_relative "datasets/e-stat-japan"
-require_relative "datasets/fashion-mnist"
-require_relative "datasets/fuel-economy"
-require_relative "datasets/geolonia"
-require_relative "datasets/hepatitis"
-require_relative "datasets/iris"
-require_relative "datasets/ita-corpus"
-require_relative "datasets/kuzushiji-mnist"
-require_relative "datasets/libsvm"
-require_relative "datasets/libsvm-dataset-list"
-require_relative "datasets/livedoor-news"
-require_relative "datasets/mnist"
-require_relative "datasets/mushroom"
-require_relative "datasets/penguins"
-require_relative "datasets/penn-treebank"
-require_relative "datasets/pmjt-dataset-list"
-require_relative "datasets/postal-code-japan"
-require_relative "datasets/quora-duplicate-question-pair"
-require_relative "datasets/rdataset"
-require_relative "datasets/seaborn"
-require_relative "datasets/sudachi-synonym-dictionary"
-require_relative "datasets/wikipedia"
-require_relative "datasets/wikipedia-kyoto-japanese-english"
-require_relative "datasets/wine"
+require_relative "datasets/lazy"
+Datasets::LAZY_LOADER.load_all
data/test/test-geolonia.rb CHANGED
@@ -6,7 +6,7 @@ class GeoloniaTest < Test::Unit::TestCase
   test('#each') do
     records = @dataset.each.to_a
     assert_equal([
-
+                   277616,
                   {
                     :prefecture_code => "01",
                     :prefecture_name => "北海道",
@@ -28,16 +28,16 @@ class GeoloniaTest < Test::Unit::TestCase
                     :prefecture_name => "沖縄県",
                     :prefecture_kana => "オキナワケン",
                     :prefecture_romaji => "OKINAWA KEN",
-                    :municipality_code => "
-                    :municipality_name => "
-                    :municipality_kana => "
-                    :municipality_romaji => "
-                    :street_name => "
+                    :municipality_code => "47382",
+                    :municipality_name => "八重山郡与那国町",
+                    :municipality_kana => "ヤエヤマグンヨナグニチョウ",
+                    :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
+                    :street_name => "字与那国",
                     :street_kana => nil,
                     :street_romaji => nil,
-                    :alias =>
-                    :latitude => "
-                    :longitude => "
+                    :alias => nil,
+                    :latitude => "24.455925",
+                    :longitude => "122.987678",
                   },
                 ],
                 [
@@ -55,6 +55,7 @@ class GeoloniaTest < Test::Unit::TestCase
                 "## 住所データ仕様",
                 "### ファイルフォーマット",
                 "### 列",
+                "### ソート順",
               ],
               description.scan(/^#.*$/),
               description)
data/test/test-nagoya-university-conversation-corpus.rb ADDED
@@ -0,0 +1,132 @@
+class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::NagoyaUniversityConversationCorpus.new
+  end
+
+  sub_test_case("each") do
+    test("#sentences") do
+      records = @dataset.each.to_a
+      first_sentences = records[0].sentences
+      last_sentences = records[-1].sentences
+      assert_equal([
+                     856,
+                     {
+                       participant_id: 'F107',
+                       content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
+                     },
+                     {
+                       participant_id: nil,
+                       content: nil
+                     },
+                     603,
+                     {
+                       participant_id: 'F007',
+                       content: 'それでは話を始めまーす。'
+                     },
+                     {
+                       participant_id: nil,
+                       content: nil
+                     }
+                   ],
+                   [
+                     first_sentences.size,
+                     first_sentences[0].to_h,
+                     first_sentences[-1].to_h,
+                     last_sentences.size,
+                     last_sentences[0].to_h,
+                     last_sentences[-1].to_h,
+                   ])
+    end
+
+    test("#participants") do
+      records = @dataset.each.to_a
+      first_participants = records[0].participants
+      last_participants = records[-1].participants
+      assert_equal([
+                     4,
+                     {
+                       id: 'F107',
+                       attribute: '女性30代後半',
+                       birthplace: '愛知県幡豆郡出身',
+                       residence: '愛知県幡豆郡在住'
+                     },
+                     {
+                       id: 'F128',
+                       attribute: '女性20代前半',
+                       birthplace: '愛知県西尾市出身',
+                       residence: '西尾市在住'
+                     },
+                     2,
+                     {
+                       id: 'F007',
+                       attribute: '女性50代後半',
+                       birthplace: '東京都出身',
+                       residence: '東京都国分寺市在住'
+                     },
+                     {
+                       id: 'F003',
+                       attribute: '女性80代後半',
+                       birthplace: '栃木県宇都宮市出身',
+                       residence: '国分寺市在住'
+                     }
+                   ],
+                   [
+                     first_participants.size,
+                     first_participants[0].to_h,
+                     first_participants[-1].to_h,
+                     last_participants.size,
+                     last_participants[0].to_h,
+                     last_participants[-1].to_h
+                   ])
+    end
+
+    test("others") do
+      records = @dataset.each.to_a
+      assert_equal([
+                     129,
+                     [
+                       '1(約35分)',
+                       '2001年10月16日',
+                       'ファミリーレストラン',
+                       '英会話教室の友人',
+                       nil
+                     ],
+                     [
+                       '129(36分)',
+                       '2003年2月16日',
+                       '二人の自宅',
+                       '母と娘',
+                       'F007は東京に38年、F003は東京に60年居住。'
+                     ]
+                   ],
+                   [
+                     records.size,
+                     [
+                       records[0].name,
+                       records[0].date,
+                       records[0].place,
+                       records[0].relationships,
+                       records[0].note
+                     ],
+                     [
+                       records[-1].name,
+                       records[-1].date,
+                       records[-1].place,
+                       records[-1].relationships,
+                       records[-1].note
+                     ]
+                   ])
+    end
+  end
+
+  sub_test_case("#metadata") do
+    test("#description") do
+      description = @dataset.metadata.description
+      assert_equal(<<~DESCRIPTION, description)
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+  end
+end
data/test/test-rdataset.rb CHANGED
@@ -8,7 +8,7 @@ class RdatasetTest < Test::Unit::TestCase
   test("with package_name") do
     records = @dataset.filter(package: "datasets").to_a
     assert_equal([
-
+                   102,
                   {
                     package: "datasets",
                     dataset: "ability.cov",
@@ -48,7 +48,7 @@ class RdatasetTest < Test::Unit::TestCase
   test("without package_name") do
     records = @dataset.each.to_a
     assert_equal([
-
+                   2142,
                   {
                     package: "AER",
                     dataset: "Affairs",
data/test/test-seaborn.rb CHANGED

data/test/test-sudachi-synonym-dictionary.rb CHANGED
@@ -6,7 +6,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
   test('#each') do
     records = @dataset.each.to_a
     assert_equal([
-
+                   65206,
                   {
                     group_id: "000001",
                     is_noun: true,
@@ -19,7 +19,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                     notation: "曖昧",
                   },
                   {
-                    group_id: "
+                    group_id: "024916",
                     is_noun: true,
                     expansion_type: :expanded,
                     lexeme_id: 1,
@@ -27,7 +27,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                     acronym_type: :alphabet,
                     variant_type: :typical,
                     categories: ["ビジネス"],
-                    notation: "
+                    notation: "SCM",
                   },
                 ],
                 [
data/test/test-wikipedia.rb CHANGED
@@ -1,100 +1,54 @@
 class WikipediaTest < Test::Unit::TestCase
-  sub_test_case("
+  sub_test_case("en") do
     sub_test_case("articles") do
-      include Helper::Sandbox
-
       def setup
-        @dataset = Datasets::Wikipedia.new(language: :ja,
+        @dataset = Datasets::Wikipedia.new(language: :en,
                                            type: :articles)
-        def @dataset.cache_dir_path
-          @cache_dir_path
-        end
-        def @dataset.cache_dir_path=(path)
-          @cache_dir_path = path
-        end
-        @dataset.cache_dir_path = @tmp_dir
-      end
-
-      def teardown
-        teardown_sandbox
       end

       test("#each") do
-        xml_path = output_path.sub_ext("")
-        xml_path.open("w") do |xml_file|
-          xml_file.puts(<<-XML)
-<mediawiki
-  xmlns="http://www.mediawiki.org/xml/export-0.10/"
-  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
-  version="0.10" xml:lang="ja">
-  <siteinfo>
-    <sitename>Wikipedia</sitename>
-  </siteinfo>
-  <page>
-    <title>タイトル</title>
-    <ns>4</ns>
-    <id>1</id>
-    <restrictions>sysop</restrictions>
-    <revision>
-      <id>3</id>
-      <parentid>2</parentid>
-      <timestamp>2004-04-30T14:46:00Z</timestamp>
-      <contributor>
-        <username>user</username>
-        <id>10</id>
-      </contributor>
-      <minor />
-      <comment>コメント</comment>
-      <model>wikitext</model>
-      <format>text/x-wiki</format>
-      <text xml:space="preserve">テキスト</text>
-      <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
-    </revision>
-  </page>
-</mediawiki>
-          XML
-        end
-        unless system("bzip2", xml_path.to_s)
-          raise "failed to run bzip2"
-        end
-
-        contributor = Datasets::Wikipedia::Contributor.new("user", 10)
+        contributor = Datasets::Wikipedia::Contributor.new("Elli", 20842734)
         revision = Datasets::Wikipedia::Revision.new
-        revision.id =
-        revision.parent_id =
-        revision.timestamp = Time.iso8601("
+        revision.id = 1002250816
+        revision.parent_id = 854851586
+        revision.timestamp = Time.iso8601("2021-01-23T15:15:01Z")
         revision.contributor = contributor
-        revision.comment = "
+        revision.comment = "shel"
         revision.model = "wikitext"
         revision.format = "text/x-wiki"
-        revision.text =
+        revision.text = <<-TEXT.chomp
+#REDIRECT [[Computer accessibility]]
+
+{{rcat shell|
+{{R from move}}
+{{R from CamelCase}}
+{{R unprintworthy}}
+}}
+        TEXT
+        revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
         page = Datasets::Wikipedia::Page.new
-        page.title = "
-        page.namespace =
-        page.id =
-        page.restrictions =
+        page.title = "AccessibleComputing"
+        page.namespace = 0
+        page.id = 10
+        page.restrictions = nil
+        page.redirect = "Computer accessibility"
         page.revision = revision
         assert_equal(page, @dataset.each.first)
       end

       sub_test_case("#metadata") do
         test("#id") do
-          assert_equal("wikipedia-
+          assert_equal("wikipedia-en-articles",
                        @dataset.metadata.id)
         end

         test("#name") do
-          assert_equal("Wikipedia articles (
+          assert_equal("Wikipedia articles (en)",
                        @dataset.metadata.name)
         end

         test("#description") do
-          assert_equal("Wikipedia articles in
+          assert_equal("Wikipedia articles in en",
                        @dataset.metadata.description)
         end
       end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: red-datasets
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - tomisuker
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2023-05-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: csv
@@ -163,6 +163,7 @@ files:
 - lib/datasets/iris.rb
 - lib/datasets/ita-corpus.rb
 - lib/datasets/kuzushiji-mnist.rb
+- lib/datasets/lazy.rb
 - lib/datasets/libsvm-dataset-list.rb
 - lib/datasets/libsvm.rb
 - lib/datasets/license.rb
@@ -170,6 +171,7 @@ files:
 - lib/datasets/metadata.rb
 - lib/datasets/mnist.rb
 - lib/datasets/mushroom.rb
+- lib/datasets/nagoya-university-conversation-corpus.rb
 - lib/datasets/penguins.rb
 - lib/datasets/penn-treebank.rb
 - lib/datasets/pmjt-dataset-list.rb
@@ -214,6 +216,7 @@ files:
 - test/test-metadata.rb
 - test/test-mnist.rb
 - test/test-mushroom.rb
+- test/test-nagoya-university-conversation-corpus.rb
 - test/test-penguins.rb
 - test/test-penn-treebank.rb
 - test/test-pmjt-dataset-list.rb
@@ -245,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.5.0.dev
signing_key:
 specification_version: 4
 summary: Red Datasets provides classes that provide common datasets such as iris dataset.
@@ -278,6 +281,7 @@ test_files:
 - test/test-metadata.rb
 - test/test-mnist.rb
 - test/test-mushroom.rb
+- test/test-nagoya-university-conversation-corpus.rb
 - test/test-penguins.rb
 - test/test-penn-treebank.rb
 - test/test-pmjt-dataset-list.rb