red-datasets 0.1.4 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -3
- data/Rakefile +56 -1
- data/doc/text/news.md +102 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +58 -23
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +110 -30
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +6 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +16 -8
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +48 -0
- data/lib/datasets.rb +2 -22
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +65 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- data/test/test-wikipedia.rb +25 -71
- metadata +62 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,90 @@
|
|
1
|
+
require_relative "version"
|
2
|
+
|
3
|
+
module Datasets
  # Registry that defers the `require` of each dataset implementation
  # until the corresponding constant is first referenced.
  class LazyLoader
    def initialize
      @constants = {}
    end

    # Whether +constant_name+ has a registered feature.
    def exist?(constant_name)
      @constants.key?(constant_name)
    end

    # Requires the feature registered for +constant_name+.
    # Raises LoadError when the name was never registered.
    def load(constant_name)
      feature = @constants[constant_name]
      unless feature
        raise LoadError, "unknown dataset: #{constant_name}"
      end
      require feature
    end

    # Eagerly requires every registered feature.
    def load_all
      @constants.each_value { |feature| require feature }
    end

    # Associates +constant_name+ with the +feature+ to require later.
    def register(constant_name, feature)
      @constants[constant_name] = feature
    end

    # All registered constant names, in registration order.
    def constant_names
      @constants.keys
    end
  end

  LAZY_LOADER = LazyLoader.new

  class << self
    # Resolves registered dataset constants on first use by requiring
    # the backing file, then delegating to the normal lookup.
    def const_missing(name)
      if LAZY_LOADER.exist?(name)
        LAZY_LOADER.load(name)
        const_get(name)
      else
        super
      end
    end
  end

  {
    Adult: "datasets/adult",
    AFINN: "datasets/afinn",
    AozoraBunko: "datasets/aozora-bunko",
    CaliforniaHousing: "datasets/california-housing",
    CIFAR: "datasets/cifar",
    CLDRPlurals: "datasets/cldr-plurals",
    Communities: "datasets/communities",
    Diamonds: "datasets/diamonds",
    EStatJapan: "datasets/e-stat-japan",
    FashionMNIST: "datasets/fashion-mnist",
    FuelEconomy: "datasets/fuel-economy",
    Geolonia: "datasets/geolonia",
    Hepatitis: "datasets/hepatitis",
    Iris: "datasets/iris",
    ITACorpus: "datasets/ita-corpus",
    KuzushijiMNIST: "datasets/kuzushiji-mnist",
    LIBSVM: "datasets/libsvm",
    LIBSVMDatasetList: "datasets/libsvm-dataset-list",
    LivedoorNews: "datasets/livedoor-news",
    MNIST: "datasets/mnist",
    Mushroom: "datasets/mushroom",
    NagoyaUniversityConversationCorpus: "datasets/nagoya-university-conversation-corpus",
    Penguins: "datasets/penguins",
    PennTreebank: "datasets/penn-treebank",
    PMJTDatasetList: "datasets/pmjt-dataset-list",
    PostalCodeJapan: "datasets/postal-code-japan",
    QuoraDuplicateQuestionPair: "datasets/quora-duplicate-question-pair",
    RdatasetList: "datasets/rdataset",
    # For backward compatibility
    RdatasetsList: "datasets/rdataset",
    Rdataset: "datasets/rdataset",
    # For backward compatibility
    Rdatasets: "datasets/rdataset",
    SeabornList: "datasets/seaborn",
    Seaborn: "datasets/seaborn",
    SudachiSynonymDictionary: "datasets/sudachi-synonym-dictionary",
    Wikipedia: "datasets/wikipedia",
    WikipediaKyotoJapaneseEnglish: "datasets/wikipedia-kyoto-japanese-english",
    Wine: "datasets/wine"
  }.each do |constant_name, feature|
    LAZY_LOADER.register(constant_name, feature)
  end
end
|
@@ -28,6 +28,7 @@ module Datasets
|
|
28
28
|
@metadata.id = "libsvm-dataset-list"
|
29
29
|
@metadata.name = "LIBSVM dataset list"
|
30
30
|
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
31
|
+
@metadata.licenses = ["BSD-3-Clause"]
|
31
32
|
@metadata.description = lambda do
|
32
33
|
extract_description
|
33
34
|
end
|
@@ -51,10 +52,8 @@ module Datasets
|
|
51
52
|
private
|
52
53
|
def open_data
|
53
54
|
data_path = cache_dir_path + "index.html"
|
54
|
-
|
55
|
-
|
56
|
-
end
|
57
|
-
::File.open(data_path) do |input|
|
55
|
+
download(data_path, @metadata.url)
|
56
|
+
data_path.open do |input|
|
58
57
|
yield(input)
|
59
58
|
end
|
60
59
|
end
|
@@ -78,10 +77,8 @@ module Datasets
|
|
78
77
|
|
79
78
|
def open_detail(detail)
|
80
79
|
data_path = cache_dir_path + detail
|
81
|
-
|
82
|
-
|
83
|
-
end
|
84
|
-
::File.open(data_path) do |input|
|
80
|
+
download(data_path, @metadata.url + detail)
|
81
|
+
data_path.open do |input|
|
85
82
|
yield(input)
|
86
83
|
end
|
87
84
|
end
|
data/lib/datasets/libsvm.rb
CHANGED
@@ -41,6 +41,7 @@ module Datasets
|
|
41
41
|
@metadata.id = "libsvm-#{normalize_name(name)}"
|
42
42
|
@metadata.name = "LIBSVM dataset: #{name}"
|
43
43
|
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
44
|
+
@metadata.licenses = ["BSD-3-Clause"]
|
44
45
|
end
|
45
46
|
|
46
47
|
def each
|
@@ -99,13 +100,11 @@ module Datasets
|
|
99
100
|
|
100
101
|
def open_data(&block)
|
101
102
|
data_path = cache_dir_path + @file.name
|
102
|
-
|
103
|
-
download(data_path, @file.url)
|
104
|
-
end
|
103
|
+
download(data_path, @file.url)
|
105
104
|
if data_path.extname == ".bz2"
|
106
105
|
extract_bz2(data_path, &block)
|
107
106
|
else
|
108
|
-
|
107
|
+
data_path.open(&block)
|
109
108
|
end
|
110
109
|
end
|
111
110
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Datasets
  # Value object describing a dataset license: an SPDX identifier plus
  # an optional human-readable name and URL.
  class License < Struct.new(:spdx_id,
                             :name,
                             :url)
    class << self
      # Converts +value+ into a License when possible.
      #
      # Accepts a License (returned as-is), a String (used as the SPDX
      # ID), or a Hash with :spdx_id, :name and :url keys.  Returns nil
      # for anything else.
      def try_convert(value)
        case value
        when self
          value
        when String
          new(value)
        when Hash
          new(value[:spdx_id], value[:name], value[:url])
        else
          nil
        end
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "tar-gz-readable"
|
3
|
+
|
4
|
+
module Datasets
  # The livedoor news corpus (ldcc) distributed by RONDHUIT.  Articles
  # of one news category are streamed as Record structs.
  class LivedoorNews < Dataset
    include TarGzReadable

    # One article: source URL, publication time, and the article body.
    Record = Struct.new(:url,
                        :timestamp,
                        :sentence)

    # @param type [Symbol] the news category to iterate
    #   (default: :topic_news); must be one of the nine listed below.
    def initialize(type: :topic_news)
      valid_types = [
        :topic_news,
        :sports_watch,
        :it_life_hack,
        :kaden_channel,
        :movie_enter,
        :dokujo_tsushin,
        :smax,
        :livedoor_homme,
        :peachy
      ]
      unless valid_types.include?(type)
        labels = valid_types.collect(&:inspect).join(", ")
        raise ArgumentError,
              ":type must be one of [#{labels}]: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = 'livedoor-news'
      @metadata.name = 'livedoor-news'
      @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
      @metadata.licenses = ['CC-BY-ND-2.1-JP']
      # Lazy: the README is only fetched when the description is read.
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields a Record per article in the selected category.
    def each(&block)
      return to_enum(__method__) unless block_given?

      parse_data(download_tar_gz, &block)
    end

    private
    # Downloads the corpus archive into the cache; returns its path.
    def download_tar_gz
      archive_path = cache_dir_path + "livedoor-news.tar.gz"
      download(archive_path, "https://www.rondhuit.com/download/ldcc-20140209.tar.gz")
      archive_path
    end

    # Reads text/README.txt out of the archive for the description.
    def fetch_readme
      archive_path = download_tar_gz
      open_tar_gz(archive_path) do |tar|
        tar.seek('text/README.txt') do |entry|
          return entry.read.force_encoding("UTF-8")
        end
      end
    end

    # Walks the archive and yields a Record for every article file in
    # the directory corresponding to @type.
    def parse_data(archive_path, &block)
      wanted_directory = "text/#{@type.to_s.gsub(/_/, '-')}"
      open_tar_gz(archive_path) do |tar|
        tar.each do |entry|
          next unless entry.file?
          directory, file_name = File.split(entry.full_name)
          next unless directory == wanted_directory
          next if file_name == "LICENSE.txt"
          # First line is the URL, second the ISO8601 timestamp, the
          # remainder is the article text.
          url, timestamp, body = entry.read.force_encoding("UTF-8").split("\n", 3)
          yield(Record.new(url, Time.iso8601(timestamp), body))
        end
      end
    end
  end
end
|
data/lib/datasets/metadata.rb
CHANGED
@@ -1,9 +1,23 @@
|
|
1
|
+
require_relative "license"
|
2
|
+
|
1
3
|
module Datasets
|
2
4
|
class Metadata < Struct.new(:id,
|
3
5
|
:name,
|
4
6
|
:url,
|
5
7
|
:licenses,
|
6
8
|
:description)
|
9
|
+
def licenses=(licenses)
|
10
|
+
licenses = [licenses] unless licenses.is_a?(Array)
|
11
|
+
licenses = licenses.collect do |license|
|
12
|
+
l = License.try_convert(license)
|
13
|
+
if l.nil?
|
14
|
+
raise ArgumentError.new("invalid license: #{license.inspect}")
|
15
|
+
end
|
16
|
+
l
|
17
|
+
end
|
18
|
+
super(licenses)
|
19
|
+
end
|
20
|
+
|
7
21
|
def description
|
8
22
|
description_raw = super
|
9
23
|
if description_raw.respond_to?(:call)
|
data/lib/datasets/mnist.rb
CHANGED
@@ -28,6 +28,7 @@ module Datasets
|
|
28
28
|
@metadata.id = "#{dataset_name.downcase}-#{type}"
|
29
29
|
@metadata.name = "#{dataset_name}: #{type}"
|
30
30
|
@metadata.url = self.class::BASE_URL
|
31
|
+
@metadata.licenses = licenses
|
31
32
|
@type = type
|
32
33
|
|
33
34
|
case type
|
@@ -45,18 +46,17 @@ module Datasets
|
|
45
46
|
label_path = cache_dir_path + target_file(:label)
|
46
47
|
base_url = self.class::BASE_URL
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
unless label_path.exist?
|
53
|
-
download(label_path, base_url + target_file(:label))
|
54
|
-
end
|
49
|
+
download(image_path, base_url + target_file(:image))
|
50
|
+
download(label_path, base_url + target_file(:label))
|
55
51
|
|
56
52
|
open_data(image_path, label_path, &block)
|
57
53
|
end
|
58
54
|
|
59
55
|
private
|
56
|
+
def licenses
|
57
|
+
[]
|
58
|
+
end
|
59
|
+
|
60
60
|
def open_data(image_path, label_path, &block)
|
61
61
|
labels = parse_labels(label_path)
|
62
62
|
|
data/lib/datasets/mushroom.rb
CHANGED
@@ -35,6 +35,7 @@ module Datasets
|
|
35
35
|
@metadata.id = "mushroom"
|
36
36
|
@metadata.name = "Mushroom"
|
37
37
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
|
38
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
38
39
|
@metadata.description = lambda do
|
39
40
|
read_names
|
40
41
|
end
|
@@ -58,10 +59,8 @@ module Datasets
|
|
58
59
|
private
|
59
60
|
def open_data
|
60
61
|
data_path = cache_dir_path + "agaricus-lepiota.data"
|
61
|
-
|
62
|
-
|
63
|
-
download(data_path, data_url)
|
64
|
-
end
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
|
63
|
+
download(data_path, data_url)
|
65
64
|
CSV.open(data_path) do |csv|
|
66
65
|
yield(csv)
|
67
66
|
end
|
@@ -69,10 +68,8 @@ module Datasets
|
|
69
68
|
|
70
69
|
def read_names
|
71
70
|
names_path = cache_dir_path + "agaricus-lepiota.names"
|
72
|
-
|
73
|
-
|
74
|
-
download(names_path, names_url)
|
75
|
-
end
|
71
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
|
72
|
+
download(names_path, names_url)
|
76
73
|
names_path.read
|
77
74
|
end
|
78
75
|
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
  # The Nagoya University Conversation Corpus: transcribed chat
  # conversations among native Japanese speakers, distributed as a ZIP
  # of plain-text files with '@'-prefixed header lines.
  class NagoyaUniversityConversationCorpus < Dataset
    # One conversation file: header metadata plus all parsed sentences.
    Data = Struct.new(
      :name,
      :date,
      :place,
      :participants,
      :relationships,
      :note,
      :sentences
    )

    # One speaker, parsed from an '@参加者' header line.
    Participant = Struct.new(
      :id,
      :attribute,
      :birthplace,
      :residence
    )

    # One utterance; a Sentence with both fields nil marks the '@END'
    # terminator of a conversation.
    Sentence = Struct.new(:participant_id, :content) do
      def end?
        participant_id.nil? and content.nil?
      end
    end

    def initialize
      super()
      # NOTE(review): 'curpus' looks like a typo for 'corpus' in both
      # the id and the name.  The id also determines the cache
      # directory, so fixing it would relocate existing caches —
      # confirm before changing.
      @metadata.id = 'nagoya-university-conversation-curpus'
      @metadata.name = 'Nagoya University Conversation Curpus'
      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
      @metadata.licenses = ['CC-BY-NC-ND-4.0']
      @metadata.description = <<~DESCRIPTION
        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
        total about 100 hours of chatting among native speakers of Japanese,
        which is converted into text.
      DESCRIPTION
    end

    # Yields one Data struct per conversation file in the archive.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |input_stream|
        yield(parse_file(input_stream))
      end
    end

    private

    # Downloads the ZIP archive and yields a stream per contained file.
    def open_data
      data_path = cache_dir_path + 'nucc.zip'
      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
      download(data_path, data_url)

      extractor = ZipExtractor.new(data_path)
      extractor.extract_files do |input_stream|
        yield(input_stream)
      end
    end

    # Parses one conversation file into a Data struct.
    #
    # The elsif order below is significant: '@参加者の関係'
    # (relationships) must be matched before '@参加者' (participant),
    # because start_with?('@参加者') also matches the longer prefix.
    def parse_file(input_stream)
      data = Data.new
      participants = []
      sentences = []

      input_stream.each do |input|
        input.each_line(chomp: true) do |line|
          line.force_encoding('utf-8')
          if line.start_with?('@データ')
            data.name = line[4..]
          elsif line.start_with?('@収集年月日')
            # The date header appears both with and without a ':'
            # separator, so strip an optional leading colon.
            data.date = line[6..].delete_prefix(':')
          elsif line.start_with?('@場所')
            data.place = line[4..]
          elsif line.start_with?('@参加者の関係')
            data.relationships = line.split(':', 2)[1]
          elsif line.start_with?('@参加者')
            # '@参加者<id>:<attribute>、<birthplace>、<residence>'
            participant = Participant.new
            participant.id, profiles = line[4..].split(':', 2)
            participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)

            participants << participant
          elsif line.start_with?('%com')
            data.note = line.split(':', 2)[1]
          elsif line == '@END'
            # Conversation terminator: recorded as an all-nil Sentence
            # so consumers can detect it via Sentence#end?.
            sentence = Sentence.new
            sentence.participant_id = nil
            sentence.content = nil

            sentences << sentence
          else
            sentence = Sentence.new
            sentence.participant_id, sentence.content = line.split(':', 2)

            sentences << sentence
          end
        end
      end

      data.participants = participants
      data.sentences = sentences

      data
    end
  end
end
|
data/lib/datasets/penguins.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
1
3
|
require_relative "dataset"
|
2
4
|
|
3
5
|
module Datasets
|
@@ -23,10 +25,10 @@ module Datasets
|
|
23
25
|
def initialize
|
24
26
|
super
|
25
27
|
species = self.class.name.split("::").last.downcase
|
26
|
-
@metadata.id = "palmerpenguins
|
28
|
+
@metadata.id = "palmerpenguins-#{species}"
|
27
29
|
@metadata.url = self.class::URL
|
28
|
-
@metadata.licenses = ["CC0"]
|
29
|
-
@data_path = cache_dir_path + "
|
30
|
+
@metadata.licenses = ["CC0-1.0"]
|
31
|
+
@data_path = cache_dir_path + "#{species}.csv"
|
30
32
|
end
|
31
33
|
|
32
34
|
attr_reader :data_path
|
@@ -44,15 +46,11 @@ module Datasets
|
|
44
46
|
end
|
45
47
|
|
46
48
|
private def open_data
|
47
|
-
download
|
49
|
+
download(data_path, metadata.url)
|
48
50
|
CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
|
49
51
|
yield csv
|
50
52
|
end
|
51
53
|
end
|
52
|
-
|
53
|
-
private def download
|
54
|
-
super(data_path, metadata.url)
|
55
|
-
end
|
56
54
|
end
|
57
55
|
|
58
56
|
# Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
|
@@ -36,10 +36,8 @@ module Datasets
|
|
36
36
|
|
37
37
|
base_name = "ptb.#{@type}.txt"
|
38
38
|
data_path = cache_dir_path + base_name
|
39
|
-
|
40
|
-
|
41
|
-
download(data_path, "#{base_url}/#{base_name}")
|
42
|
-
end
|
39
|
+
base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
|
40
|
+
download(data_path, "#{base_url}/#{base_name}")
|
43
41
|
|
44
42
|
parse_data(data_path, &block)
|
45
43
|
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
module Datasets
  # List of the Pre-Modern Japanese Text (PMJT) open dataset published
  # by CODH / National Institute of Japanese Literature.
  class PMJTDatasetList < Dataset
    # One title from the list CSV; members mirror the CSV columns.
    Record = Struct.new(:unit,
                        :open_data_category,
                        :tag,
                        :release_time,
                        :n_volumes,
                        :type,
                        :publication_year,
                        :original_request_code,
                        :id,
                        :title,
                        :text,
                        :bibliographical_introduction,
                        :year)

    def initialize
      super()
      @metadata.id = "pmjt-dataset-list"
      @metadata.name = "List of pre-modern Japanese text dataset"
      @metadata.url = "http://codh.rois.ac.jp/pmjt/"
      @metadata.licenses = ["CC-BY-SA-4.0"]
      @metadata.description = <<~DESCRIPTION
        Pre-Modern Japanese Text, owned by National Institute of Japanese Literature, is released image and text data as open data.
        In addition, some text has description, transcription, and tagging data.
      DESCRIPTION

      @data_path = cache_dir_path + (@metadata.id + ".csv")
    end

    # Yields one Record per title in the published list CSV.
    def each(&block)
      return to_enum(__method__) unless block_given?

      latest_version = "201901"
      list_url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{latest_version}.csv"
      download(@data_path, list_url)
      # The upstream CSV is Windows-31J encoded; transcode on read.
      CSV.open(@data_path, headers: :first_row, encoding: "Windows-31J:UTF-8") do |csv|
        csv.each do |row|
          yield(create_record(row))
        end
      end
    end

    private
    # Maps the Japanese CSV headers onto Record members positionally,
    # in Record member order.
    def create_record(csv_row)
      Record.new(csv_row["(単位)"],
                 csv_row["オープンデータ分類"],
                 csv_row["タグ"],
                 csv_row["公開時期"],
                 csv_row["冊数等"],
                 csv_row["刊・写"],
                 csv_row["刊年・書写年"],
                 csv_row["原本請求記号"],
                 csv_row["国文研書誌ID"],
                 csv_row["書名(統一書名)"],
                 csv_row["本文"],
                 csv_row["解題"],
                 csv_row["(西暦)"])
    end
  end
end
|
@@ -49,9 +49,7 @@ module Datasets
|
|
49
49
|
@metadata.id = "postal-code-japan-#{@reading}"
|
50
50
|
@metadata.name = "Postal code in Japan (#{@reading})"
|
51
51
|
@metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
|
52
|
-
@metadata.licenses = [
|
53
|
-
"CC0-1.0",
|
54
|
-
]
|
52
|
+
@metadata.licenses = ["CC0-1.0"]
|
55
53
|
@metadata.description = "Postal code in Japan (reading: #{@reading})"
|
56
54
|
end
|
57
55
|
|
@@ -116,9 +114,7 @@ module Datasets
|
|
116
114
|
data_url << "/roman/ken_all_rome.zip"
|
117
115
|
end
|
118
116
|
data_path = cache_dir_path + "#{@reading}-ken-all.zip"
|
119
|
-
|
120
|
-
download(data_path, data_url)
|
121
|
-
end
|
117
|
+
download(data_path, data_url)
|
122
118
|
|
123
119
|
Zip::File.open(data_path.to_s) do |zip_file|
|
124
120
|
zip_file.each do |entry|
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
  # Quora's duplicate question pair dataset: pairs of questions labeled
  # with whether they are duplicates of each other.
  class QuoraDuplicateQuestionPair < Dataset
    # One question pair; +duplicated+ is exposed as a boolean.
    class Record < Struct.new(:id,
                              :first_question_id,
                              :second_question_id,
                              :first_question,
                              :second_question,
                              :duplicated)
      alias_method :duplicated?, :duplicated
    end

    def initialize
      super()
      @metadata.id = "quora-duplicate-question-pair"
      @metadata.name = "Quora's duplicated question pair dataset"
      @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs"
      @metadata.licenses = [
        {
          name: "Quora's Terms of Service",
          url: "https://www.quora.com/about/tos",
        }
      ]
    end

    # Yields one Record per question pair in the TSV.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          # The raw column holds 0/1; convert it to a boolean before
          # building the Record.
          row["is_duplicate"] = (row["is_duplicate"] == 1)
          yield(Record.new(*row.fields))
        end
      end
    end

    private
    # Downloads the TSV into the cache and yields an open CSV reader.
    def open_data
      tsv_path = cache_dir_path + "quora_duplicate_questions.tsv"
      download(tsv_path, "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv")
      CSV.open(tsv_path, col_sep: "\t", headers: true, converters: :all) do |csv|
        yield(csv)
      end
    end
  end
end
|