red-datasets 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -3
- data/Rakefile +56 -1
- data/doc/text/news.md +102 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +58 -23
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +110 -30
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +6 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +16 -8
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +48 -0
- data/lib/datasets.rb +2 -22
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +65 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- data/test/test-wikipedia.rb +25 -71
- metadata +62 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,90 @@
|
|
1
|
+
require_relative "version"
|
2
|
+
|
3
|
+
module Datasets
  # Registry that maps a dataset constant name to the feature (the path
  # given to +require+) that defines it, so that a dataset file is loaded
  # only when its constant is first referenced.
  class LazyLoader
    def initialize
      # constant name (Symbol) => feature (String passed to +require+)
      @constants = {}
    end

    # Returns true when +constant_name+ has been registered.
    def exist?(constant_name)
      @constants.key?(constant_name)
    end

    # Requires the feature registered for +constant_name+.
    #
    # Raises LoadError when +constant_name+ was never registered.
    def load(constant_name)
      feature = @constants[constant_name]
      raise LoadError, "unknown dataset: #{constant_name}" unless feature
      require feature
    end

    # Requires every registered feature.
    def load_all
      @constants.each_value do |feature|
        require feature
      end
    end

    # Associates +constant_name+ with +feature+. Registering the same name
    # again replaces the previous feature.
    def register(constant_name, feature)
      @constants[constant_name] = feature
    end

    # Returns the registered constant names in registration order.
    def constant_names
      @constants.keys
    end
  end

  LAZY_LOADER = LazyLoader.new

  class << self
    # Loads the feature registered for +name+ on first reference and
    # returns the constant it defines. Unregistered names fall through to
    # the default behavior (NameError).
    def const_missing(name)
      return super unless LAZY_LOADER.exist?(name)

      LAZY_LOADER.load(name)
      # If the feature did not define the constant, calling const_get here
      # would re-enter const_missing forever (SystemStackError). Report the
      # usual NameError instead.
      return super unless const_defined?(name)

      const_get(name)
    end
  end

  LAZY_LOADER.register(:Adult, "datasets/adult")
  LAZY_LOADER.register(:AFINN, "datasets/afinn")
  LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
  LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
  LAZY_LOADER.register(:CIFAR, "datasets/cifar")
  LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
  LAZY_LOADER.register(:Communities, "datasets/communities")
  LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
  LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
  LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
  LAZY_LOADER.register(:Iris, "datasets/iris")
  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
  LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
  LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
  LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
  LAZY_LOADER.register(:MNIST, "datasets/mnist")
  LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
  LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
                       "datasets/nagoya-university-conversation-corpus")
  LAZY_LOADER.register(:Penguins, "datasets/penguins")
  LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
  LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
  LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
  LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
                       "datasets/quora-duplicate-question-pair")
  LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
  # For backward compatibility
  LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
  LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
  # For backward compatibility
  LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
  LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
  LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
  LAZY_LOADER.register(:SudachiSynonymDictionary,
                       "datasets/sudachi-synonym-dictionary")
  LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
  LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
                       "datasets/wikipedia-kyoto-japanese-english")
  LAZY_LOADER.register(:Wine, "datasets/wine")
end
|
@@ -28,6 +28,7 @@ module Datasets
|
|
28
28
|
@metadata.id = "libsvm-dataset-list"
|
29
29
|
@metadata.name = "LIBSVM dataset list"
|
30
30
|
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
31
|
+
@metadata.licenses = ["BSD-3-Clause"]
|
31
32
|
@metadata.description = lambda do
|
32
33
|
extract_description
|
33
34
|
end
|
@@ -51,10 +52,8 @@ module Datasets
|
|
51
52
|
private
|
52
53
|
def open_data
|
53
54
|
data_path = cache_dir_path + "index.html"
|
54
|
-
|
55
|
-
|
56
|
-
end
|
57
|
-
::File.open(data_path) do |input|
|
55
|
+
download(data_path, @metadata.url)
|
56
|
+
data_path.open do |input|
|
58
57
|
yield(input)
|
59
58
|
end
|
60
59
|
end
|
@@ -78,10 +77,8 @@ module Datasets
|
|
78
77
|
|
79
78
|
def open_detail(detail)
|
80
79
|
data_path = cache_dir_path + detail
|
81
|
-
|
82
|
-
|
83
|
-
end
|
84
|
-
::File.open(data_path) do |input|
|
80
|
+
download(data_path, @metadata.url + detail)
|
81
|
+
data_path.open do |input|
|
85
82
|
yield(input)
|
86
83
|
end
|
87
84
|
end
|
data/lib/datasets/libsvm.rb
CHANGED
@@ -41,6 +41,7 @@ module Datasets
|
|
41
41
|
@metadata.id = "libsvm-#{normalize_name(name)}"
|
42
42
|
@metadata.name = "LIBSVM dataset: #{name}"
|
43
43
|
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
44
|
+
@metadata.licenses = ["BSD-3-Clause"]
|
44
45
|
end
|
45
46
|
|
46
47
|
def each
|
@@ -99,13 +100,11 @@ module Datasets
|
|
99
100
|
|
100
101
|
def open_data(&block)
|
101
102
|
data_path = cache_dir_path + @file.name
|
102
|
-
|
103
|
-
download(data_path, @file.url)
|
104
|
-
end
|
103
|
+
download(data_path, @file.url)
|
105
104
|
if data_path.extname == ".bz2"
|
106
105
|
extract_bz2(data_path, &block)
|
107
106
|
else
|
108
|
-
|
107
|
+
data_path.open(&block)
|
109
108
|
end
|
110
109
|
end
|
111
110
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Datasets
  # License information attached to a dataset's metadata.
  #
  # Members:
  # * +spdx_id+
  # * +name+
  # * +url+
  class License < Struct.new(:spdx_id,
                             :name,
                             :url)
    class << self
      # Coerces +value+ into a License.
      #
      # * A License is returned as is.
      # * A String is used as the +spdx_id+.
      # * A Hash supplies +spdx_id+, +name+ and +url+. Keys may be Symbols
      #   or Strings; a Symbol key wins when both are present.
      #
      # Returns nil for any other kind of value.
      def try_convert(value)
        case value
        when self
          value
        when String
          new(value)
        when Hash
          license = new
          members.each do |member|
            # Accept String keys too so that a Hash such as
            # {"spdx_id" => "MIT"} is not silently turned into an empty
            # License.
            license[member] =
              value.key?(member) ? value[member] : value[member.to_s]
          end
          license
        end
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "tar-gz-readable"
|
3
|
+
|
4
|
+
# Time.iso8601 is provided by the "time" standard library, not by core Time.
require "time"

module Datasets
  # livedoor news corpus, read from a single tar.gz archive.
  #
  # Each record is one article file of the selected category.
  class LivedoorNews < Dataset
    include TarGzReadable

    # One article: the first line of the file (+url+), the second line
    # parsed as an ISO 8601 time (+timestamp+) and the remaining text
    # (+sentence+).
    Record = Struct.new(:url,
                        :timestamp,
                        :sentence)

    # Values accepted by the +type:+ keyword of #initialize. Each one names
    # a directory under "text/" in the archive, with "_" replaced by "-".
    NEWS_TYPES = [
      :topic_news,
      :sports_watch,
      :it_life_hack,
      :kaden_channel,
      :movie_enter,
      :dokujo_tsushin,
      :smax,
      :livedoor_homme,
      :peachy
    ].freeze

    # type:: The news category to read. One of NEWS_TYPES.
    #
    # Raises ArgumentError when +type+ is not a known category.
    def initialize(type: :topic_news)
      unless NEWS_TYPES.include?(type)
        valid_type_labels = NEWS_TYPES.collect(&:inspect).join(", ")
        message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}"
        raise ArgumentError, message
      end

      super()
      @type = type
      @metadata.id = 'livedoor-news'
      @metadata.name = 'livedoor-news'
      @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
      @metadata.licenses = ['CC-BY-ND-2.1-JP']
      # Assigned as a lambda so the archive is only touched when the
      # description is actually requested.
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields a Record for every article of the selected category.
    # Returns an Enumerator when no block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      data_path = download_tar_gz
      parse_data(data_path, &block)
    end

    private
    # Downloads the archive into the cache directory and returns its path.
    def download_tar_gz
      data_path = cache_dir_path + "livedoor-news.tar.gz"
      data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
      download(data_path, data_url)
      data_path
    end

    # Returns the content of "text/README.txt" in the archive, tagged as
    # UTF-8.
    def fetch_readme
      data_path = download_tar_gz
      target_file_name = 'text/README.txt'
      open_tar_gz(data_path) do |tar|
        tar.seek(target_file_name) do |entry|
          return entry.read.force_encoding("UTF-8")
        end
      end
    end

    # Yields a Record for each regular file directly inside the directory
    # of the selected category, skipping that directory's LICENSE.txt.
    def parse_data(data_path, &block)
      target_directory_name = "text/#{@type.to_s.tr('_', '-')}"
      open_tar_gz(data_path) do |tar|
        tar.each do |entry|
          next unless entry.file?
          directory_name, base_name = File.split(entry.full_name)
          next unless directory_name == target_directory_name
          next if base_name == "LICENSE.txt"
          # Line 1 is the URL, line 2 the timestamp, the rest is the body.
          url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3)
          record = Record.new(url, Time.iso8601(timestamp), sentence)
          yield(record)
        end
      end
    end
  end
end
|
data/lib/datasets/metadata.rb
CHANGED
@@ -1,9 +1,23 @@
|
|
1
|
+
require_relative "license"
|
2
|
+
|
1
3
|
module Datasets
|
2
4
|
class Metadata < Struct.new(:id,
|
3
5
|
:name,
|
4
6
|
:url,
|
5
7
|
:licenses,
|
6
8
|
:description)
|
9
|
+
# Stores +licenses+ after converting every element with
# License.try_convert. A single non-Array value is treated as a
# one-element list.
#
# Raises ArgumentError when an element cannot be converted.
def licenses=(licenses)
  candidates = licenses.is_a?(Array) ? licenses : [licenses]
  converted = candidates.collect do |candidate|
    license = License.try_convert(candidate)
    raise ArgumentError, "invalid license: #{candidate.inspect}" if license.nil?
    license
  end
  super(converted)
end
|
20
|
+
|
7
21
|
def description
|
8
22
|
description_raw = super
|
9
23
|
if description_raw.respond_to?(:call)
|
data/lib/datasets/mnist.rb
CHANGED
@@ -28,6 +28,7 @@ module Datasets
|
|
28
28
|
@metadata.id = "#{dataset_name.downcase}-#{type}"
|
29
29
|
@metadata.name = "#{dataset_name}: #{type}"
|
30
30
|
@metadata.url = self.class::BASE_URL
|
31
|
+
@metadata.licenses = licenses
|
31
32
|
@type = type
|
32
33
|
|
33
34
|
case type
|
@@ -45,18 +46,17 @@ module Datasets
|
|
45
46
|
label_path = cache_dir_path + target_file(:label)
|
46
47
|
base_url = self.class::BASE_URL
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
unless label_path.exist?
|
53
|
-
download(label_path, base_url + target_file(:label))
|
54
|
-
end
|
49
|
+
download(image_path, base_url + target_file(:image))
|
50
|
+
download(label_path, base_url + target_file(:label))
|
55
51
|
|
56
52
|
open_data(image_path, label_path, &block)
|
57
53
|
end
|
58
54
|
|
59
55
|
private
|
56
|
+
def licenses
|
57
|
+
[]
|
58
|
+
end
|
59
|
+
|
60
60
|
def open_data(image_path, label_path, &block)
|
61
61
|
labels = parse_labels(label_path)
|
62
62
|
|
data/lib/datasets/mushroom.rb
CHANGED
@@ -35,6 +35,7 @@ module Datasets
|
|
35
35
|
@metadata.id = "mushroom"
|
36
36
|
@metadata.name = "Mushroom"
|
37
37
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
|
38
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
38
39
|
@metadata.description = lambda do
|
39
40
|
read_names
|
40
41
|
end
|
@@ -58,10 +59,8 @@ module Datasets
|
|
58
59
|
private
|
59
60
|
def open_data
|
60
61
|
data_path = cache_dir_path + "agaricus-lepiota.data"
|
61
|
-
|
62
|
-
|
63
|
-
download(data_path, data_url)
|
64
|
-
end
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
|
63
|
+
download(data_path, data_url)
|
65
64
|
CSV.open(data_path) do |csv|
|
66
65
|
yield(csv)
|
67
66
|
end
|
@@ -69,10 +68,8 @@ module Datasets
|
|
69
68
|
|
70
69
|
def read_names
|
71
70
|
names_path = cache_dir_path + "agaricus-lepiota.names"
|
72
|
-
|
73
|
-
|
74
|
-
download(names_path, names_url)
|
75
|
-
end
|
71
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
|
72
|
+
download(names_path, names_url)
|
76
73
|
names_path.read
|
77
74
|
end
|
78
75
|
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
  # Nagoya University Conversation Corpus, read from a zip archive.
  #
  # #each yields one Data per file extracted from the archive.
  class NagoyaUniversityConversationCorpus < Dataset
    # One conversation file.
    Data = Struct.new(
      :name,
      :date,
      :place,
      :participants,
      :relationships,
      :note,
      :sentences
    )

    # One "@参加者" header line.
    Participant = Struct.new(
      :id,
      :attribute,
      :birthplace,
      :residence
    )

    # One utterance line. The "@END" line is represented by a Sentence
    # whose members are both nil.
    Sentence = Struct.new(:participant_id, :content) do
      # Returns true when this is the marker created for the "@END" line.
      def end?
        participant_id.nil? && content.nil?
      end
    end

    def initialize
      super()
      # NOTE(review): "curpus" is misspelled here as well. The id is left
      # untouched because how it is used is not visible from this file;
      # confirm before renaming.
      @metadata.id = 'nagoya-university-conversation-curpus'
      @metadata.name = 'Nagoya University Conversation Corpus'
      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
      @metadata.licenses = ['CC-BY-NC-ND-4.0']
      @metadata.description = <<~DESCRIPTION
        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
        total about 100 hours of chatting among native speakers of Japanese,
        which is converted into text.
      DESCRIPTION
    end

    # Yields a Data for every file extracted from the archive.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |input_stream|
        yield(parse_file(input_stream))
      end
    end

    private

    # Downloads the archive into the cache directory and yields each
    # extracted input stream.
    def open_data
      data_path = cache_dir_path + 'nucc.zip'
      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
      download(data_path, data_url)

      extractor = ZipExtractor.new(data_path)
      extractor.extract_files do |input_stream|
        yield(input_stream)
      end
    end

    # Builds a Data from one file. Header lines fill the Data members;
    # every other line becomes a Sentence.
    def parse_file(input_stream)
      data = Data.new
      participants = []
      sentences = []

      # NOTE(review): presumably +input_stream.each+ yields String chunks
      # (lines), which are then split again with each_line - confirm
      # against ZipExtractor#extract_files.
      input_stream.each do |input|
        input.each_line(chomp: true) do |line|
          line.force_encoding('utf-8')
          if line.start_with?('@データ')
            data.name = line[4..]
          elsif line.start_with?('@収集年月日')
            # mixed cases with and without':'
            data.date = line[6..].delete_prefix(':')
          elsif line.start_with?('@場所')
            data.place = line[4..]
          elsif line.start_with?('@参加者の関係')
            # Must be tested before '@参加者', which is a prefix of it.
            data.relationships = line.split(':', 2)[1]
          elsif line.start_with?('@参加者')
            participant = Participant.new
            participant.id, profiles = line[4..].split(':', 2)
            # profiles is nil when the line has no ':'; leave the profile
            # members nil in that case instead of raising NoMethodError.
            participant.attribute, participant.birthplace, participant.residence =
              profiles&.split('、', 3)

            participants << participant
          elsif line.start_with?('%com')
            data.note = line.split(':', 2)[1]
          elsif line == '@END'
            # End marker: both members stay nil (see Sentence#end?).
            sentences << Sentence.new(nil, nil)
          else
            sentence = Sentence.new
            sentence.participant_id, sentence.content = line.split(':', 2)

            sentences << sentence
          end
        end
      end

      data.participants = participants
      data.sentences = sentences

      data
    end
  end
end
|
data/lib/datasets/penguins.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
1
3
|
require_relative "dataset"
|
2
4
|
|
3
5
|
module Datasets
|
@@ -23,10 +25,10 @@ module Datasets
|
|
23
25
|
def initialize
|
24
26
|
super
|
25
27
|
species = self.class.name.split("::").last.downcase
|
26
|
-
@metadata.id = "palmerpenguins
|
28
|
+
@metadata.id = "palmerpenguins-#{species}"
|
27
29
|
@metadata.url = self.class::URL
|
28
|
-
@metadata.licenses = ["CC0"]
|
29
|
-
@data_path = cache_dir_path + "
|
30
|
+
@metadata.licenses = ["CC0-1.0"]
|
31
|
+
@data_path = cache_dir_path + "#{species}.csv"
|
30
32
|
end
|
31
33
|
|
32
34
|
attr_reader :data_path
|
@@ -44,15 +46,11 @@ module Datasets
|
|
44
46
|
end
|
45
47
|
|
46
48
|
private def open_data
|
47
|
-
download
|
49
|
+
download(data_path, metadata.url)
|
48
50
|
CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
|
49
51
|
yield csv
|
50
52
|
end
|
51
53
|
end
|
52
|
-
|
53
|
-
private def download
|
54
|
-
super(data_path, metadata.url)
|
55
|
-
end
|
56
54
|
end
|
57
55
|
|
58
56
|
# Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
|
@@ -36,10 +36,8 @@ module Datasets
|
|
36
36
|
|
37
37
|
base_name = "ptb.#{@type}.txt"
|
38
38
|
data_path = cache_dir_path + base_name
|
39
|
-
|
40
|
-
|
41
|
-
download(data_path, "#{base_url}/#{base_name}")
|
42
|
-
end
|
39
|
+
base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
|
40
|
+
download(data_path, "#{base_url}/#{base_name}")
|
43
41
|
|
44
42
|
parse_data(data_path, &block)
|
45
43
|
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
# CSV is used below; require it here instead of relying on another file
# having loaded it first.
require "csv"

module Datasets
  # List of pre-modern Japanese text datasets, read from a CSV file.
  class PMJTDatasetList < Dataset
    # One row of the list. Each member is filled from the CSV column named
    # in #create_record.
    Record = Struct.new(:unit,
                        :open_data_category,
                        :tag,
                        :release_time,
                        :n_volumes,
                        :type,
                        :publication_year,
                        :original_request_code,
                        :id,
                        :title,
                        :text,
                        :bibliographical_introduction,
                        :year)

    def initialize
      super()
      @metadata.id = "pmjt-dataset-list"
      @metadata.name = "List of pre-modern Japanese text dataset"
      @metadata.url = "http://codh.rois.ac.jp/pmjt/"
      @metadata.licenses = ["CC-BY-SA-4.0"]
      @metadata.description = <<~DESCRIPTION
        Pre-Modern Japanese Text, owned by National Institute of Japanese Literature, is released image and text data as open data.
        In addition, some text has description, transcription, and tagging data.
      DESCRIPTION

      @data_path = cache_dir_path + (@metadata.id + ".csv")
    end

    # Yields a Record for every row of the list.
    # Returns an Enumerator when no block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      latest_version = "201901"
      url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{latest_version}.csv"
      download(@data_path, url)
      # The file is read as Windows-31J and transcoded to UTF-8.
      CSV.open(@data_path, headers: :first_row, encoding: "Windows-31J:UTF-8") do |csv|
        csv.each do |row|
          record = create_record(row)
          yield record
        end
      end
    end

    private
    # Maps the Japanese column headers of +csv_row+ to Record members.
    def create_record(csv_row)
      record = Record.new
      record.unit = csv_row["(単位)"]
      record.open_data_category = csv_row["オープンデータ分類"]
      record.tag = csv_row["タグ"]
      record.release_time = csv_row["公開時期"]
      record.n_volumes = csv_row["冊数等"]
      record.type = csv_row["刊・写"]
      record.publication_year = csv_row["刊年・書写年"]
      record.original_request_code = csv_row["原本請求記号"]
      record.id = csv_row["国文研書誌ID"]
      record.title = csv_row["書名(統一書名)"]
      record.text = csv_row["本文"]
      record.bibliographical_introduction = csv_row["解題"]
      record.year = csv_row["(西暦)"]

      record
    end
  end
end
|
@@ -49,9 +49,7 @@ module Datasets
|
|
49
49
|
@metadata.id = "postal-code-japan-#{@reading}"
|
50
50
|
@metadata.name = "Postal code in Japan (#{@reading})"
|
51
51
|
@metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
|
52
|
-
@metadata.licenses = [
|
53
|
-
"CC0-1.0",
|
54
|
-
]
|
52
|
+
@metadata.licenses = ["CC0-1.0"]
|
55
53
|
@metadata.description = "Postal code in Japan (reading: #{@reading})"
|
56
54
|
end
|
57
55
|
|
@@ -116,9 +114,7 @@ module Datasets
|
|
116
114
|
data_url << "/roman/ken_all_rome.zip"
|
117
115
|
end
|
118
116
|
data_path = cache_dir_path + "#{@reading}-ken-all.zip"
|
119
|
-
|
120
|
-
download(data_path, data_url)
|
121
|
-
end
|
117
|
+
download(data_path, data_url)
|
122
118
|
|
123
119
|
Zip::File.open(data_path.to_s) do |zip_file|
|
124
120
|
zip_file.each do |entry|
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
  # Quora's question pair dataset, read from a tab-separated file.
  class QuoraDuplicateQuestionPair < Dataset
    # One question pair. +duplicated+ (also available as +duplicated?+)
    # is a boolean.
    class Record < Struct.new(:id,
                              :first_question_id,
                              :second_question_id,
                              :first_question,
                              :second_question,
                              :duplicated)
      alias_method :duplicated?, :duplicated
    end

    def initialize
      super()
      terms_of_service = {
        name: "Quora's Terms of Service",
        url: "https://www.quora.com/about/tos",
      }
      @metadata.id = "quora-duplicate-question-pair"
      @metadata.name = "Quora's duplicated question pair dataset"
      @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs"
      @metadata.licenses = [terms_of_service]
    end

    # Yields a Record for every row of the file.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each { |row| yield(build_record(row)) }
      end
    end

    private
    # Replaces the "is_duplicate" field of +row+ with a boolean (true when
    # the converted value equals 1) and builds a Record from the fields.
    def build_record(row)
      row["is_duplicate"] = (row["is_duplicate"] == 1)
      Record.new(*row.fields)
    end

    # Downloads the file into the cache directory and yields a CSV reader
    # configured for tab-separated input with headers and all converters.
    def open_data
      tsv_path = cache_dir_path + "quora_duplicate_questions.tsv"
      tsv_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
      download(tsv_path, tsv_url)
      CSV.open(tsv_path, col_sep: "\t", headers: true, converters: :all) { |csv| yield(csv) }
    end
  end
end
|