red-datasets 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +86 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
data/lib/datasets/pmjt-dataset-list.rb
ADDED
@@ -0,0 +1,67 @@
+require_relative "dataset"
+
+module Datasets
+  class PMJTDatasetList < Dataset
+    Record = Struct.new(:unit,
+                        :open_data_category,
+                        :tag,
+                        :release_time,
+                        :n_volumes,
+                        :type,
+                        :publication_year,
+                        :original_request_code,
+                        :id,
+                        :title,
+                        :text,
+                        :bibliographical_introduction,
+                        :year)
+
+    def initialize
+      super()
+      @metadata.id = "pmjt-dataset-list"
+      @metadata.name = "List of pre-modern Japanese text dataset"
+      @metadata.url = "http://codh.rois.ac.jp/pmjt/"
+      @metadata.licenses = ["CC-BY-SA-4.0"]
+      @metadata.description = <<~DESCRIPTION
+        Pre-Modern Japanese Text, owned by National Institute of Japanese Literature, is released image and text data as open data.
+        In addition, some text has description, transcription, and tagging data.
+      DESCRIPTION
+
+      @data_path = cache_dir_path + (@metadata.id + ".csv")
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      latest_version = "201901"
+      url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{latest_version}.csv"
+      download(@data_path, url)
+      CSV.open(@data_path, headers: :first_row, encoding: "Windows-31J:UTF-8") do |csv|
+        csv.each do |row|
+          record = create_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def create_record(csv_row)
+      record = Record.new
+      record.unit = csv_row["(単位)"]
+      record.open_data_category = csv_row["オープンデータ分類"]
+      record.tag = csv_row["タグ"]
+      record.release_time = csv_row["公開時期"]
+      record.n_volumes = csv_row["冊数等"]
+      record.type = csv_row["刊・写"]
+      record.publication_year = csv_row["刊年・書写年"]
+      record.original_request_code = csv_row["原本請求記号"]
+      record.id = csv_row["国文研書誌ID"]
+      record.title = csv_row["書名(統一書名)"]
+      record.text = csv_row["本文"]
+      record.bibliographical_introduction = csv_row["解題"]
+      record.year = csv_row["(西暦)"]
+
+      record
+    end
+  end
+end
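
A minimal usage sketch for the new dataset (not part of the diff; it assumes the gem is installed and loaded via `require "datasets"`, and uses the Record fields defined above):

require "datasets"

# Each record is the Record Struct from the diff above; the catalog CSV
# is downloaded into the cache directory on first use.
Datasets::PMJTDatasetList.new.each do |record|
  puts "#{record.id}: #{record.title} (#{record.n_volumes})"
end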
data/lib/datasets/postal-code-japan.rb
CHANGED
@@ -49,9 +49,7 @@ module Datasets
       @metadata.id = "postal-code-japan-#{@reading}"
       @metadata.name = "Postal code in Japan (#{@reading})"
       @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
-      @metadata.licenses = [
-        "CC0-1.0",
-      ]
+      @metadata.licenses = ["CC0-1.0"]
       @metadata.description = "Postal code in Japan (reading: #{@reading})"
     end
 
@@ -116,9 +114,7 @@ module Datasets
         data_url << "/roman/ken_all_rome.zip"
       end
       data_path = cache_dir_path + "#{@reading}-ken-all.zip"
-      unless data_path.exist?
-        download(data_path, data_url)
-      end
+      download(data_path, data_url)
 
       Zip::File.open(data_path.to_s) do |zip_file|
         zip_file.each do |entry|
data/lib/datasets/quora-duplicate-question-pair.rb
ADDED
@@ -0,0 +1,51 @@
+require "csv"
+
+require_relative "dataset"
+
+module Datasets
+  class QuoraDuplicateQuestionPair < Dataset
+    class Record < Struct.new(:id,
+                              :first_question_id,
+                              :second_question_id,
+                              :first_question,
+                              :second_question,
+                              :duplicated)
+      alias_method :duplicated?, :duplicated
+    end
+
+    def initialize
+      super()
+      @metadata.id = "quora-duplicate-question-pair"
+      @metadata.name = "Quora's duplicated question pair dataset"
+      @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs"
+      @metadata.licenses = [
+        {
+          name: "Quora's Terms of Service",
+          url: "https://www.quora.com/about/tos",
+        }
+      ]
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |csv|
+        csv.each do |row|
+          row["is_duplicate"] = (row["is_duplicate"] == 1)
+          record = Record.new(*row.fields)
+          yield(record)
+        end
+      end
+    end
+
+    private
+    def open_data
+      data_path = cache_dir_path + "quora_duplicate_questions.tsv"
+      data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
+      download(data_path, data_url)
+      CSV.open(data_path, col_sep: "\t", headers: true, converters: :all) do |csv|
+        yield(csv)
+      end
+    end
+  end
+end
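
A minimal usage sketch (not part of the diff; assumes `require "datasets"`). `duplicated?` is the alias defined on the Record class above:

require "datasets"

Datasets::QuoraDuplicateQuestionPair.new.each do |record|
  next unless record.duplicated?
  puts "#{record.first_question} | #{record.second_question}"
end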
data/lib/datasets/{rdatasets.rb → rdataset.rb}
RENAMED
@@ -2,7 +2,7 @@ require_relative "dataset"
 require_relative "tar-gz-readable"
 
 module Datasets
-  class RdatasetsList < Dataset
+  class RdatasetList < Dataset
     Record = Struct.new(:package,
                         :dataset,
                         :title,
@@ -18,8 +18,8 @@ module Datasets
 
     def initialize
       super
-      @metadata.id = "rdatasets-list"
-      @metadata.name = "Rdatasets"
+      @metadata.id = "rdataset-list"
+      @metadata.name = "Rdataset"
       @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
       @metadata.licenses = ["GPL-3"]
       @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
@@ -48,16 +48,19 @@ module Datasets
     end
 
     private def each_row(&block)
-      download(@data_path, @data_url)
+      download(@data_path, @data_url)
       CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
         csv.each(&block)
       end
     end
   end
 
-  class Rdatasets < Dataset
+  # For backward compatibility
+  RdatasetsList = RdatasetList
+
+  class Rdataset < Dataset
     def initialize(package_name, dataset_name)
-      list = RdatasetsList.new
+      list = RdatasetList.new
 
       info = list.filter(package: package_name, dataset: dataset_name).first
       unless info
@@ -65,8 +68,8 @@ module Datasets
       end
 
       super()
-      @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
-      @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
+      @metadata.id = "rdataset-#{package_name}-#{dataset_name}"
+      @metadata.name = "Rdataset: #{package_name}: #{dataset_name}"
       @metadata.url = info.csv
       @metadata.licenses = ["GPL-3"]
       @metadata.description = info.title
@@ -81,15 +84,63 @@ module Datasets
     def each(&block)
       return to_enum(__method__) unless block_given?
 
-      download(@data_path, @metadata.url)
-
-      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
-        csv.each do |row|
-          record = row.to_h
-          record.delete("") # delete 1st column for indices.
-          yield record
+      download(@data_path, @metadata.url)
+
+      na_converter = lambda do |field|
+        begin
+          if field.encode(CSV::ConverterEncoding) == "NA"
+            nil
+          else
+            field
+          end
+        rescue
+          field
         end
       end
+
+      inf_converter = lambda do |field|
+        begin
+          if field.encode(CSV::ConverterEncoding) == "Inf"
+            Float::INFINITY
+          else
+            field
+          end
+        rescue
+          field
+        end
+      end
+
+      quote_preserving_converter = lambda do |field, info|
+        f = field.encode(CSV::ConverterEncoding)
+        return f if info.quoted?
+
+        begin
+          begin
+            begin
+              return DateTime.parse(f) if f.match?(DateTimeMatcher)
+            rescue
+              return Integer(f)
+            end
+          rescue
+            return Float(f)
+          end
+        rescue
+          field
+        end
+      end
+
+      table = CSV.table(@data_path,
+                        header_converters: [:symbol_raw],
+                        # quote_preserving_converter should be the last
+                        converters: [na_converter, inf_converter, quote_preserving_converter])
+      table.delete(:"") # delete 1st column for indices.
+
+      table.each do |row|
+        yield row.to_h
+      end
     end
   end
+
+  # For backward compatibility
+  Rdatasets = Rdataset
 end
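
The rename keeps the old plural constants working through the aliases above. A minimal sketch of both spellings (not part of the diff; assumes `require "datasets"`; the :Species key follows from the :symbol_raw header converter shown above):

require "datasets"

# New singular name; "NA" fields become nil and "Inf" becomes
# Float::INFINITY via the converters added in this release.
iris = Datasets::Rdataset.new("datasets", "iris")
iris.each do |record|
  p record[:Species]
  break
end

# Old plural names still resolve through the aliases:
Datasets::Rdatasets.new("datasets", "iris")
Datasets::RdatasetsList.new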
data/lib/datasets/seaborn.rb
ADDED
@@ -0,0 +1,90 @@
+require "json"
+
+module Datasets
+  class SeabornList < Dataset
+    def initialize
+      super
+      @metadata.id = "seaborn-data-list"
+      @metadata.name = "seaborn: data list"
+      @metadata.url = "https://github.com/mwaskom/seaborn-data"
+      # Treat as the same license as seaborn
+      @metadata.licenses = ["BSD-3-Clause"]
+      @metadata.description = "Datasets for seaborn examples."
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = cache_dir_path + "trees.json"
+      url = "https://api.github.com/repos/mwaskom/seaborn-data/git/trees/master"
+      download(data_path, url)
+
+      tree = JSON.parse(File.read(data_path))["tree"]
+      tree.each do |content|
+        path = content["path"]
+        next unless path.end_with?(".csv")
+        dataset = File.basename(path, ".csv")
+        record = {dataset: dataset}
+        yield record
+      end
+    end
+  end
+
+  class Seaborn < Dataset
+    URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
+
+    def initialize(name)
+      super()
+      @metadata.id = "seaborn-#{name}"
+      @metadata.name = "seaborn: #{name}"
+      @metadata.url = URL_FORMAT % {name: name}
+      # @metadata.licenses = TODO
+
+      @name = name
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = cache_dir_path + "#{@name}.csv"
+      download(data_path, @metadata.url)
+      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
+        csv.each do |row|
+          record = prepare_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def prepare_record(csv_row)
+      record = csv_row.to_h
+      record.transform_keys! do |key|
+        if key.nil?
+          :index
+        else
+          key.to_sym
+        end
+      end
+
+      # Perform the same preprocessing as seaborn's load_dataset function
+      preprocessor = :"preprocess_#{@name}_record"
+      __send__(preprocessor, record) if respond_to?(preprocessor, true)
+
+      record
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_flights_record(record)
+      record[:month] &&= record[:month][0,3]
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_penguins_record(record)
+      record[:sex] &&= record[:sex].capitalize
+    end
+  end
+
+  # For backward compatibility
+  SeabornData = Seaborn
+end
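
SeabornData is likewise aliased to the new Seaborn class. A minimal sketch (not part of the diff; assumes `require "datasets"`):

require "datasets"

# Enumerate the CSVs published in mwaskom/seaborn-data...
Datasets::SeabornList.new.each do |record|
  puts record[:dataset]
end

# ...then load one; flights records get a three-letter :month
# via preprocess_flights_record above.
Datasets::Seaborn.new("flights").each do |record|
  p record
  break
end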
data/lib/datasets/sudachi-synonym-dictionary.rb
CHANGED
@@ -21,9 +21,7 @@ module Datasets
       @metadata.id = "sudachi-synonym-dictionary"
       @metadata.name = "Sudachi synonym dictionary"
       @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
-      @metadata.licenses = [
-        "Apache-2.0",
-      ]
+      @metadata.licenses = ["Apache-2.0"]
       @metadata.description = lambda do
         download_description
       end
@@ -65,10 +63,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "synonyms.txt"
-      unless data_path.exist?
-        data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
-        download(data_path, data_url)
-      end
+      data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
+      download(data_path, data_url)
       CSV.open(data_path,
                encoding: "UTF-8",
                skip_blanks: true) do |csv|
@@ -78,10 +74,8 @@ module Datasets
 
     def download_description
       description_path = cache_dir_path + "synonyms.md"
-      unless description_path.exist?
-        description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
-        download(description_path, description_url)
-      end
+      description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
+      download(description_path, description_url)
       description_path.read
     end
 
data/lib/datasets/version.rb
CHANGED
@@ -1,3 +1,3 @@
 module Datasets
-  VERSION = "0.1.4"
+  VERSION = "0.1.5"
 end

data/lib/datasets/wikipedia-kyoto-japanese-english.rb
ADDED
@@ -0,0 +1,219 @@
+require "csv"
+require "rexml/streamlistener"
+require "rexml/parsers/baseparser"
+require "rexml/parsers/streamparser"
+require "time"
+
+require_relative "dataset"
+require_relative "tar-gz-readable"
+
+module Datasets
+  class WikipediaKyotoJapaneseEnglish < Dataset
+    include TarGzReadable
+
+    Article = Struct.new(:source,
+                         :copyright,
+                         :contents,
+                         :sections)
+
+    Section = Struct.new(:id,
+                         :title,
+                         :contents)
+
+    class Title < Struct.new(:section,
+                             :japanese,
+                             :english)
+      def title?
+        true
+      end
+
+      def sentence?
+        false
+      end
+    end
+
+    Paragraph = Struct.new(:id,
+                           :sentences)
+
+    class Sentence < Struct.new(:id,
+                                :section,
+                                :paragraph,
+                                :japanese,
+                                :english)
+      def title?
+        false
+      end
+
+      def sentence?
+        true
+      end
+    end
+
+    Entry = Struct.new(:japanese,
+                       :english)
+
+    def initialize(type: :article)
+      unless [:article, :lexicon].include?(type)
+        raise ArgumentError, "Please set type :article or :lexicon: #{type.inspect}"
+      end
+
+      super()
+      @type = type
+      @metadata.id = "wikipedia-kyoto-japanese-english"
+      @metadata.name =
+        "The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
+      @metadata.url = "https://alaginrc.nict.go.jp/WikiCorpus/index_E.html"
+      @metadata.licenses = ["CC-BY-SA-3.0"]
+      @metadata.description = <<-DESCRIPTION
+"The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
+aims mainly at supporting research and development relevant to
+high-performance multilingual machine translation, information
+extraction, and other language processing technologies. The National
+Institute of Information and Communications Technology (NICT) has
+created this corpus by manually translating Japanese Wikipedia
+articles (related to Kyoto) into English.
+      DESCRIPTION
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = download_tar_gz
+
+      open_tar_gz(data_path) do |tar|
+        tar.each do |entry|
+          next unless entry.file?
+          base_name = File.basename(entry.full_name)
+          case @type
+          when :article
+            next unless base_name.end_with?(".xml")
+            listener = ArticleListener.new(block)
+            parser = REXML::Parsers::StreamParser.new(entry.read, listener)
+            parser.parse
+          when :lexicon
+            next unless base_name == "kyoto_lexicon.csv"
+            is_header = true
+            CSV.parse(entry.read.force_encoding("UTF-8")) do |row|
+              if is_header
+                is_header = false
+                next
+              end
+              yield(Entry.new(row[0], row[1]))
+            end
+          end
+        end
+      end
+    end
+
+    private
+    def download_tar_gz
+      base_name = "wiki_corpus_2.01.tar.gz"
+      data_path = cache_dir_path + base_name
+      data_url = "https://alaginrc.nict.go.jp/WikiCorpus/src/#{base_name}"
+      download(data_path, data_url)
+      data_path
+    end
+
+    class ArticleListener
+      include REXML::StreamListener
+
+      def initialize(block)
+        @block = block
+        @article = nil
+        @title = nil
+        @section = nil
+        @page = nil
+        @sentence = nil
+        @text_container_stack = []
+        @element_stack = []
+        @text_stack = [""]
+      end
+
+      def tag_start(name, attributes)
+        push_stacks(name, attributes)
+        case name
+        when "art"
+          @article = Article.new
+          @article.contents = []
+          @article.sections = []
+        when "tit"
+          @title = Title.new
+          @title.section = @section
+          @text_container_stack.push(@title)
+        when "sec"
+          @section = Section.new
+          @section.id = attributes["id"]
+          @section.contents = []
+          @text_container_stack.push(@section)
+        when "par"
+          @paragraph = Paragraph.new
+          @paragraph.id = attributes["id"]
+          @paragraph.sentences = []
+          @text_container_stack.push(@paragraph)
+        when "sen"
+          @sentence = Sentence.new
+          @sentence.id = attributes["id"]
+          @text_container_stack.push(@sentence)
+        end
+      end
+
+      def tag_end(name)
+        case name
+        when "art"
+          @block.call(@article)
+          @article = nil
+        when "inf"
+          @article.source = @text_stack.last
+        when "copyright"
+          @article.copyright = @text_stack.last
+        when "tit"
+          @article.contents << @title
+          if @section
+            @section.title = @title
+            @section.contents << @title
+          end
+          @title = nil
+          @text_container_stack.pop
+        when "sec"
+          @article.sections << @section
+          @section = nil
+          @text_container_stack.pop
+        when "par"
+          @paragraph = nil
+          @text_container_stack.pop
+        when "sen"
+          @article.contents << @sentence
+          @sentence.section = @section
+          @section.contents << @sentence if @section
+          @sentence.paragraph = @paragraph
+          @paragraph.sentences << @sentence if @paragraph
+          @sentence = nil
+          @text_container_stack.pop
+        when "j"
+          @text_container_stack.last.japanese = @text_stack.last
+        when "e"
+          attributes = @element_stack.last[:attributes]
+          if attributes["type"] == "check"
+            @text_container_stack.last.english = @text_stack.last
+          end
+        end
+        pop_stacks
+      end
+
+      def text(data)
+        @text_stack.last << data
+      end
+
+      private
+      def push_stacks(name, attributes)
+        @element_stack.push({name: name, attributes: attributes})
+        @text_stack.push("")
+      end
+
+      def pop_stacks
        @text_stack.pop
+        @element_stack.pop
+      end
+    end
+  end
+end
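
A minimal sketch of the two modes the constructor accepts (not part of the diff; assumes `require "datasets"`):

require "datasets"

# type: :lexicon yields Entry structs from kyoto_lexicon.csv;
# the default type: :article yields Article structs parsed from the XML.
corpus = Datasets::WikipediaKyotoJapaneseEnglish.new(type: :lexicon)
corpus.each do |entry|
  puts "#{entry.japanese} => #{entry.english}"
  break
end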
data/lib/datasets/wikipedia.rb
CHANGED
@@ -1,6 +1,7 @@
 require "rexml/streamlistener"
 require "rexml/parsers/baseparser"
 require "rexml/parsers/streamparser"
+require "time"
 
 require_relative "dataset"
 
@@ -55,10 +56,8 @@ module Datasets
     def open_data(&block)
       base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
       data_path = cache_dir_path + base_name
-      unless data_path.exist?
-        data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
-        download(data_path, data_url)
-      end
+      data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
+      download(data_path, data_url)
 
       extract_bz2(data_path, &block)
     end
@@ -153,7 +152,7 @@ module Datasets
       @text_stack.last << data
     end
 
-    def cdata(data)
+    def cdata(content)
       @text_stack.last << content
     end
 
data/lib/datasets/wine.rb
CHANGED
@@ -23,7 +23,8 @@ module Datasets
       super
       @metadata.id = 'wine'
       @metadata.name = 'Wine'
-      @metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
+      @metadata.url = 'https://archive.ics.uci.edu/ml/datasets/wine'
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = -> { read_names }
     end
 
@@ -43,19 +44,15 @@ module Datasets
 
     def read_names
       names_path = cache_dir_path + 'wine.names'
-      unless names_path.exist?
-        names_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
-        download(names_path, names_url)
-      end
+      names_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
+      download(names_path, names_url)
       names_path.read
     end
 
     def open_data
       data_path = cache_dir_path + 'wine.data'
-      unless data_path.exist?
-        data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
-        download(data_path, data_url)
-      end
+      data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
+      download(data_path, data_url)
       CSV.open(data_path, converters: %i[numeric]) do |csv|
         yield(csv)
       end