red-datasets 0.1.4 → 0.1.6

Files changed (74)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,90 @@
+require_relative "version"
+
+module Datasets
+  class LazyLoader
+    def initialize
+      @constants = {}
+    end
+
+    def exist?(constant_name)
+      @constants.key?(constant_name)
+    end
+
+    def load(constant_name)
+      feature = @constants[constant_name]
+      raise LoadError, "unknown dataset: #{constant_name}" unless feature
+      require feature
+    end
+
+    def load_all
+      @constants.each_value do |feature|
+        require feature
+      end
+    end
+
+    def register(constant_name, feature)
+      @constants[constant_name] = feature
+    end
+
+    def constant_names
+      @constants.keys
+    end
+  end
+
+  LAZY_LOADER = LazyLoader.new
+
+  class << self
+    def const_missing(name)
+      if LAZY_LOADER.exist?(name)
+        LAZY_LOADER.load(name)
+        const_get(name)
+      else
+        super
+      end
+    end
+  end
+
+  LAZY_LOADER.register(:Adult, "datasets/adult")
+  LAZY_LOADER.register(:AFINN, "datasets/afinn")
+  LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
+  LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
+  LAZY_LOADER.register(:CIFAR, "datasets/cifar")
+  LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
+  LAZY_LOADER.register(:Communities, "datasets/communities")
+  LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
+  LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
+  LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
+  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
+  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
+  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+  LAZY_LOADER.register(:Iris, "datasets/iris")
+  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
+  LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
+  LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
+  LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
+  LAZY_LOADER.register(:MNIST, "datasets/mnist")
+  LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+  LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                       "datasets/nagoya-university-conversation-corpus")
+  LAZY_LOADER.register(:Penguins, "datasets/penguins")
+  LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
+  LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
+  LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
+  LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
+                       "datasets/quora-duplicate-question-pair")
+  LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
+  LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
+  LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
+  LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
+  LAZY_LOADER.register(:SudachiSynonymDictionary,
+                       "datasets/sudachi-synonym-dictionary")
+  LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
+  LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
+                       "datasets/wikipedia-kyoto-japanese-english")
+  LAZY_LOADER.register(:Wine, "datasets/wine")
+end
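The new lazy.rb replaces the eager require list that data/lib/datasets.rb used to carry (its hunk in the file list shrinks by 22 lines): a dataset file is only required the first time its constant is referenced, via const_missing. A minimal usage sketch, assuming the top-level "datasets" entry point requires "datasets/lazy":

    require "datasets"

    # No dataset implementation has been loaded yet. Referencing the
    # constant triggers Datasets.const_missing, which requires
    # "datasets/iris" and then resolves the constant normally.
    iris = Datasets::Iris.new

    # Unregistered names still fall through to the default behavior.
    begin
      Datasets::NoSuchDataset
    rescue NameError => error
      puts error.message
    end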
@@ -28,6 +28,7 @@ module Datasets
       @metadata.id = "libsvm-dataset-list"
       @metadata.name = "LIBSVM dataset list"
       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
+      @metadata.licenses = ["BSD-3-Clause"]
       @metadata.description = lambda do
         extract_description
       end
@@ -51,10 +52,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "index.html"
-      unless data_path.exist?
-        download(data_path, @metadata.url)
-      end
-      ::File.open(data_path) do |input|
+      download(data_path, @metadata.url)
+      data_path.open do |input|
        yield(input)
      end
    end
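This hunk and the similar ones below drop the `unless data_path.exist?` guards because the reworked downloader (data/lib/datasets/downloader.rb, +110 -30) is now expected to skip files that are already cached. A minimal sketch of that contract, with the real Downloader's retry and resume handling omitted (the helper below is illustrative, not the actual API):

    require "open-uri"
    require "pathname"

    # Hypothetical cache-aware helper: returns immediately on a cache hit,
    # so callers can invoke it unconditionally, as the refactored
    # open_data methods now do.
    def download(output_path, url)
      return if output_path.exist?   # cache hit: nothing to do
      output_path.parent.mkpath
      URI.parse(url).open do |remote|
        output_path.open("wb") do |local|
          IO.copy_stream(remote, local)
        end
      end
    end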
@@ -78,10 +77,8 @@ module Datasets

     def open_detail(detail)
       data_path = cache_dir_path + detail
-      unless data_path.exist?
-        download(data_path, @metadata.url + detail)
-      end
-      ::File.open(data_path) do |input|
+      download(data_path, @metadata.url + detail)
+      data_path.open do |input|
        yield(input)
      end
    end
@@ -41,6 +41,7 @@ module Datasets
       @metadata.id = "libsvm-#{normalize_name(name)}"
       @metadata.name = "LIBSVM dataset: #{name}"
       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
+      @metadata.licenses = ["BSD-3-Clause"]
     end

     def each
@@ -99,13 +100,11 @@ module Datasets

     def open_data(&block)
       data_path = cache_dir_path + @file.name
-      unless data_path.exist?
-        download(data_path, @file.url)
-      end
+      download(data_path, @file.url)
       if data_path.extname == ".bz2"
         extract_bz2(data_path, &block)
       else
-        File.open(data_path, &block)
+        data_path.open(&block)
       end
     end

@@ -0,0 +1,26 @@
+module Datasets
+  class License < Struct.new(:spdx_id,
+                             :name,
+                             :url)
+    class << self
+      def try_convert(value)
+        case value
+        when self
+          value
+        when String
+          license = new
+          license.spdx_id = value
+          license
+        when Hash
+          license = new
+          license.spdx_id = value[:spdx_id]
+          license.name = value[:name]
+          license.url = value[:url]
+          license
+        else
+          nil
+        end
+      end
+    end
+  end
+end
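A quick sketch of the conversion rules defined above: try_convert accepts an existing License, an SPDX ID String, or a Hash, and returns nil for anything else (the example values are illustrative):

    require "datasets/license"

    Datasets::License.try_convert("MIT")
    # => #<struct Datasets::License spdx_id="MIT", name=nil, url=nil>

    Datasets::License.try_convert(spdx_id: "CC0-1.0",
                                  name: "CC0 1.0 Universal",
                                  url: "https://creativecommons.org/publicdomain/zero/1.0/")
    # => License with all three fields populated

    Datasets::License.try_convert(42)
    # => nil (the caller decides whether that is an error)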
@@ -0,0 +1,80 @@
+require_relative "dataset"
+require_relative "tar-gz-readable"
+
+module Datasets
+  class LivedoorNews < Dataset
+    include TarGzReadable
+    Record = Struct.new(:url,
+                        :timestamp,
+                        :sentence)
+
+    def initialize(type: :topic_news)
+      news_list = [
+        :topic_news,
+        :sports_watch,
+        :it_life_hack,
+        :kaden_channel,
+        :movie_enter,
+        :dokujo_tsushin,
+        :smax,
+        :livedoor_homme,
+        :peachy
+      ]
+      unless news_list.include?(type)
+        valid_type_labels = news_list.collect(&:inspect).join(", ")
+        message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}"
+        raise ArgumentError, message
+      end
+
+      super()
+      @type = type
+      @metadata.id = 'livedoor-news'
+      @metadata.name = 'livedoor-news'
+      @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
+      @metadata.licenses = ['CC-BY-ND-2.1-JP']
+      @metadata.description = lambda do
+        fetch_readme
+      end
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = download_tar_gz
+      parse_data(data_path, &block)
+    end
+
+    private
+    def download_tar_gz
+      data_path = cache_dir_path + "livedoor-news.tar.gz"
+      data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
+      download(data_path, data_url)
+      data_path
+    end
+
+    def fetch_readme
+      data_path = download_tar_gz
+      target_file_name = 'text/README.txt'
+      open_tar_gz(data_path) do |tar|
+        tar.seek(target_file_name) do |entry|
+          return entry.read.force_encoding("UTF-8")
+        end
+      end
+    end
+
+    def parse_data(data_path, &block)
+      target_directory_name = "text/#{@type.to_s.gsub(/_/, '-')}"
+      open_tar_gz(data_path) do |tar|
+        tar.each do |entry|
+          next unless entry.file?
+          directory_name, base_name = File.split(entry.full_name)
+          next unless directory_name == target_directory_name
+          next if base_name == "LICENSE.txt"
+          url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3)
+          record = Record.new(url, Time.iso8601(timestamp), sentence)
+          yield(record)
+        end
+      end
+    end
+  end
+end
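Usage sketch for the new class (the archive is downloaded and cached on first use; category symbols and record fields follow the code above):

    require "datasets"

    livedoor = Datasets::LivedoorNews.new(type: :sports_watch)
    livedoor.each do |record|
      puts record.url        # original article URL
      puts record.timestamp  # Time parsed from the article's ISO 8601 stamp
      puts record.sentence   # remaining text of the article
      break
    end

    Datasets::LivedoorNews.new(type: :unknown)
    # => ArgumentError: :type must be one of [...]: :unknown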
@@ -1,9 +1,23 @@
+require_relative "license"
+
 module Datasets
   class Metadata < Struct.new(:id,
                               :name,
                               :url,
                               :licenses,
                               :description)
+    def licenses=(licenses)
+      licenses = [licenses] unless licenses.is_a?(Array)
+      licenses = licenses.collect do |license|
+        l = License.try_convert(license)
+        if l.nil?
+          raise ArgumentError.new("invalid license: #{license.inspect}")
+        end
+        l
+      end
+      super(licenses)
+    end
+
     def description
       description_raw = super
       if description_raw.respond_to?(:call)
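The writer normalizes everything through License.try_convert, so every `@metadata.licenses = ...` assignment added in this release goes through one validation path. For example:

    metadata = Datasets::Metadata.new
    metadata.licenses = "CC-BY-4.0"     # a bare String is wrapped in an Array
    metadata.licenses.first.spdx_id     # => "CC-BY-4.0"

    metadata.licenses = [{name: "Quora's Terms of Service",
                          url: "https://www.quora.com/about/tos"}]

    metadata.licenses = Object.new
    # => ArgumentError: invalid license: #<Object:...>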
@@ -28,6 +28,7 @@ module Datasets
       @metadata.id = "#{dataset_name.downcase}-#{type}"
       @metadata.name = "#{dataset_name}: #{type}"
       @metadata.url = self.class::BASE_URL
+      @metadata.licenses = licenses
       @type = type

       case type
@@ -45,18 +46,17 @@ module Datasets
       label_path = cache_dir_path + target_file(:label)
       base_url = self.class::BASE_URL

-      unless image_path.exist?
-        download(image_path, base_url + target_file(:image))
-      end
-
-      unless label_path.exist?
-        download(label_path, base_url + target_file(:label))
-      end
+      download(image_path, base_url + target_file(:image))
+      download(label_path, base_url + target_file(:label))

       open_data(image_path, label_path, &block)
     end

     private
+    def licenses
+      []
+    end
+
     def open_data(image_path, label_path, &block)
       labels = parse_labels(label_path)

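The new private #licenses method is a template-method hook: the MNIST base class claims no license by default and subclasses override it (the +4-line fashion-mnist.rb change in the file list is presumably such an override). A hedged sketch of what that override would look like; the actual license value lives in the real fashion-mnist.rb hunk:

    module Datasets
      class FashionMNIST < MNIST
        private
        def licenses
          ["MIT"]  # assumed value for illustration only
        end
      end
    end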
@@ -35,6 +35,7 @@ module Datasets
       @metadata.id = "mushroom"
       @metadata.name = "Mushroom"
       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = lambda do
         read_names
       end
@@ -58,10 +59,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "agaricus-lepiota.data"
-      unless data_path.exist?
-        data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
-        download(data_path, data_url)
-      end
+      data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
+      download(data_path, data_url)
       CSV.open(data_path) do |csv|
         yield(csv)
       end
@@ -69,10 +68,8 @@ module Datasets

     def read_names
       names_path = cache_dir_path + "agaricus-lepiota.names"
-      unless names_path.exist?
-        names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
-        download(names_path, names_url)
-      end
+      names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
+      download(names_path, names_url)
       names_path.read
     end

@@ -0,0 +1,109 @@
+require_relative 'dataset'
+require_relative 'zip-extractor'
+
+module Datasets
+  class NagoyaUniversityConversationCorpus < Dataset
+    Data = Struct.new(
+      :name,
+      :date,
+      :place,
+      :participants,
+      :relationships,
+      :note,
+      :sentences
+    )
+
+    Participant = Struct.new(
+      :id,
+      :attribute,
+      :birthplace,
+      :residence
+    )
+
+    Sentence = Struct.new(:participant_id, :content) do
+      def end?
+        participant_id.nil? and content.nil?
+      end
+    end
+
+    def initialize
+      super()
+      @metadata.id = 'nagoya-university-conversation-corpus'
+      @metadata.name = 'Nagoya University Conversation Corpus'
+      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+      @metadata.licenses = ['CC-BY-NC-ND-4.0']
+      @metadata.description = <<~DESCRIPTION
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        in total about 100 hours of chatting among native speakers of Japanese,
+        transcribed into text.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |input_stream|
+        yield(parse_file(input_stream))
+      end
+    end
+
+    private
+
+    def open_data
+      data_path = cache_dir_path + 'nucc.zip'
+      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+      download(data_path, data_url)
+
+      extractor = ZipExtractor.new(data_path)
+      extractor.extract_files do |input_stream|
+        yield(input_stream)
+      end
+    end
+
+    def parse_file(input_stream)
+      data = Data.new
+      participants = []
+      sentences = []
+
+      input_stream.each do |input|
+        input.each_line(chomp: true) do |line|
+          line.force_encoding('utf-8')
+          if line.start_with?('@データ')
+            data.name = line[4..]
+          elsif line.start_with?('@収集年月日')
+            # mixed cases with and without ':'
+            data.date = line[6..].delete_prefix(':')
+          elsif line.start_with?('@場所')
+            data.place = line[4..]
+          elsif line.start_with?('@参加者の関係')
+            data.relationships = line.split(':', 2)[1]
+          elsif line.start_with?('@参加者')
+            participant = Participant.new
+            participant.id, profiles = line[4..].split(':', 2)
+            participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+            participants << participant
+          elsif line.start_with?('%com')
+            data.note = line.split(':', 2)[1]
+          elsif line == '@END'
+            sentence = Sentence.new
+            sentence.participant_id = nil
+            sentence.content = nil
+
+            sentences << sentence
+          else
+            sentence = Sentence.new
+            sentence.participant_id, sentence.content = line.split(':', 2)
+
+            sentences << sentence
+          end
+        end
+      end
+
+      data.participants = participants
+      data.sentences = sentences
+
+      data
+    end
+  end
+end
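Usage sketch: each conversation file is parsed into one Data struct, so iterating yields whole conversations rather than individual lines:

    require "datasets"

    corpus = Datasets::NagoyaUniversityConversationCorpus.new
    corpus.each do |conversation|
      puts conversation.name
      conversation.participants.each do |participant|
        puts "#{participant.id}: #{participant.attribute}"
      end
      conversation.sentences.each do |sentence|
        break if sentence.end?  # the @END marker becomes a nil/nil sentinel
        puts "#{sentence.participant_id}: #{sentence.content}"
      end
      break
    end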
@@ -1,3 +1,5 @@
+require "csv"
+
 require_relative "dataset"

 module Datasets
@@ -23,10 +25,10 @@ module Datasets
     def initialize
       super
       species = self.class.name.split("::").last.downcase
-      @metadata.id = "palmerpenguins-raw-#{species}"
+      @metadata.id = "palmerpenguins-#{species}"
       @metadata.url = self.class::URL
-      @metadata.licenses = ["CC0"]
-      @data_path = cache_dir_path + "penguins" + (species + ".csv")
+      @metadata.licenses = ["CC0-1.0"]
+      @data_path = cache_dir_path + "#{species}.csv"
     end

     attr_reader :data_path
@@ -44,15 +46,11 @@ module Datasets
     end

     private def open_data
-      download unless data_path.exist?
+      download(data_path, metadata.url)
       CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
         yield csv
       end
     end
-
-    private def download
-      super(data_path, metadata.url)
-    end
   end

   # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
@@ -36,10 +36,8 @@ module Datasets

       base_name = "ptb.#{@type}.txt"
       data_path = cache_dir_path + base_name
-      unless data_path.exist?
-        base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
-        download(data_path, "#{base_url}/#{base_name}")
-      end
+      base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
+      download(data_path, "#{base_url}/#{base_name}")

       parse_data(data_path, &block)
     end
@@ -0,0 +1,67 @@
+require_relative "dataset"
+
+module Datasets
+  class PMJTDatasetList < Dataset
+    Record = Struct.new(:unit,
+                        :open_data_category,
+                        :tag,
+                        :release_time,
+                        :n_volumes,
+                        :type,
+                        :publication_year,
+                        :original_request_code,
+                        :id,
+                        :title,
+                        :text,
+                        :bibliographical_introduction,
+                        :year)
+
+    def initialize
+      super()
+      @metadata.id = "pmjt-dataset-list"
+      @metadata.name = "List of pre-modern Japanese text dataset"
+      @metadata.url = "http://codh.rois.ac.jp/pmjt/"
+      @metadata.licenses = ["CC-BY-SA-4.0"]
+      @metadata.description = <<~DESCRIPTION
+        Pre-Modern Japanese Text, owned by the National Institute of Japanese Literature, releases image and text data as open data.
+        In addition, some texts have description, transcription, and tagging data.
+      DESCRIPTION
+
+      @data_path = cache_dir_path + (@metadata.id + ".csv")
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      latest_version = "201901"
+      url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{latest_version}.csv"
+      download(@data_path, url)
+      CSV.open(@data_path, headers: :first_row, encoding: "Windows-31J:UTF-8") do |csv|
+        csv.each do |row|
+          record = create_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def create_record(csv_row)
+      record = Record.new
+      record.unit = csv_row["(単位)"]
+      record.open_data_category = csv_row["オープンデータ分類"]
+      record.tag = csv_row["タグ"]
+      record.release_time = csv_row["公開時期"]
+      record.n_volumes = csv_row["冊数等"]
+      record.type = csv_row["刊・写"]
+      record.publication_year = csv_row["刊年・書写年"]
+      record.original_request_code = csv_row["原本請求記号"]
+      record.id = csv_row["国文研書誌ID"]
+      record.title = csv_row["書名(統一書名)"]
+      record.text = csv_row["本文"]
+      record.bibliographical_introduction = csv_row["解題"]
+      record.year = csv_row["(西暦)"]
+
+      record
+    end
+  end
+end
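Usage sketch: the published CSV is Windows-31J encoded, but the `encoding: "Windows-31J:UTF-8"` option above transcodes it on read, so records come back as UTF-8 strings keyed by the Record members:

    require "datasets"

    pmjt = Datasets::PMJTDatasetList.new
    pmjt.each do |record|
      puts record.title  # from the 書名(統一書名) column
      puts record.id     # from the 国文研書誌ID column
      break
    end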
@@ -49,9 +49,7 @@ module Datasets
       @metadata.id = "postal-code-japan-#{@reading}"
       @metadata.name = "Postal code in Japan (#{@reading})"
       @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
-      @metadata.licenses = [
-        "CC0-1.0",
-      ]
+      @metadata.licenses = ["CC0-1.0"]
       @metadata.description = "Postal code in Japan (reading: #{@reading})"
     end

@@ -116,9 +114,7 @@ module Datasets
         data_url << "/roman/ken_all_rome.zip"
       end
       data_path = cache_dir_path + "#{@reading}-ken-all.zip"
-      unless data_path.exist?
-        download(data_path, data_url)
-      end
+      download(data_path, data_url)

       Zip::File.open(data_path.to_s) do |zip_file|
         zip_file.each do |entry|
@@ -0,0 +1,51 @@
+require "csv"
+
+require_relative "dataset"
+
+module Datasets
+  class QuoraDuplicateQuestionPair < Dataset
+    class Record < Struct.new(:id,
+                              :first_question_id,
+                              :second_question_id,
+                              :first_question,
+                              :second_question,
+                              :duplicated)
+      alias_method :duplicated?, :duplicated
+    end
+
+    def initialize
+      super()
+      @metadata.id = "quora-duplicate-question-pair"
+      @metadata.name = "Quora's duplicated question pair dataset"
+      @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs"
+      @metadata.licenses = [
+        {
+          name: "Quora's Terms of Service",
+          url: "https://www.quora.com/about/tos",
+        }
+      ]
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |csv|
+        csv.each do |row|
+          row["is_duplicate"] = (row["is_duplicate"] == 1)
+          record = Record.new(*row.fields)
+          yield(record)
+        end
+      end
+    end
+
+    private
+    def open_data
+      data_path = cache_dir_path + "quora_duplicate_questions.tsv"
+      data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
+      download(data_path, data_url)
+      CSV.open(data_path, col_sep: "\t", headers: true, converters: :all) do |csv|
+        yield(csv)
+      end
+    end
+  end
+end
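Usage sketch: `converters: :all` parses is_duplicate as an Integer, which the each loop maps to a boolean before building the Record, so duplicated? works as a predicate:

    require "datasets"

    pairs = Datasets::QuoraDuplicateQuestionPair.new
    pairs.each do |record|
      next unless record.duplicated?
      puts record.first_question
      puts record.second_question
      break
    end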