red-datasets 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136

data/lib/datasets/lazy.rb
@@ -0,0 +1,90 @@
+require_relative "version"
+
+module Datasets
+  class LazyLoader
+    def initialize
+      @constants = {}
+    end
+
+    def exist?(constant_name)
+      @constants.key?(constant_name)
+    end
+
+    def load(constant_name)
+      feature = @constants[constant_name]
+      raise LoadError, "unknown dataset: #{constant_name}" unless feature
+      require feature
+    end
+
+    def load_all
+      @constants.each_value do |feature|
+        require feature
+      end
+    end
+
+    def register(constant_name, feature)
+      @constants[constant_name] = feature
+    end
+
+    def constant_names
+      @constants.keys
+    end
+  end
+
+  LAZY_LOADER = LazyLoader.new
+
+  class << self
+    def const_missing(name)
+      if LAZY_LOADER.exist?(name)
+        LAZY_LOADER.load(name)
+        const_get(name)
+      else
+        super
+      end
+    end
+  end
+
+  LAZY_LOADER.register(:Adult, "datasets/adult")
+  LAZY_LOADER.register(:AFINN, "datasets/afinn")
+  LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
+  LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
+  LAZY_LOADER.register(:CIFAR, "datasets/cifar")
+  LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
+  LAZY_LOADER.register(:Communities, "datasets/communities")
+  LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
+  LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
+  LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
+  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
+  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
+  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+  LAZY_LOADER.register(:Iris, "datasets/iris")
+  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
+  LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
+  LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
+  LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
+  LAZY_LOADER.register(:MNIST, "datasets/mnist")
+  LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+  LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                       "datasets/nagoya-university-conversation-corpus")
+  LAZY_LOADER.register(:Penguins, "datasets/penguins")
+  LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
+  LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
+  LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
+  LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
+                       "datasets/quora-duplicate-question-pair")
+  LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
+  LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
+  LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
+  LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
+  LAZY_LOADER.register(:SudachiSynonymDictionary,
+                       "datasets/sudachi-synonym-dictionary")
+  LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
+  LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
+                       "datasets/wikipedia-kyoto-japanese-english")
+  LAZY_LOADER.register(:Wine, "datasets/wine")
+end
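
Note on the effect of this new file: require "datasets" no longer loads every dataset implementation eagerly; a dataset file is required the first time its constant is referenced, via const_missing. A minimal usage sketch (any constant registered above behaves the same; Datasets::Iris is just an example):

  require "datasets"         # loads only the registry above, not every dataset file

  iris = Datasets::Iris.new  # const_missing fires, requires "datasets/iris",
                             # then const_get returns the now-defined class
  iris.each do |record|
    p record
    break
  end

  Datasets::Nonexistent      # unregistered names fall through to super (NameError)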

data/lib/datasets/libsvm-dataset-list.rb
@@ -28,6 +28,7 @@ module Datasets
       @metadata.id = "libsvm-dataset-list"
       @metadata.name = "LIBSVM dataset list"
       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
+      @metadata.licenses = ["BSD-3-Clause"]
       @metadata.description = lambda do
         extract_description
       end
@@ -51,10 +52,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "index.html"
-      unless data_path.exist?
-        download(data_path, @metadata.url)
-      end
-      ::File.open(data_path) do |input|
+      download(data_path, @metadata.url)
+      data_path.open do |input|
         yield(input)
       end
     end
@@ -78,10 +77,8 @@ module Datasets
 
     def open_detail(detail)
       data_path = cache_dir_path + detail
-      unless data_path.exist?
-        download(data_path, @metadata.url + detail)
-      end
-      ::File.open(data_path) do |input|
+      download(data_path, @metadata.url + detail)
+      data_path.open do |input|
         yield(input)
       end
     end
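
These two hunks show a pattern repeated across many files in this release: the "unless data_path.exist?" guard disappears from call sites. Given the reworked data/lib/datasets/downloader.rb (+110 -30) in the file list, the cache check has presumably moved into the download machinery itself. A hedged sketch of the behavior the call sites now rely on (not the actual downloader code; the method shape here is an assumption):

  # hypothetical shape of the download helper after this release
  def download(output_path, url)
    return if output_path.exist?   # assumed: a cache hit skips the network
    downloader = Downloader.new(url)
    downloader.download(output_path)
  end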

data/lib/datasets/libsvm.rb
@@ -41,6 +41,7 @@ module Datasets
       @metadata.id = "libsvm-#{normalize_name(name)}"
       @metadata.name = "LIBSVM dataset: #{name}"
       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
+      @metadata.licenses = ["BSD-3-Clause"]
     end
 
     def each
@@ -99,13 +100,11 @@ module Datasets
 
     def open_data(&block)
       data_path = cache_dir_path + @file.name
-      unless data_path.exist?
-        download(data_path, @file.url)
-      end
+      download(data_path, @file.url)
       if data_path.extname == ".bz2"
         extract_bz2(data_path, &block)
       else
-        File.open(data_path, &block)
+        data_path.open(&block)
       end
     end
 

data/lib/datasets/license.rb
@@ -0,0 +1,26 @@
+module Datasets
+  class License < Struct.new(:spdx_id,
+                             :name,
+                             :url)
+    class << self
+      def try_convert(value)
+        case value
+        when self
+          value
+        when String
+          license = new
+          license.spdx_id = value
+          license
+        when Hash
+          license = new
+          license.spdx_id = value[:spdx_id]
+          license.name = value[:name]
+          license.url = value[:url]
+          license
+        else
+          nil
+        end
+      end
+    end
+  end
+end
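
A usage sketch of the three convertible shapes (results abbreviated):

  Datasets::License.try_convert("Apache-2.0")
  # => License with spdx_id = "Apache-2.0"
  Datasets::License.try_convert(spdx_id: "CC0-1.0",
                                name: "CC0 1.0",
                                url: "https://creativecommons.org/publicdomain/zero/1.0/")
  # => License with all three members set
  Datasets::License.try_convert(42)
  # => nil, following Ruby's try_convert convention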

data/lib/datasets/livedoor-news.rb
@@ -0,0 +1,80 @@
+require_relative "dataset"
+require_relative "tar-gz-readable"
+
+module Datasets
+  class LivedoorNews < Dataset
+    include TarGzReadable
+    Record = Struct.new(:url,
+                        :timestamp,
+                        :sentence)
+
+    def initialize(type: :topic_news)
+      news_list = [
+        :topic_news,
+        :sports_watch,
+        :it_life_hack,
+        :kaden_channel,
+        :movie_enter,
+        :dokujo_tsushin,
+        :smax,
+        :livedoor_homme,
+        :peachy
+      ]
+      unless news_list.include?(type)
+        valid_type_labels = news_list.collect(&:inspect).join(", ")
+        message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}"
+        raise ArgumentError, message
+      end
+
+      super()
+      @type = type
+      @metadata.id = 'livedoor-news'
+      @metadata.name = 'livedoor-news'
+      @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
+      @metadata.licenses = ['CC-BY-ND-2.1-JP']
+      @metadata.description = lambda do
+        fetch_readme
+      end
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = download_tar_gz
+      parse_data(data_path, &block)
+    end
+
+    private
+    def download_tar_gz
+      data_path = cache_dir_path + "livedoor-news.tar.gz"
+      data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
+      download(data_path, data_url)
+      data_path
+    end
+
+    def fetch_readme
+      data_path = download_tar_gz
+      target_file_name = 'text/README.txt'
+      open_tar_gz(data_path) do |tar|
+        tar.seek(target_file_name) do |entry|
+          return entry.read.force_encoding("UTF-8")
+        end
+      end
+    end
+
+    def parse_data(data_path, &block)
+      target_directory_name = "text/#{@type.to_s.gsub(/_/, '-')}"
+      open_tar_gz(data_path) do |tar|
+        tar.each do |entry|
+          next unless entry.file?
+          directory_name, base_name = File.split(entry.full_name)
+          next unless directory_name == target_directory_name
+          next if base_name == "LICENSE.txt"
+          url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3)
+          record = Record.new(url, Time.iso8601(timestamp), sentence)
+          yield(record)
+        end
+      end
+    end
+  end
+end
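
A usage sketch (the category symbols are the ones validated in initialize above):

  news = Datasets::LivedoorNews.new(type: :sports_watch)
  news.each do |record|
    record.url        # original article URL
    record.timestamp  # Time parsed from the ISO 8601 header line
    record.sentence   # remaining article text
    break
  end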

data/lib/datasets/metadata.rb
@@ -1,9 +1,23 @@
+require_relative "license"
+
 module Datasets
   class Metadata < Struct.new(:id,
                               :name,
                               :url,
                               :licenses,
                               :description)
+    def licenses=(licenses)
+      licenses = [licenses] unless licenses.is_a?(Array)
+      licenses = licenses.collect do |license|
+        l = License.try_convert(license)
+        if l.nil?
+          raise ArgumentError.new("invalid license: #{license.inspect}")
+        end
+        l
+      end
+      super(licenses)
+    end
+
     def description
       description_raw = super
       if description_raw.respond_to?(:call)
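
The setter normalizes everything it receives into an array of License structs, so a bare value and a mixed array both work; a sketch:

  metadata = Datasets::Metadata.new
  metadata.licenses = "CC0-1.0"      # non-Array values are wrapped first
  metadata.licenses = [
    "BSD-3-Clause",
    {name: "Quora's Terms of Service", url: "https://www.quora.com/about/tos"},
  ]
  metadata.licenses = Object.new     # raises ArgumentError (invalid license: ...)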

data/lib/datasets/mnist.rb
@@ -28,6 +28,7 @@ module Datasets
       @metadata.id = "#{dataset_name.downcase}-#{type}"
       @metadata.name = "#{dataset_name}: #{type}"
       @metadata.url = self.class::BASE_URL
+      @metadata.licenses = licenses
       @type = type
 
       case type
@@ -45,18 +46,17 @@ module Datasets
       label_path = cache_dir_path + target_file(:label)
       base_url = self.class::BASE_URL
 
-      unless image_path.exist?
-        download(image_path, base_url + target_file(:image))
-      end
-
-      unless label_path.exist?
-        download(label_path, base_url + target_file(:label))
-      end
+      download(image_path, base_url + target_file(:image))
+      download(label_path, base_url + target_file(:label))
 
       open_data(image_path, label_path, &block)
     end
 
     private
+    def licenses
+      []
+    end
+
     def open_data(image_path, label_path, &block)
       labels = parse_labels(label_path)
 
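
The new private licenses method is a hook with an empty default: the MNIST base class declares no license, and subclasses can override it. A hedged sketch of what an override would look like (the subclass body and SPDX ID here are illustrative, not taken from this diff):

  class KuzushijiMNIST < MNIST
    private
    def licenses
      ["CC-BY-SA-4.0"]
    end
  end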

data/lib/datasets/mushroom.rb
@@ -35,6 +35,7 @@ module Datasets
       @metadata.id = "mushroom"
       @metadata.name = "Mushroom"
       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = lambda do
         read_names
       end
@@ -58,10 +59,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "agaricus-lepiota.data"
-      unless data_path.exist?
-        data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
-        download(data_path, data_url)
-      end
+      data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
+      download(data_path, data_url)
       CSV.open(data_path) do |csv|
         yield(csv)
       end
@@ -69,10 +68,8 @@ module Datasets
 
     def read_names
       names_path = cache_dir_path + "agaricus-lepiota.names"
-      unless names_path.exist?
-        names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
-        download(names_path, names_url)
-      end
+      names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
+      download(names_path, names_url)
       names_path.read
     end
 

data/lib/datasets/nagoya-university-conversation-corpus.rb
@@ -0,0 +1,109 @@
+require_relative 'dataset'
+require_relative 'zip-extractor'
+
+module Datasets
+  class NagoyaUniversityConversationCorpus < Dataset
+    Data = Struct.new(
+      :name,
+      :date,
+      :place,
+      :participants,
+      :relationships,
+      :note,
+      :sentences
+    )
+
+    Participant = Struct.new(
+      :id,
+      :attribute,
+      :birthplace,
+      :residence
+    )
+
+    Sentence = Struct.new(:participant_id, :content) do
+      def end?
+        participant_id.nil? and content.nil?
+      end
+    end
+
+    def initialize
+      super()
+      @metadata.id = 'nagoya-university-conversation-curpus'
+      @metadata.name = 'Nagoya University Conversation Curpus'
+      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+      @metadata.licenses = ['CC-BY-NC-ND-4.0']
+      @metadata.description = <<~DESCRIPTION
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |input_stream|
+        yield(parse_file(input_stream))
+      end
+    end
+
+    private
+
+    def open_data
+      data_path = cache_dir_path + 'nucc.zip'
+      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+      download(data_path, data_url)
+
+      extractor = ZipExtractor.new(data_path)
+      extractor.extract_files do |input_stream|
+        yield(input_stream)
+      end
+    end
+
+    def parse_file(input_stream)
+      data = Data.new
+      participants = []
+      sentences = []
+
+      input_stream.each do |input|
+        input.each_line(chomp: true) do |line|
+          line.force_encoding('utf-8')
+          if line.start_with?('@データ')
+            data.name = line[4..]
+          elsif line.start_with?('@収集年月日')
+            # mixed cases with and without ':'
+            data.date = line[6..].delete_prefix(':')
+          elsif line.start_with?('@場所')
+            data.place = line[4..]
+          elsif line.start_with?('@参加者の関係')
+            data.relationships = line.split(':', 2)[1]
+          elsif line.start_with?('@参加者')
+            participant = Participant.new
+            participant.id, profiles = line[4..].split(':', 2)
+            participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+            participants << participant
+          elsif line.start_with?('%com')
+            data.note = line.split(':', 2)[1]
+          elsif line == '@END'
+            sentence = Sentence.new
+            sentence.participant_id = nil
+            sentence.content = nil
+
+            sentences << sentence
+          else
+            sentence = Sentence.new
+            sentence.participant_id, sentence.content = line.split(':', 2)
+
+            sentences << sentence
+          end
+        end
+      end
+
+      data.participants = participants
+      data.sentences = sentences
+
+      data
+    end
+  end
+end
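
A usage sketch; each yielded Data is one conversation file, and @END markers surface as sentences whose end? is true:

  corpus = Datasets::NagoyaUniversityConversationCorpus.new
  conversation = corpus.each.first
  conversation.name                  # value of the @データ header line
  conversation.participants.each do |participant|
    puts "#{participant.id}: #{participant.attribute}"
  end
  conversation.sentences.reject(&:end?).each do |sentence|
    puts "#{sentence.participant_id}: #{sentence.content}"
  end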

data/lib/datasets/penguins.rb
@@ -1,3 +1,5 @@
+require "csv"
+
 require_relative "dataset"
 
 module Datasets
@@ -23,10 +25,10 @@ module Datasets
     def initialize
       super
       species = self.class.name.split("::").last.downcase
-      @metadata.id = "palmerpenguins-raw-#{species}"
+      @metadata.id = "palmerpenguins-#{species}"
       @metadata.url = self.class::URL
-      @metadata.licenses = ["CC0"]
-      @data_path = cache_dir_path + "penguins" + (species + ".csv")
+      @metadata.licenses = ["CC0-1.0"]
+      @data_path = cache_dir_path + "#{species}.csv"
     end
 
     attr_reader :data_path
@@ -44,15 +46,11 @@ module Datasets
     end
 
     private def open_data
-      download unless data_path.exist?
+      download(data_path, metadata.url)
       CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
        yield csv
      end
    end
-
-    private def download
-      super(data_path, metadata.url)
-    end
  end

  # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86

data/lib/datasets/penn-treebank.rb
@@ -36,10 +36,8 @@ module Datasets
 
       base_name = "ptb.#{@type}.txt"
       data_path = cache_dir_path + base_name
-      unless data_path.exist?
-        base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
-        download(data_path, "#{base_url}/#{base_name}")
-      end
+      base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
+      download(data_path, "#{base_url}/#{base_name}")
 
       parse_data(data_path, &block)
     end

data/lib/datasets/pmjt-dataset-list.rb
@@ -0,0 +1,67 @@
+require_relative "dataset"
+
+module Datasets
+  class PMJTDatasetList < Dataset
+    Record = Struct.new(:unit,
+                        :open_data_category,
+                        :tag,
+                        :release_time,
+                        :n_volumes,
+                        :type,
+                        :publication_year,
+                        :original_request_code,
+                        :id,
+                        :title,
+                        :text,
+                        :bibliographical_introduction,
+                        :year)
+
+    def initialize
+      super()
+      @metadata.id = "pmjt-dataset-list"
+      @metadata.name = "List of pre-modern Japanese text dataset"
+      @metadata.url = "http://codh.rois.ac.jp/pmjt/"
+      @metadata.licenses = ["CC-BY-SA-4.0"]
+      @metadata.description = <<~DESCRIPTION
+        Pre-Modern Japanese Text, owned by National Institute of Japanese Literature, is released image and text data as open data.
+        In addition, some text has description, transcription, and tagging data.
+      DESCRIPTION
+
+      @data_path = cache_dir_path + (@metadata.id + ".csv")
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      latest_version = "201901"
+      url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{latest_version}.csv"
+      download(@data_path, url)
+      CSV.open(@data_path, headers: :first_row, encoding: "Windows-31J:UTF-8") do |csv|
+        csv.each do |row|
+          record = create_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def create_record(csv_row)
+      record = Record.new
+      record.unit = csv_row["(単位)"]
+      record.open_data_category = csv_row["オープンデータ分類"]
+      record.tag = csv_row["タグ"]
+      record.release_time = csv_row["公開時期"]
+      record.n_volumes = csv_row["冊数等"]
+      record.type = csv_row["刊・写"]
+      record.publication_year = csv_row["刊年・書写年"]
+      record.original_request_code = csv_row["原本請求記号"]
+      record.id = csv_row["国文研書誌ID"]
+      record.title = csv_row["書名(統一書名)"]
+      record.text = csv_row["本文"]
+      record.bibliographical_introduction = csv_row["解題"]
+      record.year = csv_row["(西暦)"]
+
+      record
+    end
+  end
+end
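
A usage sketch; CSV.open transcodes the Windows-31J source to UTF-8 on read, and the Japanese column headers map onto the English struct members assigned in create_record:

  list = Datasets::PMJTDatasetList.new
  list.each do |record|
    puts "#{record.id}: #{record.title} (#{record.publication_year})"
    break
  end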

data/lib/datasets/postal-code-japan.rb
@@ -49,9 +49,7 @@ module Datasets
       @metadata.id = "postal-code-japan-#{@reading}"
       @metadata.name = "Postal code in Japan (#{@reading})"
       @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
-      @metadata.licenses = [
-        "CC0-1.0",
-      ]
+      @metadata.licenses = ["CC0-1.0"]
       @metadata.description = "Postal code in Japan (reading: #{@reading})"
     end
 
@@ -116,9 +114,7 @@ module Datasets
         data_url << "/roman/ken_all_rome.zip"
       end
       data_path = cache_dir_path + "#{@reading}-ken-all.zip"
-      unless data_path.exist?
-        download(data_path, data_url)
-      end
+      download(data_path, data_url)
 
       Zip::File.open(data_path.to_s) do |zip_file|
         zip_file.each do |entry|

data/lib/datasets/quora-duplicate-question-pair.rb
@@ -0,0 +1,51 @@
+require "csv"
+
+require_relative "dataset"
+
+module Datasets
+  class QuoraDuplicateQuestionPair < Dataset
+    class Record < Struct.new(:id,
+                              :first_question_id,
+                              :second_question_id,
+                              :first_question,
+                              :second_question,
+                              :duplicated)
+      alias_method :duplicated?, :duplicated
+    end
+
+    def initialize
+      super()
+      @metadata.id = "quora-duplicate-question-pair"
+      @metadata.name = "Quora's duplicated question pair dataset"
+      @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs"
+      @metadata.licenses = [
+        {
+          name: "Quora's Terms of Service",
+          url: "https://www.quora.com/about/tos",
+        }
+      ]
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |csv|
+        csv.each do |row|
+          row["is_duplicate"] = (row["is_duplicate"] == 1)
+          record = Record.new(*row.fields)
+          yield(record)
+        end
+      end
+    end
+
+    private
+    def open_data
+      data_path = cache_dir_path + "quora_duplicate_questions.tsv"
+      data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
+      download(data_path, data_url)
+      CSV.open(data_path, col_sep: "\t", headers: true, converters: :all) do |csv|
+        yield(csv)
+      end
+    end
+  end
+end
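
A usage sketch; note that each converts the raw 0/1 is_duplicate column into a boolean before building each Record:

  pairs = Datasets::QuoraDuplicateQuestionPair.new
  pairs.each do |record|
    next unless record.duplicated?
    puts "#{record.first_question} / #{record.second_question}"
    break
  end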