red-datasets 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -2
  3. data/doc/text/news.md +86 -0
  4. data/lib/datasets/adult.rb +6 -9
  5. data/lib/datasets/afinn.rb +48 -0
  6. data/lib/datasets/aozora-bunko.rb +196 -0
  7. data/lib/datasets/cache-path.rb +28 -0
  8. data/lib/datasets/california-housing.rb +60 -0
  9. data/lib/datasets/cifar.rb +2 -4
  10. data/lib/datasets/cldr-plurals.rb +2 -4
  11. data/lib/datasets/communities.rb +5 -8
  12. data/lib/datasets/dataset.rb +8 -12
  13. data/lib/datasets/diamonds.rb +26 -0
  14. data/lib/datasets/downloader.rb +6 -1
  15. data/lib/datasets/e-stat-japan.rb +2 -1
  16. data/lib/datasets/fashion-mnist.rb +4 -0
  17. data/lib/datasets/fuel-economy.rb +35 -0
  18. data/lib/datasets/geolonia.rb +67 -0
  19. data/lib/datasets/ggplot2-dataset.rb +79 -0
  20. data/lib/datasets/hepatitis.rb +5 -8
  21. data/lib/datasets/iris.rb +5 -8
  22. data/lib/datasets/ita-corpus.rb +57 -0
  23. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  24. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  25. data/lib/datasets/libsvm.rb +3 -4
  26. data/lib/datasets/license.rb +26 -0
  27. data/lib/datasets/livedoor-news.rb +80 -0
  28. data/lib/datasets/metadata.rb +14 -0
  29. data/lib/datasets/mnist.rb +7 -7
  30. data/lib/datasets/mushroom.rb +5 -8
  31. data/lib/datasets/penguins.rb +4 -8
  32. data/lib/datasets/penn-treebank.rb +2 -4
  33. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  34. data/lib/datasets/postal-code-japan.rb +2 -6
  35. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  36. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  37. data/lib/datasets/seaborn.rb +90 -0
  38. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  39. data/lib/datasets/version.rb +1 -1
  40. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  41. data/lib/datasets/wikipedia.rb +4 -5
  42. data/lib/datasets/wine.rb +6 -9
  43. data/lib/datasets/zip-extractor.rb +36 -0
  44. data/lib/datasets.rb +14 -2
  45. data/red-datasets.gemspec +1 -1
  46. data/test/helper.rb +21 -0
  47. data/test/test-afinn.rb +60 -0
  48. data/test/test-aozora-bunko.rb +190 -0
  49. data/test/test-california-housing.rb +56 -0
  50. data/test/test-cldr-plurals.rb +1 -1
  51. data/test/test-dataset.rb +15 -7
  52. data/test/test-diamonds.rb +71 -0
  53. data/test/test-fuel-economy.rb +75 -0
  54. data/test/test-geolonia.rb +64 -0
  55. data/test/test-ita-corpus.rb +69 -0
  56. data/test/test-kuzushiji-mnist.rb +137 -0
  57. data/test/test-license.rb +24 -0
  58. data/test/test-livedoor-news.rb +351 -0
  59. data/test/test-metadata.rb +36 -0
  60. data/test/test-penguins.rb +1 -1
  61. data/test/test-pmjt-dataset-list.rb +50 -0
  62. data/test/test-quora-duplicate-question-pair.rb +33 -0
  63. data/test/test-rdataset.rb +246 -0
  64. data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
  65. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  66. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  67. metadata +58 -14
  68. data/lib/datasets/seaborn-data.rb +0 -49
  69. data/test/test-rdatasets.rb +0 -136
data/lib/datasets/pmjt-dataset-list.rb ADDED
@@ -0,0 +1,67 @@
+require_relative "dataset"
+
+module Datasets
+  class PMJTDatasetList < Dataset
+    Record = Struct.new(:unit,
+                        :open_data_category,
+                        :tag,
+                        :release_time,
+                        :n_volumes,
+                        :type,
+                        :publication_year,
+                        :original_request_code,
+                        :id,
+                        :title,
+                        :text,
+                        :bibliographical_introduction,
+                        :year)
+
+    def initialize
+      super()
+      @metadata.id = "pmjt-dataset-list"
+      @metadata.name = "List of pre-modern Japanese text dataset"
+      @metadata.url = "http://codh.rois.ac.jp/pmjt/"
+      @metadata.licenses = ["CC-BY-SA-4.0"]
+      @metadata.description = <<~DESCRIPTION
+        Pre-Modern Japanese Text, owned by National Institute of Japanese Literature, is released image and text data as open data.
+        In addition, some text has description, transcription, and tagging data.
+      DESCRIPTION
+
+      @data_path = cache_dir_path + (@metadata.id + ".csv")
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      latest_version = "201901"
+      url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{latest_version}.csv"
+      download(@data_path, url)
+      CSV.open(@data_path, headers: :first_row, encoding: "Windows-31J:UTF-8") do |csv|
+        csv.each do |row|
+          record = create_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def create_record(csv_row)
+      record = Record.new
+      record.unit = csv_row["(単位)"]
+      record.open_data_category = csv_row["オープンデータ分類"]
+      record.tag = csv_row["タグ"]
+      record.release_time = csv_row["公開時期"]
+      record.n_volumes = csv_row["冊数等"]
+      record.type = csv_row["刊・写"]
+      record.publication_year = csv_row["刊年・書写年"]
+      record.original_request_code = csv_row["原本請求記号"]
+      record.id = csv_row["国文研書誌ID"]
+      record.title = csv_row["書名(統一書名)"]
+      record.text = csv_row["本文"]
+      record.bibliographical_introduction = csv_row["解題"]
+      record.year = csv_row["(西暦)"]
+
+      record
+    end
+  end
+end
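
For orientation, a minimal usage sketch (not part of the diff; it only uses the class and Record fields defined above, assuming the gem's usual `require "datasets"` entry point):

    require "datasets"

    # Iterates the CSV-backed list; each record is a PMJTDatasetList::Record.
    Datasets::PMJTDatasetList.new.each do |record|
      puts "#{record.id}: #{record.title} (#{record.publication_year})"
    end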
data/lib/datasets/postal-code-japan.rb CHANGED
@@ -49,9 +49,7 @@ module Datasets
       @metadata.id = "postal-code-japan-#{@reading}"
       @metadata.name = "Postal code in Japan (#{@reading})"
       @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
-      @metadata.licenses = [
-        "CC0-1.0",
-      ]
+      @metadata.licenses = ["CC0-1.0"]
       @metadata.description = "Postal code in Japan (reading: #{@reading})"
     end
 
@@ -116,9 +114,7 @@ module Datasets
         data_url << "/roman/ken_all_rome.zip"
       end
       data_path = cache_dir_path + "#{@reading}-ken-all.zip"
-      unless data_path.exist?
-        download(data_path, data_url)
-      end
+      download(data_path, data_url)
 
       Zip::File.open(data_path.to_s) do |zip_file|
         zip_file.each do |entry|
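
Note the recurring pattern in this release: per-caller `unless data_path.exist?` guards around `download` are removed here and in several files below. A plausible reading, given the `downloader.rb +6 -1` entry in the file list, is that the cache check moved into the shared downloader. A hypothetical sketch of that assumed guard (the actual downloader.rb change is not shown in this diff):

    class Downloader
      def download(output_path)
        return if output_path.exist? # assumed early return, replacing per-caller checks
        # ... fetch the URL and write to output_path as before ...
      end
    end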
data/lib/datasets/quora-duplicate-question-pair.rb ADDED
@@ -0,0 +1,51 @@
+require "csv"
+
+require_relative "dataset"
+
+module Datasets
+  class QuoraDuplicateQuestionPair < Dataset
+    class Record < Struct.new(:id,
+                              :first_question_id,
+                              :second_question_id,
+                              :first_question,
+                              :second_question,
+                              :duplicated)
+      alias_method :duplicated?, :duplicated
+    end
+
+    def initialize
+      super()
+      @metadata.id = "quora-duplicate-question-pair"
+      @metadata.name = "Quora's duplicated question pair dataset"
+      @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs"
+      @metadata.licenses = [
+        {
+          name: "Quora's Terms of Service",
+          url: "https://www.quora.com/about/tos",
+        }
+      ]
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |csv|
+        csv.each do |row|
+          row["is_duplicate"] = (row["is_duplicate"] == 1)
+          record = Record.new(*row.fields)
+          yield(record)
+        end
+      end
+    end
+
+    private
+    def open_data
+      data_path = cache_dir_path + "quora_duplicate_questions.tsv"
+      data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
+      download(data_path, data_url)
+      CSV.open(data_path, col_sep: "\t", headers: true, converters: :all) do |csv|
+        yield(csv)
+      end
+    end
+  end
+end
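
A minimal usage sketch (not from the diff; `duplicated?` is the alias defined on Record above, and `#each` converts the `is_duplicate` column to a boolean):

    require "datasets"

    Datasets::QuoraDuplicateQuestionPair.new.each do |record|
      next unless record.duplicated?
      puts "#{record.first_question} <=> #{record.second_question}"
    end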
data/lib/datasets/{rdatasets.rb → rdataset.rb} RENAMED
@@ -2,7 +2,7 @@ require_relative "dataset"
 require_relative "tar-gz-readable"
 
 module Datasets
-  class RdatasetsList < Dataset
+  class RdatasetList < Dataset
     Record = Struct.new(:package,
                         :dataset,
                         :title,
@@ -18,8 +18,8 @@ module Datasets
 
     def initialize
       super
-      @metadata.id = "rdatasets"
-      @metadata.name = "Rdatasets"
+      @metadata.id = "rdataset-list"
+      @metadata.name = "Rdataset"
       @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
       @metadata.licenses = ["GPL-3"]
       @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
@@ -48,16 +48,19 @@ module Datasets
     end
 
     private def each_row(&block)
-      download(@data_path, @data_url) unless @data_path.exist?
+      download(@data_path, @data_url)
       CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
         csv.each(&block)
       end
     end
   end
 
-  class Rdatasets < Dataset
+  # For backward compatibility
+  RdatasetsList = RdatasetList
+
+  class Rdataset < Dataset
     def initialize(package_name, dataset_name)
-      list = RdatasetsList.new
+      list = RdatasetList.new
 
       info = list.filter(package: package_name, dataset: dataset_name).first
       unless info
@@ -65,8 +68,8 @@ module Datasets
       end
 
       super()
-      @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
-      @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
+      @metadata.id = "rdataset-#{package_name}-#{dataset_name}"
+      @metadata.name = "Rdataset: #{package_name}: #{dataset_name}"
       @metadata.url = info.csv
       @metadata.licenses = ["GPL-3"]
       @metadata.description = info.title
@@ -81,15 +84,63 @@ module Datasets
     def each(&block)
       return to_enum(__method__) unless block_given?
 
-      download(@data_path, @metadata.url) unless @data_path.exist?
-      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
-        csv.each do |row|
-          record = row.to_h
-          record.delete("")
-          record.transform_keys!(&:to_sym)
-          yield record
+      download(@data_path, @metadata.url)
+
+      na_converter = lambda do |field|
+        begin
+          if field.encode(CSV::ConverterEncoding) == "NA"
+            nil
+          else
+            field
+          end
+        rescue
+          field
         end
       end
+
+      inf_converter = lambda do |field|
+        begin
+          if field.encode(CSV::ConverterEncoding) == "Inf"
+            Float::INFINITY
+          else
+            field
+          end
+        rescue
+          field
+        end
+      end
+
+      quote_preserving_converter = lambda do |field, info|
+        f = field.encode(CSV::ConverterEncoding)
+        return f if info.quoted?
+
+        begin
+          begin
+            begin
+              return DateTime.parse(f) if f.match?(DateTimeMatcher)
+            rescue
+              return Integer(f)
+            end
+          rescue
+            return Float(f)
+          end
+        rescue
+          field
+        end
+      end
+
+      table = CSV.table(@data_path,
+                        header_converters: [:symbol_raw],
+                        # quote_preserving_converter should be the last
+                        converters: [na_converter, inf_converter, quote_preserving_converter])
+      table.delete(:"") # delete 1st column for indices.
+
+      table.each do |row|
+        yield row.to_h
+      end
     end
   end
+
+  # For backward compatibility
+  Rdatasets = Rdataset
 end
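
The rename keeps the old constants working (`RdatasetsList = RdatasetList`, `Rdatasets = Rdataset`). A minimal usage sketch under those definitions; the exact hash keys depend on each dataset's CSV header:

    require "datasets"

    # Rows are now yielded as Hashes built by CSV.table; "NA" fields arrive
    # as nil and "Inf" fields as Float::INFINITY thanks to the new converters.
    Datasets::Rdataset.new("datasets", "iris").each do |row|
      p row
    end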
data/lib/datasets/seaborn.rb ADDED
@@ -0,0 +1,90 @@
+require "json"
+
+module Datasets
+  class SeabornList < Dataset
+    def initialize
+      super
+      @metadata.id = "seaborn-data-list"
+      @metadata.name = "seaborn: data list"
+      @metadata.url = "https://github.com/mwaskom/seaborn-data"
+      # Treat as the same license as seaborn
+      @metadata.licenses = ["BSD-3-Clause"]
+      @metadata.description = "Datasets for seaborn examples."
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = cache_dir_path + "trees.json"
+      url = "https://api.github.com/repos/mwaskom/seaborn-data/git/trees/master"
+      download(data_path, url)
+
+      tree = JSON.parse(File.read(data_path))["tree"]
+      tree.each do |content|
+        path = content["path"]
+        next unless path.end_with?(".csv")
+        dataset = File.basename(path, ".csv")
+        record = {dataset: dataset}
+        yield record
+      end
+    end
+  end
+
+  class Seaborn < Dataset
+    URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
+
+    def initialize(name)
+      super()
+      @metadata.id = "seaborn-#{name}"
+      @metadata.name = "seaborn: #{name}"
+      @metadata.url = URL_FORMAT % {name: name}
+      # @metadata.licenses = TODO
+
+      @name = name
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = cache_dir_path + "#{@name}.csv"
+      download(data_path, @metadata.url)
+      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
+        csv.each do |row|
+          record = prepare_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def prepare_record(csv_row)
+      record = csv_row.to_h
+      record.transform_keys! do |key|
+        if key.nil?
+          :index
+        else
+          key.to_sym
+        end
+      end
+
+      # Perform the same preprocessing as seaborn's load_dataset function
+      preprocessor = :"preprocess_#{@name}_record"
+      __send__(preprocessor, record) if respond_to?(preprocessor, true)
+
+      record
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_flights_record(record)
+      record[:month] &&= record[:month][0,3]
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_penguins_record(record)
+      record[:sex] &&= record[:sex].capitalize
+    end
+  end
+
+  # For backward compatibility
+  SeabornData = Seaborn
+end
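
A minimal usage sketch (not from the diff; `Datasets::SeabornData` remains available via the alias above). An unnamed leading column becomes the `:index` key, and the `flights`/`penguins` datasets get the same preprocessing as seaborn's `load_dataset`:

    require "datasets"

    Datasets::Seaborn.new("flights").each do |record|
      p record[:month] # truncated to three letters, as in seaborn.load_dataset
    end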
data/lib/datasets/sudachi-synonym-dictionary.rb CHANGED
@@ -21,9 +21,7 @@ module Datasets
       @metadata.id = "sudachi-synonym-dictionary"
       @metadata.name = "Sudachi synonym dictionary"
       @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
-      @metadata.licenses = [
-        "Apache-2.0",
-      ]
+      @metadata.licenses = ["Apache-2.0"]
       @metadata.description = lambda do
         download_description
       end
@@ -65,10 +63,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "synonyms.txt"
-      unless data_path.exist?
-        data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
-        download(data_path, data_url)
-      end
+      data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
+      download(data_path, data_url)
       CSV.open(data_path,
                encoding: "UTF-8",
                skip_blanks: true) do |csv|
@@ -78,10 +74,8 @@
 
     def download_description
       description_path = cache_dir_path + "synonyms.md"
-      unless description_path.exist?
-        description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
-        download(description_path, description_url)
-      end
+      description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
+      download(description_path, description_url)
       description_path.read
     end
 
data/lib/datasets/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Datasets
-  VERSION = "0.1.4"
+  VERSION = "0.1.5"
 end
data/lib/datasets/wikipedia-kyoto-japanese-english.rb ADDED
@@ -0,0 +1,219 @@
+require "csv"
+require "rexml/streamlistener"
+require "rexml/parsers/baseparser"
+require "rexml/parsers/streamparser"
+require "time"
+
+require_relative "dataset"
+require_relative "tar-gz-readable"
+
+module Datasets
+  class WikipediaKyotoJapaneseEnglish < Dataset
+    include TarGzReadable
+
+    Article = Struct.new(:source,
+                         :copyright,
+                         :contents,
+                         :sections)
+
+    Section = Struct.new(:id,
+                         :title,
+                         :contents)
+
+    class Title < Struct.new(:section,
+                             :japanese,
+                             :english)
+      def title?
+        true
+      end
+
+      def sentence?
+        false
+      end
+    end
+
+    Paragraph = Struct.new(:id,
+                           :sentences)
+
+    class Sentence < Struct.new(:id,
+                                :section,
+                                :paragraph,
+                                :japanese,
+                                :english)
+      def title?
+        false
+      end
+
+      def sentence?
+        true
+      end
+    end
+
+    Entry = Struct.new(:japanese,
+                       :english)
+
+    def initialize(type: :article)
+      unless [:article, :lexicon].include?(type)
+        raise ArgumentError, "Please set type :article or :lexicon: #{type.inspect}"
+      end
+
+      super()
+      @type = type
+      @metadata.id = "wikipedia-kyoto-japanese-english"
+      @metadata.name =
+        "The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
+      @metadata.url = "https://alaginrc.nict.go.jp/WikiCorpus/index_E.html"
+      @metadata.licenses = ["CC-BY-SA-3.0"]
+      @metadata.description = <<-DESCRIPTION
+"The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
+aims mainly at supporting research and development relevant to
+high-performance multilingual machine translation, information
+extraction, and other language processing technologies. The National
+Institute of Information and Communications Technology (NICT) has
+created this corpus by manually translating Japanese Wikipedia
+articles (related to Kyoto) into English.
+      DESCRIPTION
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = download_tar_gz
+
+      open_tar_gz(data_path) do |tar|
+        tar.each do |entry|
+          next unless entry.file?
+          base_name = File.basename(entry.full_name)
+          case @type
+          when :article
+            next unless base_name.end_with?(".xml")
+            listener = ArticleListener.new(block)
+            parser = REXML::Parsers::StreamParser.new(entry.read, listener)
+            parser.parse
+          when :lexicon
+            next unless base_name == "kyoto_lexicon.csv"
+            is_header = true
+            CSV.parse(entry.read.force_encoding("UTF-8")) do |row|
+              if is_header
+                is_header = false
+                next
+              end
+              yield(Entry.new(row[0], row[1]))
+            end
+          end
+        end
+      end
+    end
+
+    private
+    def download_tar_gz
+      base_name = "wiki_corpus_2.01.tar.gz"
+      data_path = cache_dir_path + base_name
+      data_url = "https://alaginrc.nict.go.jp/WikiCorpus/src/#{base_name}"
+      download(data_path, data_url)
+      data_path
+    end
+
+    class ArticleListener
+      include REXML::StreamListener
+
+      def initialize(block)
+        @block = block
+        @article = nil
+        @title = nil
+        @section = nil
+        @page = nil
+        @sentence = nil
+        @text_container_stack = []
+        @element_stack = []
+        @text_stack = [""]
+      end
+
+      def tag_start(name, attributes)
+        push_stacks(name, attributes)
+        case name
+        when "art"
+          @article = Article.new
+          @article.contents = []
+          @article.sections = []
+        when "tit"
+          @title = Title.new
+          @title.section = @section
+          @text_container_stack.push(@title)
+        when "sec"
+          @section = Section.new
+          @section.id = attributes["id"]
+          @section.contents = []
+          @text_container_stack.push(@section)
+        when "par"
+          @paragraph = Paragraph.new
+          @paragraph.id = attributes["id"]
+          @paragraph.sentences = []
+          @text_container_stack.push(@paragraph)
+        when "sen"
+          @sentence = Sentence.new
+          @sentence.id = attributes["id"]
+          @text_container_stack.push(@sentence)
+        end
+      end
+
+      def tag_end(name)
+        case name
+        when "art"
+          @block.call(@article)
+          @article = nil
+        when "inf"
+          @article.source = @text_stack.last
+        when "copyright"
+          @article.copyright = @text_stack.last
+        when "tit"
+          @article.contents << @title
+          if @section
+            @section.title = @title
+            @section.contents << @title
+          end
+          @title = nil
+          @text_container_stack.pop
+        when "sec"
+          @article.sections << @section
+          @section = nil
+          @text_container_stack.pop
+        when "par"
+          @paragraph = nil
+          @text_container_stack.pop
+        when "sen"
+          @article.contents << @sentence
+          @sentence.section = @section
+          @section.contents << @sentence if @section
+          @sentence.paragraph = @paragraph
+          @paragraph.sentences << @sentence if @paragraph
+          @sentence = nil
+          @text_container_stack.pop
+        when "j"
+          @text_container_stack.last.japanese = @text_stack.last
+        when "e"
+          attributes = @element_stack.last[:attributes]
+          if attributes["type"] == "check"
+            @text_container_stack.last.english = @text_stack.last
+          end
+        end
+        pop_stacks
+      end
+
+      def text(data)
+        @text_stack.last << data
+      end
+
+      private
+      def push_stacks(name, attributes)
+        @element_stack.push({name: name, attributes: attributes})
+        @text_stack.push("")
+      end
+
+      def pop_stacks
+        @text_stack.pop
+        @element_stack.pop
+      end
+    end
+  end
+end
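
A minimal usage sketch (not from the diff; it uses the constructor and structs defined above). `type: :article` (the default) yields Article structs parsed from the corpus XML, while `type: :lexicon` yields Entry pairs from kyoto_lexicon.csv:

    require "datasets"

    lexicon = Datasets::WikipediaKyotoJapaneseEnglish.new(type: :lexicon)
    lexicon.each do |entry|
      puts "#{entry.japanese}\t#{entry.english}"
    end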
data/lib/datasets/wikipedia.rb CHANGED
@@ -1,6 +1,7 @@
 require "rexml/streamlistener"
 require "rexml/parsers/baseparser"
 require "rexml/parsers/streamparser"
+require "time"
 
 require_relative "dataset"
 
@@ -55,10 +56,8 @@ module Datasets
     def open_data(&block)
       base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
       data_path = cache_dir_path + base_name
-      unless data_path.exist?
-        data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
-        download(data_path, data_url)
-      end
+      data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
+      download(data_path, data_url)
 
       extract_bz2(data_path, &block)
     end
@@ -153,7 +152,7 @@ module Datasets
       @text_stack.last << data
     end
 
-    def cdata(contnet)
+    def cdata(content)
       @text_stack.last << content
     end
 
data/lib/datasets/wine.rb CHANGED
@@ -23,7 +23,8 @@ module Datasets
       super
       @metadata.id = 'wine'
       @metadata.name = 'Wine'
-      @metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
+      @metadata.url = 'https://archive.ics.uci.edu/ml/datasets/wine'
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = -> { read_names }
     end
 
@@ -43,19 +44,15 @@ module Datasets
 
     def read_names
       names_path = cache_dir_path + 'wine.names'
-      unless names_path.exist?
-        names_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
-        download(names_path, names_url)
-      end
+      names_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
+      download(names_path, names_url)
       names_path.read
     end
 
     def open_data
       data_path = cache_dir_path + 'wine.data'
-      unless data_path.exist?
-        data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
-        download(data_path, data_url)
-      end
+      data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
+      download(data_path, data_url)
       CSV.open(data_path, converters: %i[numeric]) do |csv|
         yield(csv)
       end