RubyGems - red-datasets - Versions diffs - 0.1.7 → 0.1.9 - Mend

red-datasets 0.1.7 → 0.1.9

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (40)

checksums.yaml +4 -4
data/README.md +2 -0
data/Rakefile +10 -0
data/doc/text/news.md +36 -0
data/lib/datasets/california-housing.rb +1 -1
data/lib/datasets/dataset.rb +2 -2
data/lib/datasets/downloader.rb +51 -17
data/lib/datasets/fashion-mnist.rb +6 -2
data/lib/datasets/ggplot2-dataset.rb +3 -3
data/lib/datasets/house-of-councillor.rb +169 -0
data/lib/datasets/house-of-representative.rb +107 -0
data/lib/datasets/japanese-date-parser.rb +38 -0
data/lib/datasets/kuzushiji-mnist.rb +6 -2
data/lib/datasets/lazy.rb +2 -0
data/lib/datasets/libsvm-dataset-list.rb +1 -1
data/lib/datasets/mnist.rb +12 -6
data/lib/datasets/nagoya-university-conversation-corpus.rb +2 -2
data/lib/datasets/penguins.rb +28 -5
data/lib/datasets/postal-code-japan.rb +3 -3
data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
data/lib/datasets/version.rb +1 -1
data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
data/lib/datasets/wikipedia.rb +2 -2
data/test/japanese-date-parser-test.rb +27 -0
data/test/test-adult.rb +36 -86
data/test/test-aozora-bunko.rb +5 -5
data/test/test-california-housing.rb +12 -31
data/test/test-cldr-plurals.rb +1 -1
data/test/test-diamonds.rb +13 -33
data/test/test-downloader.rb +1 -1
data/test/test-geolonia.rb +17 -41
data/test/test-house-of-councillor.rb +223 -0
data/test/test-house-of-representative.rb +54 -0
data/test/test-nagoya-university-conversation-corpus.rb +17 -69
data/test/test-postal-code-japan.rb +7 -0
data/test/test-quora-duplicate-question-pair.rb +7 -21
data/test/test-rdataset.rb +24 -22
data/test/test-sudachi-synonym-dictionary.rb +12 -31
data/test/test-wikipedia.rb +5 -5
metadata +12 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0231a4f9da16ad1b2cb562a360468b3450fec684e2bcf0ca500195499e8f7397
-  data.tar.gz: c2938b6d72fea58413a743ccad111fc8c95699d9b417e64941ca1681128a1706
+  metadata.gz: 01ddaa57da3c64de47cfd9eb2ca9ae2ec3cbcb5d35138fdd74f74009f062358f
+  data.tar.gz: 431dba2c0e41bc25a4e2716ed20936ee2022c2b04c49683bb7d0d2e2aaa2f99e
 SHA512:
-  metadata.gz: 2ea402beb78be117e28ca906490526a28a8d1e1430181a89432953d26eb0b0a8ea70d0c582d438c5e6b668b3a48eb60db336ef8174fa4dd1a52e20c19f5b4d9b
-  data.tar.gz: 38b59c46e875ae61ab0794f2f9f474969b7e4e08e06078ca55ce4d1930d9538647e6b0152c1c48e8f0d45318ed2f71801dc99152d0799e06459937fb3d29978d
+  metadata.gz: ab12e9783e4a23b81f9bd1be22c31704f9095026cb27185a6fe985e106320982fb999e1cf2348f6b18b509a7f1a6a5b58d405ece5d541e8ddbe43cd08f252a80
+  data.tar.gz: 157df5fffd3ba8fd021cdef3933c0a9e99a0fa7f173771e2177d1a86e79788c3250b12453f06084e24afcf624ec3283e702e6bd7595f08e2cf2b5a7dd404065c

data/README.md CHANGED Viewed

@@ -29,6 +29,8 @@ You can use datasets easily because you can access each dataset with multiple wa
 * Fuel Economy Dataset
 * Geolonia Japanese Addresses
 * Hepatitis
+* House of Councillors of Japan
+* House of Representatives of Japan
 * Iris Dataset
 * Libsvm
 * MNIST database

data/Rakefile CHANGED Viewed

@@ -13,6 +13,16 @@ end
 helper.install
 spec = helper.gemspec
+release_task = Rake.application["release"]
+# We use Trusted Publishing.
+release_task.prerequisites.delete("build")
+release_task.prerequisites.delete("release:rubygem_push")
+release_task_comment = release_task.comment
+if release_task_comment
+  release_task.clear_comments
+  release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
+end
 task default: :test
 desc "Run tests"

data/doc/text/news.md CHANGED Viewed

@@ -1,5 +1,41 @@
 # News
+## 0.1.9 - 2025-04-08
+### Improvements
+  * `Datasets::Penguins`: Changed to use `POST` for downloading data
+    from EDI.
+## 0.1.8 - 2025-02-07
+### Improvements
+  * Suppressed "literal string will be frozen" warnings.
+    * Patch by Tsutomu Katsube
+  * `Datasets::HouseOfCouncillor`: Added.
+    * [GH-143](https://github.com/red-data-tools/red-datasets/issues/143)
+    * [GH-181](https://github.com/red-data-tools/red-datasets/issues/181)
+    * Patch by Tsutomu Katsube
+  * `Datasets::HouseOfRepresentative`: Added.
+    * [GH-142](https://github.com/red-data-tools/red-datasets/issues/142)
+    * [GH-184](https://github.com/red-data-tools/red-datasets/issues/184)
+    * Patch by Tsutomu Katsube
+### Thanks
+  * Tsutomu Katsube
 ## 0.1.7 - 2023-05-29
 ### Improvements

data/lib/datasets/california-housing.rb CHANGED Viewed

@@ -36,7 +36,7 @@ Available from http://lib.stat.cmu.edu/datasets/.
       file_name = "cadata.txt"
       download(data_path, data_url)
       open_data(data_path, file_name) do |input|
-        data = ""
+        data = +""
         input.each_line do |line|
           next unless line.start_with?(" ")
           data << line.lstrip.gsub(/ +/, ",")

data/lib/datasets/dataset.rb CHANGED Viewed

@@ -33,8 +33,8 @@ module Datasets
       @cache_path ||= CachePath.new(@metadata.id)
     end
-    def download(output_path, url, &block)
-      downloader = Downloader.new(url)
+    def download(output_path, url, *fallback_urls, **options, &block)
+      downloader = Downloader.new(url, *fallback_urls, **options)
       downloader.download(output_path, &block)
     end

data/lib/datasets/downloader.rb CHANGED Viewed

@@ -6,20 +6,17 @@ end
 require "net/http"
 require "pathname"
+require_relative "error"
 module Datasets
   class Downloader
-    class TooManyRedirects < StandardError; end
+    class TooManyRedirects < Error; end
-    def initialize(url)
-      if url.is_a?(URI::Generic)
-        url = url.dup
-      else
-        url = URI.parse(url)
-      end
-      @url = url
-      unless @url.is_a?(URI::HTTP)
-        raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
-      end
+    def initialize(url, *fallback_urls, http_method: nil, http_parameters: nil)
+      @url = normalize_url(url)
+      @fallback_urls = fallback_urls.collect { |fallback_url| normalize_url(fallback_url) }
+      @http_method = http_method
+      @http_parameters = http_parameters
     end
     def download(output_path, &block)
@@ -45,7 +42,7 @@ module Datasets
             headers["Range"] = "bytes=#{start}-"
           end
-          start_http(@url, headers) do |response|
+          start_http(@url, @fallback_urls, headers) do |response|
             if response.is_a?(Net::HTTPPartialContent)
               mode = "ab"
             else
@@ -85,6 +82,18 @@ module Datasets
       end
     end
+    private def normalize_url(url)
+      if url.is_a?(URI::Generic)
+        url = url.dup
+      else
+        url = URI.parse(url)
+      end
+      unless url.is_a?(URI::HTTP)
+        raise ArgumentError, "download URL must be HTTP or HTTPS: <#{url}>"
+      end
+      url
+    end
     private def synchronize(output_path, partial_output_path)
       begin
         Process.getpgid(Process.pid)
@@ -104,7 +113,8 @@ module Datasets
           rescue ArgumentError
             # The process that acquired the lock will be exited before
             # it stores its process ID.
-            valid_lock_path = (lock_path.mtime > 10)
+            elapsed_time = Time.now - lock_path.mtime
+            valid_lock_path = (elapsed_time > 10)
           else
             begin
               Process.getpgid(pid)
@@ -133,7 +143,7 @@ module Datasets
       end
     end
-    private def start_http(url, headers, limit = 10, &block)
+    private def start_http(url, fallback_urls, headers, limit = 10, &block)
       if limit == 0
         raise TooManyRedirects, "too many redirections: #{url}"
       end
@@ -143,7 +153,21 @@ module Datasets
       http.start do
         path = url.path
         path += "?#{url.query}" if url.query
-        request = Net::HTTP::Get.new(path, headers)
+        if @http_method == :post
+          # TODO: We may want to add @http_content_type, @http_body
+          # and so on.
+          if @http_parameters
+            body = URI.encode_www_form(@http_parameters)
+            content_type = "application/x-www-form-urlencoded"
+            headers = {"Content-Type" => content_type}.merge(headers)
+          else
+            body = ""
+          end
+          request = Net::HTTP::Post.new(path, headers)
+          request.body = body
+        else
+          request = Net::HTTP::Get.new(path, headers)
+        end
         http.request(request) do |response|
           case response
           when Net::HTTPSuccess, Net::HTTPPartialContent
@@ -151,8 +175,18 @@ module Datasets
           when Net::HTTPRedirection
             url = URI.parse(response[:location])
             $stderr.puts "Redirect to #{url}"
-            return start_http(url, headers, limit - 1, &block)
+            return start_http(url, fallback_urls, headers, limit - 1, &block)
           else
+            if response.is_a?(Net::HTTPForbidden)
+              next_url, *rest_fallback_urls = fallback_urls
+              if next_url
+                message = "#{response.code}: #{response.message}: " +
+                          "fallback: <#{url}> -> <#{next_url}>"
+                $stderr.puts(message)
+                return start_http(next_url, rest_fallback_urls, headers, &block)
+              end
+            end
             message = response.code
             if response.message and not response.message.empty?
               message += ": #{response.message}"
@@ -167,7 +201,7 @@ module Datasets
     private def yield_chunks(path)
       path.open("rb") do |output|
         chunk_size = 1024 * 1024
-        chunk = ""
+        chunk = +""
         while output.read(chunk_size, chunk)
           yield(chunk)
         end

data/lib/datasets/fashion-mnist.rb CHANGED Viewed

@@ -2,9 +2,13 @@ require_relative 'mnist'
 module Datasets
   class FashionMNIST < MNIST
-    BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
     private
+    def base_urls
+      [
+        "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
+      ]
+    end
     def dataset_name
       "Fashion-MNIST"
     end

data/lib/datasets/ggplot2-dataset.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module Datasets
       data_path = cache_dir_path + data_base_name
       data_url = "#{download_base_url}/data-raw/#{data_base_name}"
       download(data_path, data_url)
-      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
+      CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv|
         record_class = self.class::Record
         csv.each do |row|
           record = record_class.new(*row.fields)
@@ -37,7 +37,7 @@ module Datasets
       data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
       download(data_r_path, data_r_url)
       descriptions = {}
-      comment = ""
+      comment = +""
       File.open(data_r_path) do |data_r|
         data_r.each_line do |line|
           case line.chomp
@@ -51,7 +51,7 @@ module Datasets
           when /\A"(.+)"\z/
             name = Regexp.last_match[1]
             descriptions[name] = parse_roxygen(comment.rstrip)
-            comment = ""
+            comment = +""
           end
         end
         descriptions[@ggplot2_dataset_name]

data/lib/datasets/house-of-councillor.rb ADDED Viewed

@@ -0,0 +1,169 @@
+require_relative "dataset"
+module Datasets
+  class HouseOfCouncillor < Dataset
+    Bill = Struct.new(:council_time,
+                      :bill_type,
+                      :submit_time,
+                      :submit_number,
+                      :title,
+                      :bill_url,
+                      :bill_summary_url,
+                      :proposed_bill_url,
+                      :proposed_on,
+                      :proposed_on_from_house_of_representatives,
+                      :proposed_on_to_house_of_representatives,
+                      :prior_deliberations_type,
+                      :continuation_type,
+                      :proposers,
+                      :submitter,
+                      :submitter_type,
+                      :progress_of_house_of_councillors_committees_etc_refer_on,
+                      :progress_of_house_of_councillors_committees_etc_committee_etc,
+                      :progress_of_house_of_councillors_committees_etc_pass_on,
+                      :progress_of_house_of_councillors_committees_etc_result,
+                      :progress_of_house_of_councillors_plenary_sitting_pass_on,
+                      :progress_of_house_of_councillors_plenary_sitting_result,
+                      :progress_of_house_of_councillors_plenary_sitting_committees,
+                      :progress_of_house_of_councillors_plenary_sitting_vote_type,
+                      :progress_of_house_of_councillors_plenary_sitting_vote_method,
+                      :progress_of_house_of_councillors_plenary_sitting_result_url,
+                      :progress_of_house_of_representatives_committees_etc_refer_on,
+                      :progress_of_house_of_representatives_committees_etc_committee_etc,
+                      :progress_of_house_of_representatives_committees_etc_pass_on,
+                      :progress_of_house_of_representatives_committees_etc_result,
+                      :progress_of_house_of_representatives_plenary_sitting_pass_on,
+                      :progress_of_house_of_representatives_plenary_sitting_result,
+                      :progress_of_house_of_representatives_plenary_sitting_committees,
+                      :progress_of_house_of_representatives_plenary_sitting_vote_type,
+                      :progress_of_house_of_representatives_plenary_sitting_vote_method,
+                      :promulgated_on,
+                      :law_number,
+                      :entracted_law_url,
+                      :notes)
+    InHouseGroup = Struct.new(:in_house_group_name_and_abbreviation_on,
+                              :in_house_group_name,
+                              :in_house_group_abbreviation,
+                              :number_of_members_on,
+                              :number_of_members,
+                              :number_of_women_members,
+                              :first_term_expires_on,
+                              :first_term_proportional_representation_number_of_members,
+                              :first_term_proportional_representation_number_of_women_members,
+                              :first_term_election_district_number_of_members,
+                              :first_term_election_district_number_of_women_members,
+                              :first_term_total_number_of_members,
+                              :first_term_total_number_of_women_members,
+                              :second_term_expires_on,
+                              :second_term_proportional_representation_number_of_members,
+                              :second_term_proportional_representation_number_of_women_members,
+                              :second_term_election_district_number_of_members,
+                              :second_term_election_district_number_of_women_members,
+                              :second_term_total_number_of_members,
+                              :second_term_total_number_of_women_members)
+    Member = Struct.new(:professional_name,
+                        :true_name,
+                        :profile_url,
+                        :professional_name_reading,
+                        :in_house_group_abbreviation,
+                        :constituency,
+                        :expiration_of_term,
+                        :photo_url,
+                        :elected_years,
+                        :elected_number,
+                        :responsibilities,
+                        :responsibility_on,
+                        :career,
+                        :career_on)
+    Question = Struct.new(:submit_time,
+                          :submit_number,
+                          :title,
+                          :submitter,
+                          :number_of_submissions,
+                          :question_for_text_html_url,
+                          :answer_for_text_html_url,
+                          :question_for_text_pdf_url,
+                          :answer_for_text_pdf_url,
+                          :question_url,
+                          :submitted_on,
+                          :transfered_on,
+                          :received_answer_on,
+                          :notes)
+    VALID_TYPES = [
+      :bill,
+      :in_house_group,
+      :member,
+      :question
+    ]
+    def initialize(type: :bill)
+      super()
+      @type = type
+      unless VALID_TYPES.include?(type)
+        message = +":type must be one of ["
+        message << VALID_TYPES.collect(&:inspect).join(", ")
+        message << "]: #{@type.inspect}"
+        raise ArgumentError, message
+      end
+      @metadata.id = "house-of-councillor"
+      @metadata.name = "Bill, in-House group, member and question of the House of Councillors of Japan"
+      @metadata.url = "https://smartnews-smri.github.io/house-of-councillors"
+      @metadata.licenses = ["MIT"]
+      @metadata.description = "The House of Councillors of Japan (type: #{@type})"
+    end
+    def each
+      return to_enum(__method__) unless block_given?
+      open_data do |csv|
+        csv.each do |row|
+          case @type
+          when :bill
+            record = Bill.new(*row.fields)
+          when :in_house_group
+            record = InHouseGroup.new(*row.fields)
+          when :member
+            %w(当選年).each do |ints_column_name|
+              row[ints_column_name] = parse_ints(row[ints_column_name])
+            end
+            record = Member.new(*row.fields)
+          when :question
+            record = Question.new(*row.fields)
+          end
+          yield(record)
+        end
+      end
+    end
+    private
+    def open_data
+      data_url = +"https://smartnews-smri.github.io/house-of-councillors/data"
+      case @type
+      when :bill
+        data_url << "/gian.csv"
+      when :in_house_group
+        data_url << "/kaiha.csv"
+      when :member
+        data_url << "/giin.csv"
+      when :question
+        data_url << "/syuisyo.csv"
+      end
+      data_path = cache_dir_path + "#{@type}.csv"
+      download(data_path, data_url)
+      CSV.open(data_path, col_sep: ",", headers: true, converters: %i(date integer)) do |csv|
+        yield(csv)
+      end
+    end
+    def parse_ints(column_value)
+      column_value.to_s.split("、").collect(&:to_i)
+    end
+  end
+end

data/lib/datasets/house-of-representative.rb ADDED Viewed

@@ -0,0 +1,107 @@
+require_relative "dataset"
+require_relative "japanese-date-parser"
+module Datasets
+  class HouseOfRepresentative < Dataset
+    Record = Struct.new(:carry_time,
+                        :caption,
+                        :type,
+                        :submit_time,
+                        :submit_number,
+                        :title,
+                        :discussion_status,
+                        :progress,
+                        :progress_url,
+                        :text,
+                        :text_url,
+                        :bill_type,
+                        :submitter,
+                        :submitter_in_house_groups,
+                        :house_of_representatives_of_accepted_bill_on_preliminary_consideration,
+                        :house_of_representatives_of_preliminary_refer_on,
+                        :house_of_representatives_of_preliminary_refer_commission,
+                        :house_of_representatives_of_accepted_bill_on,
+                        :house_of_representatives_of_refer_on,
+                        :house_of_representatives_of_refer_commission,
+                        :house_of_representatives_of_finished_consideration_on,
+                        :house_of_representatives_of_consideration_result,
+                        :house_of_representatives_of_finished_deliberation_on,
+                        :house_of_representatives_of_deliberation_result,
+                        :house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
+                        :house_of_representatives_of_support_in_house_group_during_deliberation,
+                        :house_of_representatives_of_opposition_in_house_group_during_deliberation,
+                        :house_of_councillors_of_accepted_bill_on_preliminary_consideration,
+                        :house_of_councillors_of_preliminary_refer_on,
+                        :house_of_councillors_of_preliminary_refer_commission,
+                        :house_of_councillors_of_accepted_bill_on,
+                        :house_of_councillors_of_refer_on,
+                        :house_of_councillors_of_refer_commission,
+                        :house_of_councillors_of_finished_consideration_on,
+                        :house_of_councillors_of_consideration_result,
+                        :house_of_councillors_of_finished_deliberation_on,
+                        :house_of_councillors_of_deliberation_result,
+                        :promulgated_on,
+                        :law_number,
+                        :submitters,
+                        :supporters_of_submitted_bill)
+    def initialize
+      super()
+      @metadata.id = "house-of-representative"
+      @metadata.name = "Bill of the House of Representatives of Japan"
+      @metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
+      @metadata.licenses = ["MIT"]
+      @metadata.description = "Bill of the House of Representatives of Japan"
+    end
+    def each
+      return to_enum(__method__) unless block_given?
+      open_data do |csv|
+        csv.each do |row|
+          record = Record.new(*row.fields)
+          yield(record)
+        end
+      end
+    end
+    private
+    def open_data
+      data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
+      data_path = cache_dir_path + "gian.csv"
+      download(data_path, data_url)
+      parser = JapaneseDateParser.new
+      japanese_date_converter = lambda do |field, info|
+        if info.header.end_with?("年月日")
+          parser.parse(field)
+        else
+          field
+        end
+      end
+      array_converter = lambda do |field, info|
+        case info.header
+        when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者"
+          parse_array(field)
+        else
+          field
+        end
+      end
+      File.open(data_path) do |data_file|
+        options = {
+          col_sep: ",",
+          headers: true,
+          converters: [:integer, japanese_date_converter, array_converter],
+        }
+        # There are two columns within one column. To split into two columns, `#gsub` is necessary.
+        yield(CSV.new(data_file.read.gsub("／", ","), **options))
+      end
+    end
+    def parse_array(column_value)
+      column_value&.split("; ")
+    end
+  end
+end

data/lib/datasets/japanese-date-parser.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module Datasets
+  class JapaneseDateParser
+    class UnsupportedEraInitialRange < Error; end
+    ERA_INITIALS = {
+      "平成" => "H",
+      "令和" => "R",
+    }.freeze
+    def parse(string)
+      case string
+      when nil
+        nil
+      when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/
+        match_data = Regexp.last_match
+        era_initial = ERA_INITIALS[match_data[1]]
+        if era_initial.nil?
+          message = +"era must be one of ["
+          message << ERA_INITIALS.keys.join(", ")
+          message << "]: #{match_data[1]}"
+          raise UnsupportedEraInitialRange, message
+        end
+        year = match_data[2]
+        if year == "元"
+          year = "01"
+        else
+          year = year.rjust(2, "0")
+        end
+        month = match_data[3].rjust(2, "0")
+        day = match_data[4].rjust(2, "0")
+        Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}")
+      else
+        string
+      end
+    end
+  end
+end

data/lib/datasets/kuzushiji-mnist.rb CHANGED Viewed

@@ -2,9 +2,13 @@ require_relative 'mnist'
 module Datasets
   class KuzushijiMNIST < MNIST
-    BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
     private
+    def base_urls
+      [
+        "http://codh.rois.ac.jp/kmnist/dataset/kmnist/",
+      ]
+    end
     def dataset_name
       "Kuzushiji-MNIST"
     end

data/lib/datasets/lazy.rb CHANGED Viewed

@@ -57,6 +57,8 @@ module Datasets
   LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
   LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
   LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+  LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
+  LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
   LAZY_LOADER.register(:Iris, "datasets/iris")
   LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
   LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")

data/lib/datasets/libsvm-dataset-list.rb CHANGED Viewed

@@ -110,7 +110,7 @@ module Datasets
           @row = []
         when "td"
           @in_td = true
-          @row << {:text => ""}
+          @row << {:text => +""}
         when "a"
           @row.last[:href] = attributes["href"] if @in_td
         end

data/lib/datasets/mnist.rb CHANGED Viewed

@@ -4,8 +4,6 @@ require_relative "dataset"
 module Datasets
   class MNIST < Dataset
-    BASE_URL = "http://yann.lecun.com/exdb/mnist/"
     class Record < Struct.new(:data, :label)
       def pixels
         data.unpack("C*")
@@ -27,7 +25,7 @@ module Datasets
       @metadata.id = "#{dataset_name.downcase}-#{type}"
       @metadata.name = "#{dataset_name}: #{type}"
-      @metadata.url = self.class::BASE_URL
+      @metadata.url = base_urls.first
       @metadata.licenses = licenses
       @type = type
@@ -44,15 +42,23 @@ module Datasets
       image_path = cache_dir_path + target_file(:image)
       label_path = cache_dir_path + target_file(:label)
-      base_url = self.class::BASE_URL
-      download(image_path, base_url + target_file(:image))
-      download(label_path, base_url + target_file(:label))
+      download(image_path,
+               *base_urls.collect { |base_url| base_url + target_file(:image) })
+      download(label_path,
+               *base_urls.collect { |base_url| base_url + target_file(:label) })
       open_data(image_path, label_path, &block)
     end
     private
+    def base_urls
+      [
+        "http://yann.lecun.com/exdb/mnist/",
+        "https://ossci-datasets.s3.amazonaws.com/mnist/",
+      ]
+    end
     def licenses
       []
     end