red-datasets 0.1.4 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -3
- data/Rakefile +56 -1
- data/doc/text/news.md +102 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +58 -23
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +110 -30
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +6 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +16 -8
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +48 -0
- data/lib/datasets.rb +2 -22
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +65 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- data/test/test-wikipedia.rb +25 -71
- metadata +62 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
|
4
|
+
data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
|
7
|
+
data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
|
data/README.md
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
# Red Datasets
|
2
2
|
|
3
|
-
[![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
|
4
3
|
[![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
|
5
4
|
|
6
5
|
## Description
|
@@ -17,15 +16,30 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
17
16
|
|
18
17
|
## Available datasets
|
19
18
|
|
20
|
-
TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
|
21
|
-
|
22
19
|
* Adult Dataset
|
20
|
+
* Aozora Bunko
|
21
|
+
* California Housing
|
23
22
|
* CIFAR-10 Dataset
|
24
23
|
* CIFAR-100 Dataset
|
24
|
+
* CLDR language plural rules
|
25
|
+
* Communities and crime
|
26
|
+
* Diamonds Dataset
|
27
|
+
* E-Stat Japan
|
25
28
|
* Fashion-MNIST
|
29
|
+
* Fuel Economy Dataset
|
30
|
+
* Geolonia Japanese Addresses
|
31
|
+
* Hepatitis
|
26
32
|
* Iris Dataset
|
33
|
+
* Libsvm
|
27
34
|
* MNIST database
|
35
|
+
* Mushroom
|
36
|
+
* Penguins
|
28
37
|
* The Penn Treebank Project
|
38
|
+
* PMJT - Pre-Modern Japanese Text dataset list
|
39
|
+
* Postal Codes in Japan
|
40
|
+
* Rdatasets
|
41
|
+
* Seaborn
|
42
|
+
* Sudachi Synonym Dictionary
|
29
43
|
* Wikipedia
|
30
44
|
* Wine Dataset
|
31
45
|
|
@@ -135,6 +149,12 @@ end
|
|
135
149
|
|
136
150
|
* [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
|
137
151
|
|
152
|
+
## How to develop Red Datasets
|
153
|
+
1. Fork https://github.com/red-data-tools/red-datasets
|
154
|
+
2. Create a feature branch from master
|
155
|
+
3. Develop in the feature branch
|
156
|
+
4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
|
157
|
+
|
138
158
|
## License
|
139
159
|
|
140
160
|
The MIT license. See `LICENSE.txt` for details.
|
data/Rakefile
CHANGED
@@ -13,9 +13,64 @@ end
|
|
13
13
|
helper.install
|
14
14
|
spec = helper.gemspec
|
15
15
|
|
16
|
+
task default: :test
|
17
|
+
|
16
18
|
desc "Run tests"
|
17
19
|
task :test do
|
18
20
|
ruby("test/run-test.rb")
|
19
21
|
end
|
20
22
|
|
21
|
-
|
23
|
+
desc "Generate an artifact for GitHub Pages"
|
24
|
+
task :pages do
|
25
|
+
pages_dir = "_site"
|
26
|
+
rm_rf(pages_dir)
|
27
|
+
mkdir_p(pages_dir)
|
28
|
+
|
29
|
+
require "cgi/util"
|
30
|
+
require_relative "lib/datasets/lazy"
|
31
|
+
File.open("#{pages_dir}/index.html", "w") do |index_html|
|
32
|
+
index_html.puts(<<-HTML)
|
33
|
+
<!DOCTYPE html>
|
34
|
+
<html>
|
35
|
+
<head>
|
36
|
+
<meta charset="UTF-8">
|
37
|
+
<title>Red Datasets</title>
|
38
|
+
<style>
|
39
|
+
table {
|
40
|
+
margin-left: 20vw;
|
41
|
+
min-width: 50%;
|
42
|
+
}
|
43
|
+
th {
|
44
|
+
font-size: 30px;
|
45
|
+
padding: 20px;
|
46
|
+
}
|
47
|
+
td {
|
48
|
+
border-bottom: 1px solid #D9DCE0;
|
49
|
+
padding: 20px;
|
50
|
+
font-weight: bold;
|
51
|
+
}
|
52
|
+
</style>
|
53
|
+
</head>
|
54
|
+
<body>
|
55
|
+
<section>
|
56
|
+
<h1>Red Datasets</h1>
|
57
|
+
<table>
|
58
|
+
<thead>
|
59
|
+
<tr><th>Available datasets</th></tr>
|
60
|
+
</thead>
|
61
|
+
<tbody>
|
62
|
+
HTML
|
63
|
+
Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
|
64
|
+
index_html.puts(<<-HTML)
|
65
|
+
<tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
|
66
|
+
HTML
|
67
|
+
end
|
68
|
+
index_html.puts(<<-HTML)
|
69
|
+
</tbody>
|
70
|
+
</table>
|
71
|
+
</section>
|
72
|
+
</body>
|
73
|
+
</html>
|
74
|
+
HTML
|
75
|
+
end
|
76
|
+
end
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,107 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.6 - 2023-05-24
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Added support for lazy loading by `require "datasets/lazy"`.
|
8
|
+
|
9
|
+
* `Datasets::NagoyaUniversityConversationCorpus`: Added.
|
10
|
+
[GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
|
11
|
+
[Patch by matsuura]
|
12
|
+
|
13
|
+
* `Datasets::Wikipedia`: Added support for downloading in background.
|
14
|
+
|
15
|
+
### Thanks
|
16
|
+
|
17
|
+
* matsuura
|
18
|
+
|
19
|
+
## 0.1.5 - 2022-09-22
|
20
|
+
|
21
|
+
### Improvements
|
22
|
+
|
23
|
+
* `Datasets::PMJTDatasetList`: Added.
|
24
|
+
[GitHub#107][Patch by okadak]
|
25
|
+
|
26
|
+
* `Datasets::AozoraBunko`: Added.
|
27
|
+
[GitHub#108][Patch by Masa]
|
28
|
+
|
29
|
+
* Added how to develop to README
|
30
|
+
[GitHub#117][Patch by abcdefg-1234567]
|
31
|
+
|
32
|
+
* `Datasets::FuelEconomy`: Added.
|
33
|
+
[GitHub#114][Patch by Benson Muite]
|
34
|
+
|
35
|
+
* `Datasets::Geolonia`: Added.
|
36
|
+
[GitHub#118][Patch by abcdefg-1234567]
|
37
|
+
|
38
|
+
* `Datasets::Diamonds`: Added.
|
39
|
+
[GitHub#110][Patch by Benson Muite]
|
40
|
+
|
41
|
+
* `Datasets::ITACorpus`: Added.
|
42
|
+
[GitHub#119][Patch by abcdefg-1234567]
|
43
|
+
|
44
|
+
* `Datasets::KuzushijiMNIST`: Added.
|
45
|
+
[GitHub#125][Patch by abcdefg-1234567]
|
46
|
+
|
47
|
+
* Updated list of datasets in README.
|
48
|
+
[GitHub#129][Patch by Benson Muite]
|
49
|
+
|
50
|
+
* `Datasets::CaliforniaHousing`: Added.
|
51
|
+
[GitHub#123][Patch by Benson Muite]
|
52
|
+
|
53
|
+
* Added support for Ruby 3.1.
|
54
|
+
[GitHub#130][Patch by Benson Muite]
|
55
|
+
|
56
|
+
* `Datasets::AFINN`: Added.
|
57
|
+
[GitHub#120][Patch by Benson Muite]
|
58
|
+
|
59
|
+
* `Datasets::LivedoorNews`: Added.
|
60
|
+
[GitHub#127][Patch by abcdefg-1234567]
|
61
|
+
|
62
|
+
* `Datasets::SeabornDataList`: Added.
|
63
|
+
[GitHub#134][Patch by Hirokazu SUZUKI]
|
64
|
+
|
65
|
+
* `Datasets::WikipediaKyotoJapaneseEnglish`: Added.
|
66
|
+
[GitHub#135][Patch by abcdefg-1234567]
|
67
|
+
|
68
|
+
* Renamed Rdatasets to Rdataset.
|
69
|
+
[GitHub#148][Patch by Hirokazu SUZUKI]
|
70
|
+
|
71
|
+
* Removed support for Ruby 2.6.
|
72
|
+
|
73
|
+
* Add missing license information.
|
74
|
+
|
75
|
+
* `Datasets::QuoraDuplicateQuestionPair`: Added.
|
76
|
+
[GitHub#149][Patch by otegami]
|
77
|
+
|
78
|
+
### Fixes
|
79
|
+
|
80
|
+
* Fixed key from nil to :index in `Datasets::SeabornData`.
|
81
|
+
[GitHub#133][Patch by Hirokazu SUZUKI]
|
82
|
+
|
83
|
+
* Fixed `Datasets::Rdatasets#each` to change "NA" to nil.
|
84
|
+
[GitHub#139][Patch by Hirokazu SUZUKI]
|
85
|
+
|
86
|
+
* Fix `Datasets::Rdatasets#each` with mixed data of numeric and string.
|
87
|
+
[GitHub#140][Patch by Hirokazu SUZUKI]
|
88
|
+
|
89
|
+
### Thanks
|
90
|
+
|
91
|
+
* okadak
|
92
|
+
|
93
|
+
* Masa
|
94
|
+
|
95
|
+
* Benson Muite
|
96
|
+
|
97
|
+
* abcdefg-1234567
|
98
|
+
|
99
|
+
* Hirokazu SUZUKI
|
100
|
+
|
101
|
+
* Sutou Kouhei
|
102
|
+
|
103
|
+
* otegami
|
104
|
+
|
3
105
|
## 0.1.4 - 2021-07-13
|
4
106
|
|
5
107
|
### Improvements
|
data/lib/datasets/adult.rb
CHANGED
@@ -31,7 +31,8 @@ module Datasets
|
|
31
31
|
@type = type
|
32
32
|
@metadata.id = "adult-#{@type}"
|
33
33
|
@metadata.name = "Adult: #{@type}"
|
34
|
-
@metadata.url = "
|
34
|
+
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
|
35
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
35
36
|
@metadata.description = lambda do
|
36
37
|
read_names
|
37
38
|
end
|
@@ -58,10 +59,8 @@ module Datasets
|
|
58
59
|
ext = "test"
|
59
60
|
end
|
60
61
|
data_path = cache_dir_path + "adult-#{ext}.csv"
|
61
|
-
|
62
|
-
|
63
|
-
download(data_path, data_url)
|
64
|
-
end
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
|
63
|
+
download(data_path, data_url)
|
65
64
|
|
66
65
|
options = {
|
67
66
|
converters: [:numeric, lambda {|f| f.strip}],
|
@@ -74,10 +73,8 @@ module Datasets
|
|
74
73
|
|
75
74
|
def read_names
|
76
75
|
names_path = cache_dir_path + "adult.names"
|
77
|
-
|
78
|
-
|
79
|
-
download(names_path, names_url)
|
80
|
-
end
|
76
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
|
77
|
+
download(names_path, names_url)
|
81
78
|
names_path.read
|
82
79
|
end
|
83
80
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require "csv"
|
2
|
+
require_relative "zip-extractor"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class AFINN < Dataset
|
6
|
+
Record = Struct.new(:word,
|
7
|
+
:valence)
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
super()
|
11
|
+
@metadata.id = "afinn"
|
12
|
+
@metadata.name = "AFINN"
|
13
|
+
@metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
|
14
|
+
@metadata.licenses = ["ODbL-1.0"]
|
15
|
+
@metadata.description = lambda do
|
16
|
+
extract_file("AFINN/AFINN-README.txt") do |input|
|
17
|
+
readme = input.read
|
18
|
+
readme.force_encoding("UTF-8")
|
19
|
+
readme.
|
20
|
+
gsub(/^AFINN-96:.*?\n\n/m, "").
|
21
|
+
gsub(/^In Python.*$/m, "").
|
22
|
+
strip
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def each
|
28
|
+
return to_enum(__method__) unless block_given?
|
29
|
+
|
30
|
+
extract_file("AFINN/AFINN-111.txt") do |input|
|
31
|
+
csv = CSV.new(input, col_sep: "\t", converters: :numeric)
|
32
|
+
csv.each do |row|
|
33
|
+
yield(Record.new(*row))
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
def extract_file(file_path, &block)
|
40
|
+
data_path = cache_dir_path + "imm6010.zip"
|
41
|
+
data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
|
42
|
+
download(data_path, data_url)
|
43
|
+
|
44
|
+
extractor = ZipExtractor.new(data_path)
|
45
|
+
extractor.extract_file(file_path, &block)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
# Dataset for AozoraBunko
|
6
|
+
class AozoraBunko < Dataset
|
7
|
+
Book = Struct.new(
|
8
|
+
# 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
|
9
|
+
:title_id,
|
10
|
+
:title,
|
11
|
+
:title_reading,
|
12
|
+
:title_reading_collation,
|
13
|
+
:subtitle,
|
14
|
+
:subtitle_reading,
|
15
|
+
:original_title,
|
16
|
+
:first_appearance,
|
17
|
+
:ndc_code, # 分類番号(日本十進分類法の番号)
|
18
|
+
:syllabary_spelling_type,
|
19
|
+
:copyrighted,
|
20
|
+
:published_date,
|
21
|
+
:last_updated_date,
|
22
|
+
:detail_url,
|
23
|
+
# 人物ID, 姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
|
24
|
+
:person_id,
|
25
|
+
:person_family_name,
|
26
|
+
:person_first_name,
|
27
|
+
:person_family_name_reading,
|
28
|
+
:person_first_name_reading,
|
29
|
+
:person_family_name_reading_collation,
|
30
|
+
:person_first_name_reading_collation,
|
31
|
+
:person_family_name_romaji,
|
32
|
+
:person_first_name_romaji,
|
33
|
+
:person_type,
|
34
|
+
:person_birthday,
|
35
|
+
:person_date_of_death,
|
36
|
+
:person_copyrighted,
|
37
|
+
# 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
|
38
|
+
:original_book_name1,
|
39
|
+
:original_book_publisher_name1,
|
40
|
+
:original_book_first_published_date1,
|
41
|
+
:used_version_for_registration1,
|
42
|
+
:used_version_for_proofreading1,
|
43
|
+
:base_of_original_book_name1,
|
44
|
+
:base_of_original_book_publisher_name1,
|
45
|
+
:base_of_original_book_first_published_date1,
|
46
|
+
# 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
|
47
|
+
:original_book_name2,
|
48
|
+
:original_book_publisher_name2,
|
49
|
+
:original_book_first_published_date2,
|
50
|
+
:used_version_for_registration2,
|
51
|
+
:used_version_for_proofreading2,
|
52
|
+
:base_of_original_book_name2,
|
53
|
+
:base_of_original_book_publisher_name2,
|
54
|
+
:base_of_original_book_first_published_date2,
|
55
|
+
# 入力者,校正者,
|
56
|
+
:registered_person_name,
|
57
|
+
:proofreader_name,
|
58
|
+
# テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
|
59
|
+
:text_file_url,
|
60
|
+
:last_text_file_updated_date,
|
61
|
+
:text_file_character_encoding,
|
62
|
+
:text_file_character_set,
|
63
|
+
:text_file_updating_count,
|
64
|
+
# XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
|
65
|
+
:html_file_url,
|
66
|
+
:last_html_file_updated_date,
|
67
|
+
:html_file_character_encoding,
|
68
|
+
:html_file_character_set,
|
69
|
+
:html_file_updating_count
|
70
|
+
)
|
71
|
+
|
72
|
+
class Book
|
73
|
+
attr_writer :cache_path
|
74
|
+
|
75
|
+
def initialize(*args)
|
76
|
+
super
|
77
|
+
@text = nil
|
78
|
+
@html = nil
|
79
|
+
@cache_path = nil
|
80
|
+
end
|
81
|
+
|
82
|
+
alias_method :copyrighted?, :copyrighted
|
83
|
+
alias_method :person_copyrighted?, :person_copyrighted
|
84
|
+
|
85
|
+
def text
|
86
|
+
return @text unless @text.nil?
|
87
|
+
return @text if text_file_url.nil? || text_file_url.empty?
|
88
|
+
|
89
|
+
# when the URL is not a zip file, the web page needs to be opened in a browser to download it manually
|
90
|
+
# e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
|
91
|
+
return @text unless text_file_url.end_with?('.zip')
|
92
|
+
|
93
|
+
downloader = Downloader.new(text_file_url)
|
94
|
+
downloader.download(text_file_output_path)
|
95
|
+
|
96
|
+
@text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
|
97
|
+
input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
|
98
|
+
end
|
99
|
+
|
100
|
+
@text
|
101
|
+
end
|
102
|
+
|
103
|
+
def html
|
104
|
+
return @html unless @html.nil?
|
105
|
+
return @html if html_file_url.nil? || html_file_url.empty?
|
106
|
+
|
107
|
+
downloader = Downloader.new(html_file_url)
|
108
|
+
downloader.download(html_file_output_path)
|
109
|
+
@html = File.read(html_file_output_path).encode(Encoding::UTF_8,
|
110
|
+
normalize_encoding(html_file_character_encoding))
|
111
|
+
|
112
|
+
@html
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def text_file_output_path
|
118
|
+
cache_base_dir + text_file_name
|
119
|
+
end
|
120
|
+
|
121
|
+
def html_file_output_path
|
122
|
+
cache_base_dir + html_file_name
|
123
|
+
end
|
124
|
+
|
125
|
+
def text_file_name
|
126
|
+
text_file_url.split('/').last
|
127
|
+
end
|
128
|
+
|
129
|
+
def html_file_name
|
130
|
+
html_file_url.split('/').last
|
131
|
+
end
|
132
|
+
|
133
|
+
def cache_base_dir
|
134
|
+
@cache_path.base_dir + title_id + person_id
|
135
|
+
end
|
136
|
+
|
137
|
+
def normalize_encoding(encoding)
|
138
|
+
case encoding
|
139
|
+
when 'ShiftJIS'
|
140
|
+
Encoding::Shift_JIS
|
141
|
+
when 'UTF-8'
|
142
|
+
Encoding::UTF_8
|
143
|
+
else
|
144
|
+
encoding
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
def initialize
|
150
|
+
super()
|
151
|
+
|
152
|
+
@metadata.id = 'aozora-bunko'
|
153
|
+
@metadata.name = 'Aozora Bunko'
|
154
|
+
@metadata.url = 'https://www.aozora.gr.jp/'
|
155
|
+
@metadata.licenses = 'CC-BY-2.1-JP'
|
156
|
+
@metadata.description = <<~DESCRIPTION
|
157
|
+
Aozora Bunko is an activity to collect free electronic books that anyone can access
|
158
|
+
on the Internet like a library. The copyrighted works and the works that are said to be
|
159
|
+
"free to read" are available after being digitized in text and XHTML (some HTML) formats.
|
160
|
+
DESCRIPTION
|
161
|
+
end
|
162
|
+
|
163
|
+
def each
|
164
|
+
return to_enum(__method__) unless block_given?
|
165
|
+
|
166
|
+
open_data do |csv_file_stream|
|
167
|
+
text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has Byte Order Mark
|
168
|
+
|
169
|
+
CSV.parse(text, headers: true) do |row|
|
170
|
+
%w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
|
171
|
+
row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
|
172
|
+
end
|
173
|
+
book = Book.new(*row.fields)
|
174
|
+
book.cache_path = cache_path
|
175
|
+
|
176
|
+
yield(book)
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
private
|
182
|
+
|
183
|
+
def open_data(&block)
|
184
|
+
data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
|
185
|
+
data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
|
186
|
+
download(data_path, data_url)
|
187
|
+
ZipExtractor.new(data_path).extract_first_file do |input|
|
188
|
+
block.call(input)
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
def normalize_boolean(column_value)
|
193
|
+
column_value == 'あり'
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Datasets
|
2
|
+
class CachePath
|
3
|
+
def initialize(id)
|
4
|
+
@id = id
|
5
|
+
end
|
6
|
+
|
7
|
+
def base_dir
|
8
|
+
Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
|
9
|
+
end
|
10
|
+
|
11
|
+
def remove
|
12
|
+
FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def system_cache_dir
|
18
|
+
case RUBY_PLATFORM
|
19
|
+
when /mswin/, /mingw/
|
20
|
+
ENV['LOCALAPPDATA'] || '~/AppData/Local'
|
21
|
+
when /darwin/
|
22
|
+
'~/Library/Caches'
|
23
|
+
else
|
24
|
+
ENV['XDG_CACHE_HOME'] || '~/.cache'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require "csv"
|
2
|
+
require_relative 'zip-extractor'
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class CaliforniaHousing < Dataset
|
6
|
+
Record = Struct.new(:median_house_value,
|
7
|
+
:median_income,
|
8
|
+
:housing_median_age,
|
9
|
+
:total_rooms,
|
10
|
+
:total_bedrooms,
|
11
|
+
:population,
|
12
|
+
:households,
|
13
|
+
:latitude,
|
14
|
+
:longitude)
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
super()
|
18
|
+
@metadata.id = "california-housing"
|
19
|
+
@metadata.name = "California Housing"
|
20
|
+
@metadata.url = "http://lib.stat.cmu.edu/datasets/"
|
21
|
+
@metadata.licenses = ["CCO"]
|
22
|
+
@metadata.description = <<-DESCRIPTION
|
23
|
+
Housing information from the 1990 census used in
|
24
|
+
Pace, R. Kelley and Ronald Barry,
|
25
|
+
"Sparse Spatial Autoregressions",
|
26
|
+
Statistics and Probability Letters, 33 (1997) 291-297.
|
27
|
+
Available from http://lib.stat.cmu.edu/datasets/.
|
28
|
+
DESCRIPTION
|
29
|
+
end
|
30
|
+
|
31
|
+
def each
|
32
|
+
return to_enum(__method__) unless block_given?
|
33
|
+
|
34
|
+
data_path = cache_dir_path + "houses.zip"
|
35
|
+
data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
|
36
|
+
file_name = "cadata.txt"
|
37
|
+
download(data_path, data_url)
|
38
|
+
open_data(data_path, file_name) do |input|
|
39
|
+
data = ""
|
40
|
+
input.each_line do |line|
|
41
|
+
next unless line.start_with?(" ")
|
42
|
+
data << line.lstrip.gsub(/ +/, ",")
|
43
|
+
end
|
44
|
+
options = {
|
45
|
+
converters: [:numeric],
|
46
|
+
}
|
47
|
+
CSV.parse(data, **options) do |row|
|
48
|
+
yield(Record.new(*row))
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
def open_data(data_path, file_name)
|
55
|
+
ZipExtractor.new(data_path).extract_first_file do |input|
|
56
|
+
yield input
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
data/lib/datasets/cifar.rb
CHANGED
@@ -50,10 +50,8 @@ module Datasets
|
|
50
50
|
return to_enum(__method__) unless block_given?
|
51
51
|
|
52
52
|
data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
|
53
|
-
|
54
|
-
|
55
|
-
download(data_path, data_url)
|
56
|
-
end
|
53
|
+
data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
|
54
|
+
download(data_path, data_url)
|
57
55
|
|
58
56
|
parse_data(data_path, &block)
|
59
57
|
end
|
@@ -42,10 +42,8 @@ module Datasets
|
|
42
42
|
private
|
43
43
|
def open_data
|
44
44
|
data_path = cache_dir_path + "plurals.xml"
|
45
|
-
|
46
|
-
|
47
|
-
end
|
48
|
-
::File.open(data_path) do |input|
|
45
|
+
download(data_path, @metadata.url)
|
46
|
+
data_path.open do |input|
|
49
47
|
yield(input)
|
50
48
|
end
|
51
49
|
end
|
data/lib/datasets/communities.rb
CHANGED
@@ -140,6 +140,7 @@ module Datasets
|
|
140
140
|
@metadata.id = "communities"
|
141
141
|
@metadata.name = "Communities"
|
142
142
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
|
143
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
143
144
|
@metadata.description = lambda do
|
144
145
|
read_names
|
145
146
|
end
|
@@ -177,10 +178,8 @@ module Datasets
|
|
177
178
|
|
178
179
|
def open_data
|
179
180
|
data_path = cache_dir_path + "communities.data"
|
180
|
-
|
181
|
-
|
182
|
-
download(data_path, data_url)
|
183
|
-
end
|
181
|
+
data_url = "#{base_url}/communities.data"
|
182
|
+
download(data_path, data_url)
|
184
183
|
CSV.open(data_path) do |csv|
|
185
184
|
yield(csv)
|
186
185
|
end
|
@@ -188,10 +187,8 @@ module Datasets
|
|
188
187
|
|
189
188
|
def read_names
|
190
189
|
names_path = cache_dir_path + "communities.names"
|
191
|
-
|
192
|
-
|
193
|
-
download(names_path, names_url)
|
194
|
-
end
|
190
|
+
names_url = "#{base_url}/communities.names"
|
191
|
+
download(names_path, names_url)
|
195
192
|
names_path.read
|
196
193
|
end
|
197
194
|
end
|