red-datasets 0.1.4 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8d18fa976f1b368a6a3f9cc85dc7a58a1785fd02901157672484f2a7d8b1fa88
4
- data.tar.gz: c91d651a0d8de6722ee759ce29545f5f382d1e9f060c7e4ee5a0fcd557be4d21
3
+ metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
4
+ data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
5
5
  SHA512:
6
- metadata.gz: 0ff7694dd27e4293206de81fc2a7b5ccccb886579ed73eb7f97d390472692ce310993e2ece741cf85f5fbe265f1deb2a7ea326590383b4bdf0d3f77f10b1bbc1
7
- data.tar.gz: 38ac6aa12d3e33ab0c26c0750273b60386d90fd4d916776a0d561c3f25a79fa2d7d216ac465842207cd65f62e2fcbd348389e65f905583187fe23c30908d92dc
6
+ metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
7
+ data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
data/README.md CHANGED
@@ -1,6 +1,5 @@
1
1
  # Red Datasets
2
2
 
3
- [![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
4
3
  [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
5
4
 
6
5
  ## Description
@@ -17,15 +16,30 @@ You can use datasets easily because you can access each dataset with multiple wa
17
16
 
18
17
  ## Available datasets
19
18
 
20
- TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
21
-
22
19
  * Adult Dataset
20
+ * Aozora Bunko
21
+ * California Housing
23
22
  * CIFAR-10 Dataset
24
23
  * CIFAR-100 Dataset
24
+ * CLDR language plural rules
25
+ * Communities and crime
26
+ * Diamonds Dataset
27
+ * E-Stat Japan
25
28
  * Fashion-MNIST
29
+ * Fuel Economy Dataset
30
+ * Geolonia Japanese Addresses
31
+ * Hepatitis
26
32
  * Iris Dataset
33
+ * Libsvm
27
34
  * MNIST database
35
+ * Mushroom
36
+ * Penguins
28
37
  * The Penn Treebank Project
38
+ * PMJT - Pre-Modern Japanese Text dataset list
39
+ * Postal Codes in Japan
40
+ * Rdatasets
41
+ * Seaborn
42
+ * Sudachi Synonym Dictionary
29
43
  * Wikipedia
30
44
  * Wine Dataset
31
45
 
@@ -135,6 +149,12 @@ end
135
149
 
136
150
  * [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
137
151
 
152
+ ## How to develop Red Datasets
153
+ 1. Fork https://github.com/red-data-tools/red-datasets
154
+ 2. Create a feature branch from master
155
+ 3. Develop in the feature branch
156
+ 4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
157
+
138
158
  ## License
139
159
 
140
160
  The MIT license. See `LICENSE.txt` for details.
data/Rakefile CHANGED
@@ -13,9 +13,64 @@ end
13
13
  helper.install
14
14
  spec = helper.gemspec
15
15
 
16
+ task default: :test
17
+
16
18
  desc "Run tests"
17
19
  task :test do
18
20
  ruby("test/run-test.rb")
19
21
  end
20
22
 
21
- task default: :test
23
+ desc "Generate an artifact for GitHub Pages"
24
+ task :pages do
25
+ pages_dir = "_site"
26
+ rm_rf(pages_dir)
27
+ mkdir_p(pages_dir)
28
+
29
+ require "cgi/util"
30
+ require_relative "lib/datasets/lazy"
31
+ File.open("#{pages_dir}/index.html", "w") do |index_html|
32
+ index_html.puts(<<-HTML)
33
+ <!DOCTYPE html>
34
+ <html>
35
+ <head>
36
+ <meta charset="UTF-8">
37
+ <title>Red Datasets</title>
38
+ <style>
39
+ table {
40
+ margin-left: 20vw;
41
+ min-width: 50%;
42
+ }
43
+ th {
44
+ font-size: 30px;
45
+ padding: 20px;
46
+ }
47
+ td {
48
+ border-bottom: 1px solid #D9DCE0;
49
+ padding: 20px;
50
+ font-weight: bold;
51
+ }
52
+ </style>
53
+ </head>
54
+ <body>
55
+ <section>
56
+ <h1>Red Datasets</h1>
57
+ <table>
58
+ <thead>
59
+ <tr><th>Available datasets</th></tr>
60
+ </thead>
61
+ <tbody>
62
+ HTML
63
+ Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
64
+ index_html.puts(<<-HTML)
65
+ <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
66
+ HTML
67
+ end
68
+ index_html.puts(<<-HTML)
69
+ </tbody>
70
+ </table>
71
+ </section>
72
+ </body>
73
+ </html>
74
+ HTML
75
+ end
76
+ end
data/doc/text/news.md CHANGED
@@ -1,5 +1,107 @@
1
1
  # News
2
2
 
3
+ ## 0.1.6 - 2023-05-24
4
+
5
+ ### Improvements
6
+
7
+ * Added support for lazy loading by `require "datasets/lazy"`.
8
+
9
+ * `Datasets::NagoyaUniversityConversationCorpus`: Added.
10
+ [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
11
+ [Patch by matsuura]
12
+
13
+ * `Datasets::Wikipedia`: Added support for downloading in background.
14
+
15
+ ### Thanks
16
+
17
+ * matsuura
18
+
19
+ ## 0.1.5 - 2022-09-22
20
+
21
+ ### Improvements
22
+
23
+ * `Datasets::PMJTDatasetList`: Added.
24
+ [GitHub#107][Patch by okadak]
25
+
26
+ * `Datasets::AozoraBunko`: Added.
27
+ [GitHub#108][Patch by Masa]
28
+
29
+ * Added how to develop to README
30
+ [GitHub#117][Patch by abcdefg-1234567]
31
+
32
+ * `Datasets::FuelEconomy`: Added.
33
+ [GitHub#114][Patch by Benson Muite]
34
+
35
+ * `Datasets::Geolonia`: Added.
36
+ [GitHub#118][Patch by abcdefg-1234567]
37
+
38
+ * `Datasets::Diamonds`: Added.
39
+ [GitHub#110][Patch by Benson Muite]
40
+
41
+ * `Datasets::ITACorpus`: Added.
42
+ [GitHub#119][Patch by abcdefg-1234567]
43
+
44
+ * `Datasets::KuzushijiMNIST`: Added.
45
+ [GitHub#125][Patch by abcdefg-1234567]
46
+
47
+ * Updated list of datasets in README.
48
+ [GitHub#129][Patch by Benson Muite]
49
+
50
+ * `Datasets::CaliforniaHousing`: Added.
51
+ [GitHub#123][Patch by Benson Muite]
52
+
53
+ * Added support for Ruby 3.1.
54
+ [GitHub#130][Patch by Benson Muite]
55
+
56
+ * `Datasets::AFINN`: Added.
57
+ [GitHub#120][Patch by Benson Muite]
58
+
59
+ * `Datasets::LivedoorNews`: Added.
60
+ [GitHub#127][Patch by abcdefg-1234567]
61
+
62
+ * `Datasets::SeabornDataList`: Added.
63
+ [GitHub#134][Patch by Hirokazu SUZUKI]
64
+
65
+ * `Datasets::WikipediaKyotoJapaneseEnglish`: Added.
66
+ [GitHub#135][Patch by abcdefg-1234567]
67
+
68
+ * Renamed Rdatasets to Rdataset.
69
+ [GitHub#148][Patch by Hirokazu SUZUKI]
70
+
71
+ * Removed support for Ruby 2.6.
72
+
73
+ * Add missing license information.
74
+
75
+ * `Datasets::QuoraDuplicateQuestionPair`: Added.
76
+ [GitHub#149][Patch by otegami]
77
+
78
+ ### Fixes
79
+
80
+ * Fixed key from nil to :index in `Datasets::SeabornData`.
81
+ [GitHub#133][Patch by Hirokazu SUZUKI]
82
+
83
+ * Fixed `Datasets::Rdatasets#each` to change "NA" to nil.
84
+ [GitHub#139][Patch by Hirokazu SUZUKI]
85
+
86
+ * Fixed `Datasets::Rdatasets#each` with mixed data of numeric and string.
87
+ [GitHub#140][Patch by Hirokazu SUZUKI]
88
+
89
+ ### Thanks
90
+
91
+ * okadak
92
+
93
+ * Masa
94
+
95
+ * Benson Muite
96
+
97
+ * abcdefg-1234567
98
+
99
+ * Hirokazu SUZUKI
100
+
101
+ * Sutou Kouhei
102
+
103
+ * otegami
104
+
3
105
  ## 0.1.4 - 2021-07-13
4
106
 
5
107
  ### Improvements
@@ -31,7 +31,8 @@ module Datasets
31
31
  @type = type
32
32
  @metadata.id = "adult-#{@type}"
33
33
  @metadata.name = "Adult: #{@type}"
34
- @metadata.url = "http://archive.ics.uci.edu/ml/datasets/adult"
34
+ @metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
35
+ @metadata.licenses = ["CC-BY-4.0"]
35
36
  @metadata.description = lambda do
36
37
  read_names
37
38
  end
@@ -58,10 +59,8 @@ module Datasets
58
59
  ext = "test"
59
60
  end
60
61
  data_path = cache_dir_path + "adult-#{ext}.csv"
61
- unless data_path.exist?
62
- data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
63
- download(data_path, data_url)
64
- end
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
63
+ download(data_path, data_url)
65
64
 
66
65
  options = {
67
66
  converters: [:numeric, lambda {|f| f.strip}],
@@ -74,10 +73,8 @@ module Datasets
74
73
 
75
74
  def read_names
76
75
  names_path = cache_dir_path + "adult.names"
77
- unless names_path.exist?
78
- names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
79
- download(names_path, names_url)
80
- end
76
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
77
+ download(names_path, names_url)
81
78
  names_path.read
82
79
  end
83
80
  end
@@ -0,0 +1,48 @@
1
+ require "csv"
2
+ require_relative "zip-extractor"
3
+
4
+ module Datasets
5
+ class AFINN < Dataset
6
+ Record = Struct.new(:word,
7
+ :valence)
8
+
9
+ def initialize
10
+ super()
11
+ @metadata.id = "afinn"
12
+ @metadata.name = "AFINN"
13
+ @metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
14
+ @metadata.licenses = ["ODbL-1.0"]
15
+ @metadata.description = lambda do
16
+ extract_file("AFINN/AFINN-README.txt") do |input|
17
+ readme = input.read
18
+ readme.force_encoding("UTF-8")
19
+ readme.
20
+ gsub(/^AFINN-96:.*?\n\n/m, "").
21
+ gsub(/^In Python.*$/m, "").
22
+ strip
23
+ end
24
+ end
25
+ end
26
+
27
+ def each
28
+ return to_enum(__method__) unless block_given?
29
+
30
+ extract_file("AFINN/AFINN-111.txt") do |input|
31
+ csv = CSV.new(input, col_sep: "\t", converters: :numeric)
32
+ csv.each do |row|
33
+ yield(Record.new(*row))
34
+ end
35
+ end
36
+ end
37
+
38
+ private
39
+ def extract_file(file_path, &block)
40
+ data_path = cache_dir_path + "imm6010.zip"
41
+ data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
42
+ download(data_path, data_url)
43
+
44
+ extractor = ZipExtractor.new(data_path)
45
+ extractor.extract_file(file_path, &block)
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,196 @@
1
+ require_relative 'dataset'
2
+ require_relative 'zip-extractor'
3
+
4
+ module Datasets
5
+ # Dataset for AozoraBunko
6
+ class AozoraBunko < Dataset
7
+ Book = Struct.new(
8
+ # 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
9
+ :title_id,
10
+ :title,
11
+ :title_reading,
12
+ :title_reading_collation,
13
+ :subtitle,
14
+ :subtitle_reading,
15
+ :original_title,
16
+ :first_appearance,
17
+ :ndc_code, # 分類番号(日本十進分類法の番号)
18
+ :syllabary_spelling_type,
19
+ :copyrighted,
20
+ :published_date,
21
+ :last_updated_date,
22
+ :detail_url,
23
+ # 人物ID, 姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
24
+ :person_id,
25
+ :person_family_name,
26
+ :person_first_name,
27
+ :person_family_name_reading,
28
+ :person_first_name_reading,
29
+ :person_family_name_reading_collation,
30
+ :person_first_name_reading_collation,
31
+ :person_family_name_romaji,
32
+ :person_first_name_romaji,
33
+ :person_type,
34
+ :person_birthday,
35
+ :person_date_of_death,
36
+ :person_copyrighted,
37
+ # 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
38
+ :original_book_name1,
39
+ :original_book_publisher_name1,
40
+ :original_book_first_published_date1,
41
+ :used_version_for_registration1,
42
+ :used_version_for_proofreading1,
43
+ :base_of_original_book_name1,
44
+ :base_of_original_book_publisher_name1,
45
+ :base_of_original_book_first_published_date1,
46
+ # 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
47
+ :original_book_name2,
48
+ :original_book_publisher_name2,
49
+ :original_book_first_published_date2,
50
+ :used_version_for_registration2,
51
+ :used_version_for_proofreading2,
52
+ :base_of_original_book_name2,
53
+ :base_of_original_book_publisher_name2,
54
+ :base_of_original_book_first_published_date2,
55
+ # 入力者,校正者,
56
+ :registered_person_name,
57
+ :proofreader_name,
58
+ # テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
59
+ :text_file_url,
60
+ :last_text_file_updated_date,
61
+ :text_file_character_encoding,
62
+ :text_file_character_set,
63
+ :text_file_updating_count,
64
+ # XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
65
+ :html_file_url,
66
+ :last_html_file_updated_date,
67
+ :html_file_character_encoding,
68
+ :html_file_character_set,
69
+ :html_file_updating_count
70
+ )
71
+
72
+ class Book
73
+ attr_writer :cache_path
74
+
75
+ def initialize(*args)
76
+ super
77
+ @text = nil
78
+ @html = nil
79
+ @cache_path = nil
80
+ end
81
+
82
+ alias_method :copyrighted?, :copyrighted
83
+ alias_method :person_copyrighted?, :person_copyrighted
84
+
85
+ def text
86
+ return @text unless @text.nil?
87
+ return @text if text_file_url.nil? || text_file_url.empty?
88
+
89
+ # when the URL is not a zip file, the user needs to open the web page in a browser and download the file manually
90
+ # e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
91
+ return @text unless text_file_url.end_with?('.zip')
92
+
93
+ downloader = Downloader.new(text_file_url)
94
+ downloader.download(text_file_output_path)
95
+
96
+ @text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
97
+ input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
98
+ end
99
+
100
+ @text
101
+ end
102
+
103
+ def html
104
+ return @html unless @html.nil?
105
+ return @html if html_file_url.nil? || html_file_url.empty?
106
+
107
+ downloader = Downloader.new(html_file_url)
108
+ downloader.download(html_file_output_path)
109
+ @html = File.read(html_file_output_path).encode(Encoding::UTF_8,
110
+ normalize_encoding(html_file_character_encoding))
111
+
112
+ @html
113
+ end
114
+
115
+ private
116
+
117
+ def text_file_output_path
118
+ cache_base_dir + text_file_name
119
+ end
120
+
121
+ def html_file_output_path
122
+ cache_base_dir + html_file_name
123
+ end
124
+
125
+ def text_file_name
126
+ text_file_url.split('/').last
127
+ end
128
+
129
+ def html_file_name
130
+ html_file_url.split('/').last
131
+ end
132
+
133
+ def cache_base_dir
134
+ @cache_path.base_dir + title_id + person_id
135
+ end
136
+
137
+ def normalize_encoding(encoding)
138
+ case encoding
139
+ when 'ShiftJIS'
140
+ Encoding::Shift_JIS
141
+ when 'UTF-8'
142
+ Encoding::UTF_8
143
+ else
144
+ encoding
145
+ end
146
+ end
147
+ end
148
+
149
+ def initialize
150
+ super()
151
+
152
+ @metadata.id = 'aozora-bunko'
153
+ @metadata.name = 'Aozora Bunko'
154
+ @metadata.url = 'https://www.aozora.gr.jp/'
155
+ @metadata.licenses = ['CC-BY-2.1-JP']
156
+ @metadata.description = <<~DESCRIPTION
157
+ Aozora Bunko is an activity to collect free electronic books that anyone can access
158
+ on the Internet like a library. The copyrighted works and the works that are said to be
159
+ "free to read" are available after being digitized in text and XHTML (some HTML) formats.
160
+ DESCRIPTION
161
+ end
162
+
163
+ def each
164
+ return to_enum(__method__) unless block_given?
165
+
166
+ open_data do |csv_file_stream|
167
+ text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has Byte Order Mark
168
+
169
+ CSV.parse(text, headers: true) do |row|
170
+ %w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
171
+ row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
172
+ end
173
+ book = Book.new(*row.fields)
174
+ book.cache_path = cache_path
175
+
176
+ yield(book)
177
+ end
178
+ end
179
+ end
180
+
181
+ private
182
+
183
+ def open_data(&block)
184
+ data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
185
+ data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
186
+ download(data_path, data_url)
187
+ ZipExtractor.new(data_path).extract_first_file do |input|
188
+ block.call(input)
189
+ end
190
+ end
191
+
192
+ def normalize_boolean(column_value)
193
+ column_value == 'あり'
194
+ end
195
+ end
196
+ end
@@ -0,0 +1,28 @@
1
+ module Datasets
2
+ class CachePath
3
+ def initialize(id)
4
+ @id = id
5
+ end
6
+
7
+ def base_dir
8
+ Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
9
+ end
10
+
11
+ def remove
12
+ FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
13
+ end
14
+
15
+ private
16
+
17
+ def system_cache_dir
18
+ case RUBY_PLATFORM
19
+ when /mswin/, /mingw/
20
+ ENV['LOCALAPPDATA'] || '~/AppData/Local'
21
+ when /darwin/
22
+ '~/Library/Caches'
23
+ else
24
+ ENV['XDG_CACHE_HOME'] || '~/.cache'
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,60 @@
1
+ require "csv"
2
+ require_relative 'zip-extractor'
3
+
4
+ module Datasets
5
+ class CaliforniaHousing < Dataset
6
+ Record = Struct.new(:median_house_value,
7
+ :median_income,
8
+ :housing_median_age,
9
+ :total_rooms,
10
+ :total_bedrooms,
11
+ :population,
12
+ :households,
13
+ :latitude,
14
+ :longitude)
15
+
16
+ def initialize
17
+ super()
18
+ @metadata.id = "california-housing"
19
+ @metadata.name = "California Housing"
20
+ @metadata.url = "http://lib.stat.cmu.edu/datasets/"
21
+ @metadata.licenses = ["CC0"]
22
+ @metadata.description = <<-DESCRIPTION
23
+ Housing information from the 1990 census used in
24
+ Pace, R. Kelley and Ronald Barry,
25
+ "Sparse Spatial Autoregressions",
26
+ Statistics and Probability Letters, 33 (1997) 291-297.
27
+ Available from http://lib.stat.cmu.edu/datasets/.
28
+ DESCRIPTION
29
+ end
30
+
31
+ def each
32
+ return to_enum(__method__) unless block_given?
33
+
34
+ data_path = cache_dir_path + "houses.zip"
35
+ data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
36
+ file_name = "cadata.txt"
37
+ download(data_path, data_url)
38
+ open_data(data_path, file_name) do |input|
39
+ data = ""
40
+ input.each_line do |line|
41
+ next unless line.start_with?(" ")
42
+ data << line.lstrip.gsub(/ +/, ",")
43
+ end
44
+ options = {
45
+ converters: [:numeric],
46
+ }
47
+ CSV.parse(data, **options) do |row|
48
+ yield(Record.new(*row))
49
+ end
50
+ end
51
+ end
52
+
53
+ private
54
+ def open_data(data_path, file_name)
55
+ ZipExtractor.new(data_path).extract_first_file do |input|
56
+ yield input
57
+ end
58
+ end
59
+ end
60
+ end
@@ -50,10 +50,8 @@ module Datasets
50
50
  return to_enum(__method__) unless block_given?
51
51
 
52
52
  data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
53
- unless data_path.exist?
54
- data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
55
- download(data_path, data_url)
56
- end
53
+ data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
54
+ download(data_path, data_url)
57
55
 
58
56
  parse_data(data_path, &block)
59
57
  end
@@ -42,10 +42,8 @@ module Datasets
42
42
  private
43
43
  def open_data
44
44
  data_path = cache_dir_path + "plurals.xml"
45
- unless data_path.exist?
46
- download(data_path, @metadata.url)
47
- end
48
- ::File.open(data_path) do |input|
45
+ download(data_path, @metadata.url)
46
+ data_path.open do |input|
49
47
  yield(input)
50
48
  end
51
49
  end
@@ -140,6 +140,7 @@ module Datasets
140
140
  @metadata.id = "communities"
141
141
  @metadata.name = "Communities"
142
142
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
143
+ @metadata.licenses = ["CC-BY-4.0"]
143
144
  @metadata.description = lambda do
144
145
  read_names
145
146
  end
@@ -177,10 +178,8 @@ module Datasets
177
178
 
178
179
  def open_data
179
180
  data_path = cache_dir_path + "communities.data"
180
- unless data_path.exist?
181
- data_url = "#{base_url}/communities.data"
182
- download(data_path, data_url)
183
- end
181
+ data_url = "#{base_url}/communities.data"
182
+ download(data_path, data_url)
184
183
  CSV.open(data_path) do |csv|
185
184
  yield(csv)
186
185
  end
@@ -188,10 +187,8 @@ module Datasets
188
187
 
189
188
  def read_names
190
189
  names_path = cache_dir_path + "communities.names"
191
- unless names_path.exist?
192
- names_url = "#{base_url}/communities.names"
193
- download(names_path, names_url)
194
- end
190
+ names_url = "#{base_url}/communities.names"
191
+ download(names_path, names_url)
195
192
  names_path.read
196
193
  end
197
194
  end