red-datasets 0.1.5 → 0.1.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: c1cfd18b589e4624178d9010ef68a100bb6e2573ccf18a9f96168af786523578
-   data.tar.gz: 67eddd22e10bf78c0b2cf10b18de289368d473d7b5ddf2a557cc2264834e32b0
+   metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
+   data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
  SHA512:
-   metadata.gz: 111243d3a1d3d758196bb71301ccb0f34beb1f5bec7c5c14b15f7c96fd6bdde924e30d90d3ace9e9258074411c9f7e7b4ef6bd9338dc5c11349534b2392f6f81
-   data.tar.gz: 9a9b426c753bd7e6cc12d452d61b90c2422fcad3b3c353a552c5c05a7c7fd53c3d4ac9cec2e33af1537d9e76e04f1df3d6d9b4baf043528fdde2ab4f9f203e9f
+   metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
+   data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
data/README.md CHANGED
@@ -1,6 +1,5 @@
  # Red Datasets
 
- [![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
  [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
 
  ## Description
data/Rakefile CHANGED
@@ -13,9 +13,64 @@ end
  helper.install
  spec = helper.gemspec
 
+ task default: :test
+
  desc "Run tests"
  task :test do
    ruby("test/run-test.rb")
  end
 
- task default: :test
+ desc "Generate an artifact for GitHub Pages"
+ task :pages do
+   pages_dir = "_site"
+   rm_rf(pages_dir)
+   mkdir_p(pages_dir)
+
+   require "cgi/util"
+   require_relative "lib/datasets/lazy"
+   File.open("#{pages_dir}/index.html", "w") do |index_html|
+     index_html.puts(<<-HTML)
+ <!DOCTYPE html>
+ <html>
+   <head>
+     <meta charset="UTF-8">
+     <title>Red Datasets</title>
+     <style>
+       table {
+         margin-left: 20vw;
+         min-width: 50%;
+       }
+       th {
+         font-size: 30px;
+         padding: 20px;
+       }
+       td {
+         border-bottom: 1px solid #D9DCE0;
+         padding: 20px;
+         font-weight: bold;
+       }
+     </style>
+   </head>
+   <body>
+     <section>
+       <h1>Red Datasets</h1>
+       <table>
+         <thead>
+           <tr><th>Available datasets</th></tr>
+         </thead>
+         <tbody>
+     HTML
+     Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
+       index_html.puts(<<-HTML)
+           <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
+       HTML
+     end
+     index_html.puts(<<-HTML)
+         </tbody>
+       </table>
+     </section>
+   </body>
+ </html>
+     HTML
+   end
+ end
data/doc/text/news.md CHANGED
@@ -1,5 +1,21 @@
  # News
 
+ ## 0.1.6 - 2023-05-24
+
+ ### Improvements
+
+   * Added support for lazy loading via `require "datasets/lazy"`.
+
+   * `Datasets::NagoyaUniversityConversationCorpus`: Added.
+     [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
+     [Patch by matsuura]
+
+   * `Datasets::Wikipedia`: Added support for downloading in the background.
+
+ ### Thanks
+
+   * matsuura
+
  ## 0.1.5 - 2022-09-22
 
  ### Improvements
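
The lazy loading entry above can be exercised as follows. This is a minimal usage sketch, with `Datasets::Iris` standing in for any registered dataset class: requiring `datasets/lazy` only registers constant names, and the first reference to a constant goes through `const_missing`, which `require`s the matching file (see `lib/datasets/lazy.rb` below).

```ruby
# Load only the registry; no dataset implementation is required yet.
require "datasets/lazy"

# Referencing the constant triggers Datasets.const_missing, which
# requires "datasets/iris" and then resolves the constant normally.
iris = Datasets::Iris.new
iris.each do |record|
  p record
  break
end
```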
data/lib/datasets/dataset.rb CHANGED
@@ -33,20 +33,59 @@ module Datasets
        @cache_path ||= CachePath.new(@metadata.id)
      end
 
-     def download(output_path, url)
+     def download(output_path, url, &block)
        downloader = Downloader.new(url)
-       downloader.download(output_path)
+       downloader.download(output_path, &block)
      end
 
-     def extract_bz2(path)
-       input, output = IO.pipe
-       pid = spawn("bzcat", path.to_s, {:out => output})
-       begin
-         output.close
-         yield(input)
-       ensure
-         input.close
-         Process.waitpid(pid)
+     def extract_bz2(bz2)
+       case bz2
+       when Pathname, String
+         IO.pipe do |input, output|
+           pid = spawn("bzcat", bz2.to_s, {out: output})
+           begin
+             output.close
+             yield(input)
+           ensure
+             input.close
+             Process.waitpid(pid)
+           end
+         end
+       else
+         IO.pipe do |bz2_input, bz2_output|
+           IO.pipe do |plain_input, plain_output|
+             bz2_stop = false
+             bz2_thread = Thread.new do
+               begin
+                 bz2.each do |chunk|
+                   bz2_output.write(chunk)
+                   bz2_output.flush
+                   break if bz2_stop
+                 end
+               rescue => error
+                 message = "Failed to read bzcat input: " +
+                           "#{error.class}: #{error.message}"
+                 $stderr.puts(message)
+               ensure
+                 bz2_output.close
+               end
+             end
+             begin
+               pid = spawn("bzcat", {in: bz2_input, out: plain_output})
+               begin
+                 bz2_input.close
+                 plain_output.close
+                 yield(plain_input)
+               ensure
+                 plain_input.close
+                 Process.waitpid(pid)
+               end
+             ensure
+               bz2_stop = true
+               bz2_thread.join
+             end
+           end
+         end
        end
      end
    end
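
The non-file branch of the new `#extract_bz2` pipes an arbitrary chunk-yielding object into `bzcat` through a feeder thread. Below is a standalone sketch of the same technique, not the library's API: `Open3` stands in for the `spawn`/`IO.pipe` plumbing, and `stream_bzcat` is a hypothetical helper name.

```ruby
require "open3"

# Decompress a stream of bz2-encoded chunks by piping them into
# bzcat: a feeder thread writes chunks to bzcat's stdin while the
# caller reads decompressed output from bzcat's stdout.
def stream_bzcat(bz2_chunks)
  Open3.popen2("bzcat") do |stdin, stdout, wait_thread|
    feeder = Thread.new do
      begin
        bz2_chunks.each do |chunk|
          stdin.write(chunk)
        end
      ensure
        stdin.close  # signal EOF so bzcat can finish
      end
    end
    yield(stdout)      # caller reads decompressed data here
    feeder.join
    wait_thread.value  # wait for bzcat to exit
  end
end
```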
data/lib/datasets/downloader.rb CHANGED
@@ -22,50 +22,115 @@ module Datasets
        end
      end
 
-     def download(output_path)
-       return if output_path.exist?
-
-       output_path.parent.mkpath
+     def download(output_path, &block)
+       if output_path.exist?
+         yield_chunks(output_path, &block) if block_given?
+         return
+       end
 
-       headers = {
-         "Accept-Encoding" => "identity",
-         "User-Agent" => "Red Datasets/#{VERSION}",
-       }
-       start = nil
        partial_output_path = Pathname.new("#{output_path}.partial")
-       if partial_output_path.exist?
-         start = partial_output_path.size
-         headers["Range"] = "bytes=#{start}-"
-       end
+       synchronize(output_path, partial_output_path) do
+         output_path.parent.mkpath
 
-       start_http(@url, headers) do |response|
-         if response.is_a?(Net::HTTPPartialContent)
-           mode = "ab"
-         else
+         n_retries = 0
+         n_max_retries = 5
+         begin
+           headers = {
+             "Accept-Encoding" => "identity",
+             "User-Agent" => "Red Datasets/#{VERSION}",
+           }
            start = nil
-           mode = "wb"
-         end
+           if partial_output_path.exist?
+             start = partial_output_path.size
+             headers["Range"] = "bytes=#{start}-"
+           end
+
+           start_http(@url, headers) do |response|
+             if response.is_a?(Net::HTTPPartialContent)
+               mode = "ab"
+             else
+               start = nil
+               mode = "wb"
+             end
 
-         base_name = @url.path.split("/").last
-         size_current = 0
-         size_max = response.content_length
-         if start
-           size_current += start
-           size_max += start
+             base_name = @url.path.split("/").last
+             size_current = 0
+             size_max = response.content_length
+             if start
+               size_current += start
+               size_max += start
+               if block_given? and n_retries.zero?
+                 yield_chunks(partial_output_path, &block)
+               end
+             end
+             progress_reporter = ProgressReporter.new(base_name, size_max)
+             partial_output_path.open(mode) do |output|
+               response.read_body do |chunk|
+                 size_current += chunk.bytesize
+                 progress_reporter.report(size_current)
+                 output.write(chunk)
+                 yield(chunk) if block_given?
+               end
+             end
+           end
+           FileUtils.mv(partial_output_path, output_path)
+         rescue Net::ReadTimeout
+           n_retries += 1
+           retry if n_retries < n_max_retries
+           raise
+         rescue TooManyRedirects => error
+           last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
+           raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
          end
-         progress_reporter = ProgressReporter.new(base_name, size_max)
-         partial_output_path.open(mode) do |output|
-           response.read_body do |chunk|
-             size_current += chunk.bytesize
-             progress_reporter.report(size_current)
-             output.write(chunk)
+       end
+     end
+
+     private def synchronize(output_path, partial_output_path)
+       begin
+         Process.getpgid(Process.pid)
+       rescue NotImplementedError
+         # No usable process IDs on this platform; skip locking.
+         return yield
+       end
+
+       lock_path = Pathname("#{output_path}.lock")
+       loop do
+         lock_path.parent.mkpath
+         begin
+           lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
+         rescue SystemCallError
+           valid_lock_path = true
+           begin
+             pid = Integer(lock_path.read.chomp, 10)
+           rescue ArgumentError
+             # The process that acquired the lock may have exited before
+             # storing its process ID. Treat the lock as stale unless it
+             # was modified recently.
+             valid_lock_path = ((Time.now - lock_path.mtime) < 10)
+           else
+             begin
+               Process.getpgid(pid)
+             rescue SystemCallError
+               # The process that acquired the lock no longer exists.
+               valid_lock_path = false
+             end
+           end
+           if valid_lock_path
+             sleep(1 + rand(10))
+           else
+             lock_path.delete
            end
+           retry
+         else
+           begin
+             lock.puts(Process.pid.to_s)
+             lock.flush
+             yield
+           ensure
+             lock.close
+             lock_path.delete
+           end
+           break
          end
        end
-       FileUtils.mv(partial_output_path, output_path)
-     rescue TooManyRedirects => error
-       last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
-       raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
      end
 
      private def start_http(url, headers, limit = 10, &block)
@@ -99,6 +164,16 @@ module Datasets
        end
      end
 
+     private def yield_chunks(path)
+       path.open("rb") do |input|
+         chunk_size = 1024 * 1024
+         chunk = ""
+         while input.read(chunk_size, chunk)
+           yield(chunk)
+         end
+       end
+     end
+
      class ProgressReporter
        def initialize(base_name, size_max)
          @base_name = base_name
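
A hedged sketch of the new block form of `Downloader#download`, the mechanism behind `Dataset#download(output_path, url, &block)` above. The URL and output path are illustrative, and the `require` path is an assumption based on this file's location:

```ruby
require "pathname"
require "datasets/downloader"  # assumed feature path for this file

downloader = Datasets::Downloader.new("https://example.com/data.csv")
output_path = Pathname("data.csv")

# Chunks are yielded while the response body is being written to disk;
# if the file is already cached, yield_chunks replays it instead.
downloader.download(output_path) do |chunk|
  $stderr.print(".")  # crude per-chunk progress indicator
end
```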
data/lib/datasets/lazy.rb ADDED
@@ -0,0 +1,90 @@
+ require_relative "version"
+
+ module Datasets
+   class LazyLoader
+     def initialize
+       @constants = {}
+     end
+
+     def exist?(constant_name)
+       @constants.key?(constant_name)
+     end
+
+     def load(constant_name)
+       feature = @constants[constant_name]
+       raise LoadError, "unknown dataset: #{constant_name}" unless feature
+       require feature
+     end
+
+     def load_all
+       @constants.each_value do |feature|
+         require feature
+       end
+     end
+
+     def register(constant_name, feature)
+       @constants[constant_name] = feature
+     end
+
+     def constant_names
+       @constants.keys
+     end
+   end
+
+   LAZY_LOADER = LazyLoader.new
+
+   class << self
+     def const_missing(name)
+       if LAZY_LOADER.exist?(name)
+         LAZY_LOADER.load(name)
+         const_get(name)
+       else
+         super
+       end
+     end
+   end
+
+   LAZY_LOADER.register(:Adult, "datasets/adult")
+   LAZY_LOADER.register(:AFINN, "datasets/afinn")
+   LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
+   LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
+   LAZY_LOADER.register(:CIFAR, "datasets/cifar")
+   LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
+   LAZY_LOADER.register(:Communities, "datasets/communities")
+   LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
+   LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
+   LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
+   LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
+   LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
+   LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+   LAZY_LOADER.register(:Iris, "datasets/iris")
+   LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+   LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
+   LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
+   LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
+   LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
+   LAZY_LOADER.register(:MNIST, "datasets/mnist")
+   LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+   LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                        "datasets/nagoya-university-conversation-corpus")
+   LAZY_LOADER.register(:Penguins, "datasets/penguins")
+   LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
+   LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
+   LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
+   LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
+                        "datasets/quora-duplicate-question-pair")
+   LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
+   # For backward compatibility
+   LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
+   LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
+   # For backward compatibility
+   LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
+   LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
+   LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
+   LAZY_LOADER.register(:SudachiSynonymDictionary,
+                        "datasets/sudachi-synonym-dictionary")
+   LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
+   LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
+                        "datasets/wikipedia-kyoto-japanese-english")
+   LAZY_LOADER.register(:Wine, "datasets/wine")
+ end
data/lib/datasets/nagoya-university-conversation-corpus.rb ADDED
@@ -0,0 +1,109 @@
+ require_relative 'dataset'
+ require_relative 'zip-extractor'
+
+ module Datasets
+   class NagoyaUniversityConversationCorpus < Dataset
+     Data = Struct.new(
+       :name,
+       :date,
+       :place,
+       :participants,
+       :relationships,
+       :note,
+       :sentences
+     )
+
+     Participant = Struct.new(
+       :id,
+       :attribute,
+       :birthplace,
+       :residence
+     )
+
+     Sentence = Struct.new(:participant_id, :content) do
+       def end?
+         participant_id.nil? and content.nil?
+       end
+     end
+
+     def initialize
+       super()
+       @metadata.id = 'nagoya-university-conversation-corpus'
+       @metadata.name = 'Nagoya University Conversation Corpus'
+       @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+       @metadata.licenses = ['CC-BY-NC-ND-4.0']
+       @metadata.description = <<~DESCRIPTION
+         The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+         total about 100 hours of chatting among native speakers of Japanese,
+         which is converted into text.
+       DESCRIPTION
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input_stream|
+         yield(parse_file(input_stream))
+       end
+     end
+
+     private
+
+     def open_data
+       data_path = cache_dir_path + 'nucc.zip'
+       data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+       download(data_path, data_url)
+
+       extractor = ZipExtractor.new(data_path)
+       extractor.extract_files do |input_stream|
+         yield(input_stream)
+       end
+     end
+
+     def parse_file(input_stream)
+       data = Data.new
+       participants = []
+       sentences = []
+
+       input_stream.each do |input|
+         input.each_line(chomp: true) do |line|
+           line.force_encoding('utf-8')
+           if line.start_with?('@データ')
+             data.name = line[4..]
+           elsif line.start_with?('@収集年月日')
+             # mixed cases with and without ':'
+             data.date = line[6..].delete_prefix(':')
+           elsif line.start_with?('@場所')
+             data.place = line[4..]
+           elsif line.start_with?('@参加者の関係')
+             data.relationships = line.split(':', 2)[1]
+           elsif line.start_with?('@参加者')
+             participant = Participant.new
+             participant.id, profiles = line[4..].split(':', 2)
+             participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+             participants << participant
+           elsif line.start_with?('%com')
+             data.note = line.split(':', 2)[1]
+           elsif line == '@END'
+             sentence = Sentence.new
+             sentence.participant_id = nil
+             sentence.content = nil
+
+             sentences << sentence
+           else
+             sentence = Sentence.new
+             sentence.participant_id, sentence.content = line.split(':', 2)
+
+             sentences << sentence
+           end
+         end
+       end
+
+       data.participants = participants
+       data.sentences = sentences
+
+       data
+     end
+   end
+ end
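
A short usage sketch for the new class, based only on the structs defined above (`Data`, `Participant`, `Sentence`); field names follow the accessors shown in the code:

```ruby
require "datasets/nagoya-university-conversation-corpus"

corpus = Datasets::NagoyaUniversityConversationCorpus.new
corpus.each do |conversation|
  puts conversation.name                  # e.g. "1(約35分)"
  conversation.sentences.each do |sentence|
    break if sentence.end?                # sentinel built from the '@END' marker
    puts "#{sentence.participant_id}: #{sentence.content}"
  end
  break  # just the first conversation
end
```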
@@ -1,3 +1,5 @@
+ require "csv"
+
  require_relative "dataset"
 
  module Datasets
data/lib/datasets/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Datasets
-   VERSION = "0.1.5"
+   VERSION = "0.1.6"
  end
data/lib/datasets/wikipedia.rb CHANGED
@@ -53,13 +53,22 @@ module Datasets
      end
 
      private
+     def base_name
+       "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
+     end
+
+     def data_path
+       cache_dir_path + base_name
+     end
+
      def open_data(&block)
-       base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
-       data_path = cache_dir_path + base_name
        data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
-       download(data_path, data_url)
-
-       extract_bz2(data_path, &block)
+       bz2 = Enumerator.new do |yielder|
+         download(data_path, data_url) do |bz2_chunk|
+           yielder << bz2_chunk
+         end
+       end
+       extract_bz2(bz2, &block)
      end
 
      def type_in_path
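
This is the change behind the "downloading in the background" news entry: the dump is no longer fully downloaded before decompression starts; instead an `Enumerator` feeds each downloaded chunk straight into the streaming branch of `#extract_bz2`. A minimal usage sketch, with the constructor arguments taken from the test below:

```ruby
require "datasets/wikipedia"

wikipedia = Datasets::Wikipedia.new(language: :en, type: :articles)

# Pages become available as soon as the first chunks of the dump have
# been downloaded and decompressed; there is no need to wait for the
# multi-gigabyte file to finish first.
wikipedia.each do |page|
  puts page.title
  break
end
```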
data/lib/datasets/zip-extractor.rb CHANGED
@@ -32,5 +32,17 @@ module Datasets
        end
        nil
      end
+
+     def extract_files
+       Zip::File.open(@path) do |zip_file|
+         zip_file.each do |entry|
+           next unless entry.file?
+
+           entry.get_input_stream do |input|
+             yield(input)
+           end
+         end
+       end
+     end
    end
  end
data/lib/datasets.rb CHANGED
@@ -1,34 +1,2 @@
- require_relative "datasets/version"
-
- require_relative "datasets/adult"
- require_relative "datasets/afinn"
- require_relative "datasets/aozora-bunko"
- require_relative "datasets/california-housing"
- require_relative "datasets/cifar"
- require_relative "datasets/cldr-plurals"
- require_relative "datasets/communities"
- require_relative "datasets/diamonds"
- require_relative "datasets/e-stat-japan"
- require_relative "datasets/fashion-mnist"
- require_relative "datasets/fuel-economy"
- require_relative "datasets/geolonia"
- require_relative "datasets/hepatitis"
- require_relative "datasets/iris"
- require_relative "datasets/ita-corpus"
- require_relative "datasets/kuzushiji-mnist"
- require_relative "datasets/libsvm"
- require_relative "datasets/libsvm-dataset-list"
- require_relative "datasets/livedoor-news"
- require_relative "datasets/mnist"
- require_relative "datasets/mushroom"
- require_relative "datasets/penguins"
- require_relative "datasets/penn-treebank"
- require_relative "datasets/pmjt-dataset-list"
- require_relative "datasets/postal-code-japan"
- require_relative "datasets/quora-duplicate-question-pair"
- require_relative "datasets/rdataset"
- require_relative "datasets/seaborn"
- require_relative "datasets/sudachi-synonym-dictionary"
- require_relative "datasets/wikipedia"
- require_relative "datasets/wikipedia-kyoto-japanese-english"
- require_relative "datasets/wine"
+ require_relative "datasets/lazy"
+ Datasets::LAZY_LOADER.load_all
data/test/test-geolonia.rb CHANGED
@@ -6,7 +6,7 @@ class GeoloniaTest < Test::Unit::TestCase
    test('#each') do
      records = @dataset.each.to_a
      assert_equal([
-                    277191,
+                    277616,
                     {
                       :prefecture_code => "01",
                       :prefecture_name => "北海道",
@@ -28,16 +28,16 @@ class GeoloniaTest < Test::Unit::TestCase
                       :prefecture_name => "沖縄県",
                       :prefecture_kana => "オキナワケン",
                       :prefecture_romaji => "OKINAWA KEN",
-                      :municipality_code => "47325",
-                      :municipality_name => "中頭郡嘉手納町",
-                      :municipality_kana => "ナカガミグンカデナチョウ",
-                      :municipality_romaji => "NAKAGAMI GUN KADENA CHO",
-                      :street_name => "字兼久",
+                      :municipality_code => "47382",
+                      :municipality_name => "八重山郡与那国町",
+                      :municipality_kana => "ヤエヤマグンヨナグニチョウ",
+                      :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
+                      :street_name => "字与那国",
                       :street_kana => nil,
                       :street_romaji => nil,
-                      :alias => "下原",
-                      :latitude => "26.351841",
-                      :longitude => "127.744975",
+                      :alias => nil,
+                      :latitude => "24.455925",
+                      :longitude => "122.987678",
                     },
                   ],
                   [
@@ -55,6 +55,7 @@ class GeoloniaTest < Test::Unit::TestCase
                     "## 住所データ仕様",
                     "### ファイルフォーマット",
                     "### 列",
+                    "### ソート順",
                   ],
                   description.scan(/^#.*$/),
                   description)
data/test/test-nagoya-university-conversation-corpus.rb ADDED
@@ -0,0 +1,132 @@
+ class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
+   def setup
+     @dataset = Datasets::NagoyaUniversityConversationCorpus.new
+   end
+
+   sub_test_case("each") do
+     test("#sentences") do
+       records = @dataset.each.to_a
+       first_sentences = records[0].sentences
+       last_sentences = records[-1].sentences
+       assert_equal([
+                      856,
+                      {
+                        participant_id: 'F107',
+                        content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
+                      },
+                      {
+                        participant_id: nil,
+                        content: nil
+                      },
+                      603,
+                      {
+                        participant_id: 'F007',
+                        content: 'それでは話を始めまーす。'
+                      },
+                      {
+                        participant_id: nil,
+                        content: nil
+                      }
+                    ],
+                    [
+                      first_sentences.size,
+                      first_sentences[0].to_h,
+                      first_sentences[-1].to_h,
+                      last_sentences.size,
+                      last_sentences[0].to_h,
+                      last_sentences[-1].to_h,
+                    ])
+     end
+
+     test("#participants") do
+       records = @dataset.each.to_a
+       first_participants = records[0].participants
+       last_participants = records[-1].participants
+       assert_equal([
+                      4,
+                      {
+                        id: 'F107',
+                        attribute: '女性30代後半',
+                        birthplace: '愛知県幡豆郡出身',
+                        residence: '愛知県幡豆郡在住'
+                      },
+                      {
+                        id: 'F128',
+                        attribute: '女性20代前半',
+                        birthplace: '愛知県西尾市出身',
+                        residence: '西尾市在住'
+                      },
+                      2,
+                      {
+                        id: 'F007',
+                        attribute: '女性50代後半',
+                        birthplace: '東京都出身',
+                        residence: '東京都国分寺市在住'
+                      },
+                      {
+                        id: 'F003',
+                        attribute: '女性80代後半',
+                        birthplace: '栃木県宇都宮市出身',
+                        residence: '国分寺市在住'
+                      }
+                    ],
+                    [
+                      first_participants.size,
+                      first_participants[0].to_h,
+                      first_participants[-1].to_h,
+                      last_participants.size,
+                      last_participants[0].to_h,
+                      last_participants[-1].to_h
+                    ])
+     end
+
+     test("others") do
+       records = @dataset.each.to_a
+       assert_equal([
+                      129,
+                      [
+                        '1(約35分)',
+                        '2001年10月16日',
+                        'ファミリーレストラン',
+                        '英会話教室の友人',
+                        nil
+                      ],
+                      [
+                        '129(36分)',
+                        '2003年2月16日',
+                        '二人の自宅',
+                        '母と娘',
+                        'F007は東京に38年、F003は東京に60年居住。'
+                      ]
+                    ],
+                    [
+                      records.size,
+                      [
+                        records[0].name,
+                        records[0].date,
+                        records[0].place,
+                        records[0].relationships,
+                        records[0].note
+                      ],
+                      [
+                        records[-1].name,
+                        records[-1].date,
+                        records[-1].place,
+                        records[-1].relationships,
+                        records[-1].note
+                      ]
+                    ])
+     end
+   end
+
+   sub_test_case("#metadata") do
+     test("#description") do
+       description = @dataset.metadata.description
+       assert_equal(<<~DESCRIPTION, description)
+         The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+         total about 100 hours of chatting among native speakers of Japanese,
+         which is converted into text.
+       DESCRIPTION
+     end
+   end
+ end
data/test/test-rdataset.rb CHANGED
@@ -8,7 +8,7 @@ class RdatasetTest < Test::Unit::TestCase
    test("with package_name") do
      records = @dataset.filter(package: "datasets").to_a
      assert_equal([
-                    84,
+                    102,
                     {
                       package: "datasets",
                       dataset: "ability.cov",
@@ -48,7 +48,7 @@ class RdatasetTest < Test::Unit::TestCase
    test("without package_name") do
      records = @dataset.each.to_a
      assert_equal([
-                    1892,
+                    2142,
                     {
                       package: "AER",
                       dataset: "Affairs",
data/test/test-seaborn.rb CHANGED
@@ -14,6 +14,7 @@ class SeabornTest < Test::Unit::TestCase
                     {dataset: "car_crashes"},
                     {dataset: "diamonds"},
                     {dataset: "dots"},
+                    {dataset: "dowjones"},
                     {dataset: "exercise"},
                     {dataset: "flights"},
                     {dataset: "fmri"},
data/test/test-sudachi-synonym-dictionary.rb CHANGED
@@ -6,7 +6,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
    test('#each') do
      records = @dataset.each.to_a
      assert_equal([
-                    65182,
+                    65206,
                     {
                       group_id: "000001",
                       is_noun: true,
@@ -19,7 +19,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                       notation: "曖昧",
                     },
                     {
-                      group_id: "024909",
+                      group_id: "024916",
                       is_noun: true,
                       expansion_type: :expanded,
                       lexeme_id: 1,
@@ -27,7 +27,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                       acronym_type: :alphabet,
                       variant_type: :typical,
                       categories: ["ビジネス"],
-                      notation: "BPO",
+                      notation: "SCM",
                     },
                   ],
                   [
data/test/test-wikipedia.rb CHANGED
@@ -1,100 +1,54 @@
  class WikipediaTest < Test::Unit::TestCase
-   sub_test_case("ja") do
+   sub_test_case("en") do
      sub_test_case("articles") do
-       include Helper::Sandbox
-
        def setup
-         setup_sandbox
-         @dataset = Datasets::Wikipedia.new(language: :ja,
+         @dataset = Datasets::Wikipedia.new(language: :en,
                                             type: :articles)
-         def @dataset.cache_dir_path
-           @cache_dir_path
-         end
-         def @dataset.cache_dir_path=(path)
-           @cache_dir_path = path
-         end
-         @dataset.cache_dir_path = @tmp_dir
-       end
-
-       def teardown
-         teardown_sandbox
        end
 
        test("#each") do
-         def @dataset.download(output_path, url)
-           xml_path = output_path.sub_ext("")
-           xml_path.open("w") do |xml_file|
-             xml_file.puts(<<-XML)
- <mediawiki
-     xmlns="http://www.mediawiki.org/xml/export-0.10/"
-     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-     xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
-     version="0.10" xml:lang="ja">
-   <siteinfo>
-     <sitename>Wikipedia</sitename>
-   </siteinfo>
-   <page>
-     <title>タイトル</title>
-     <ns>4</ns>
-     <id>1</id>
-     <restrictions>sysop</restrictions>
-     <revision>
-       <id>3</id>
-       <parentid>2</parentid>
-       <timestamp>2004-04-30T14:46:00Z</timestamp>
-       <contributor>
-         <username>user</username>
-         <id>10</id>
-       </contributor>
-       <minor />
-       <comment>コメント</comment>
-       <model>wikitext</model>
-       <format>text/x-wiki</format>
-       <text xml:space="preserve">テキスト</text>
-       <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
-     </revision>
-   </page>
- </mediawiki>
-             XML
-           end
-           unless system("bzip2", xml_path.to_s)
-             raise "failed to run bzip2"
-           end
-         end
-
-         contributor = Datasets::Wikipedia::Contributor.new("user", 10)
+         contributor = Datasets::Wikipedia::Contributor.new("Elli", 20842734)
          revision = Datasets::Wikipedia::Revision.new
-         revision.id = 3
-         revision.parent_id = 2
-         revision.timestamp = Time.iso8601("2004-04-30T14:46:00Z")
+         revision.id = 1002250816
+         revision.parent_id = 854851586
+         revision.timestamp = Time.iso8601("2021-01-23T15:15:01Z")
          revision.contributor = contributor
-         revision.comment = "コメント"
+         revision.comment = "shel"
          revision.model = "wikitext"
         revision.format = "text/x-wiki"
-         revision.text = "テキスト"
-         revision.sha1 = "a9674b19f8c56f785c91a555d0a144522bb318e6"
+         revision.text = <<-TEXT.chomp
+ #REDIRECT [[Computer accessibility]]
+
+ {{rcat shell|
+ {{R from move}}
+ {{R from CamelCase}}
+ {{R unprintworthy}}
+ }}
+         TEXT
+         revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
          page = Datasets::Wikipedia::Page.new
-         page.title = "タイトル"
-         page.namespace = 4
-         page.id = 1
-         page.restrictions = ["sysop"]
+         page.title = "AccessibleComputing"
+         page.namespace = 0
+         page.id = 10
+         page.restrictions = nil
+         page.redirect = "Computer accessibility"
          page.revision = revision
          assert_equal(page, @dataset.each.first)
        end
 
        sub_test_case("#metadata") do
          test("#id") do
-           assert_equal("wikipedia-ja-articles",
+           assert_equal("wikipedia-en-articles",
                         @dataset.metadata.id)
          end
 
          test("#name") do
-           assert_equal("Wikipedia articles (ja)",
+           assert_equal("Wikipedia articles (en)",
                         @dataset.metadata.name)
          end
 
          test("#description") do
-           assert_equal("Wikipedia articles in ja",
+           assert_equal("Wikipedia articles in en",
                         @dataset.metadata.description)
          end
        end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: red-datasets
  version: !ruby/object:Gem::Version
-   version: 0.1.5
+   version: 0.1.6
  platform: ruby
  authors:
  - tomisuker
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2022-09-23 00:00:00.000000000 Z
+ date: 2023-05-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: csv
@@ -163,6 +163,7 @@ files:
  - lib/datasets/iris.rb
  - lib/datasets/ita-corpus.rb
  - lib/datasets/kuzushiji-mnist.rb
+ - lib/datasets/lazy.rb
  - lib/datasets/libsvm-dataset-list.rb
  - lib/datasets/libsvm.rb
  - lib/datasets/license.rb
@@ -170,6 +171,7 @@ files:
  - lib/datasets/metadata.rb
  - lib/datasets/mnist.rb
  - lib/datasets/mushroom.rb
+ - lib/datasets/nagoya-university-conversation-corpus.rb
  - lib/datasets/penguins.rb
  - lib/datasets/penn-treebank.rb
  - lib/datasets/pmjt-dataset-list.rb
@@ -214,6 +216,7 @@ files:
  - test/test-metadata.rb
  - test/test-mnist.rb
  - test/test-mushroom.rb
+ - test/test-nagoya-university-conversation-corpus.rb
  - test/test-penguins.rb
  - test/test-penn-treebank.rb
  - test/test-pmjt-dataset-list.rb
@@ -245,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.2.32
+ rubygems_version: 3.5.0.dev
  signing_key:
  specification_version: 4
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
@@ -278,6 +281,7 @@ test_files:
  - test/test-metadata.rb
  - test/test-mnist.rb
  - test/test-mushroom.rb
+ - test/test-nagoya-university-conversation-corpus.rb
  - test/test-penguins.rb
  - test/test-penn-treebank.rb
  - test/test-pmjt-dataset-list.rb