red-datasets 0.1.5 → 0.1.6
- checksums.yaml +4 -4
- data/README.md +0 -1
- data/Rakefile +56 -1
- data/doc/text/news.md +16 -0
- data/lib/datasets/dataset.rb +50 -11
- data/lib/datasets/downloader.rb +110 -35
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +2 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +14 -5
- data/lib/datasets/zip-extractor.rb +12 -0
- data/lib/datasets.rb +2 -34
- data/test/test-geolonia.rb +10 -9
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-rdataset.rb +2 -2
- data/test/test-seaborn.rb +1 -0
- data/test/test-sudachi-synonym-dictionary.rb +3 -3
- data/test/test-wikipedia.rb +25 -71
- metadata +7 -3
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0f98b9ff3bc1734ecee79fde53518e86361c938b63801e73170c5aff3acc8dfa
+  data.tar.gz: 5b0189b610fb42ab59bfb39cd8a42534d98235b8b44676fe272ec2653f5cd0a9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 53c3990bdcaa712cad414ba3c9bda13d9bd12a3c07e3c53d4479e674700d8ffea3c7515b99357feeb6052c8eac97f0836b2c8fd5f67d4ab475f00e5351ecd272
+  data.tar.gz: 36c5c16e79cd346fdb061a6e2679ef85471043a6c5e795bc77beddf55866cbfbade25b6e8abf7fd990b088cb1af26574a899ac62e0ee2cafa738b222a0a19252
data/README.md
CHANGED

@@ -1,6 +1,5 @@
 # Red Datasets
 
-[![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
 [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
 
 ## Description
data/Rakefile
CHANGED

@@ -13,9 +13,64 @@ end
 helper.install
 spec = helper.gemspec
 
+task default: :test
+
 desc "Run tests"
 task :test do
   ruby("test/run-test.rb")
 end
 
-
+desc "Generate an artifact for GitHub Pages"
+task :pages do
+  pages_dir = "_site"
+  rm_rf(pages_dir)
+  mkdir_p(pages_dir)
+
+  require "cgi/util"
+  require_relative "lib/datasets/lazy"
+  File.open("#{pages_dir}/index.html", "w") do |index_html|
+    index_html.puts(<<-HTML)
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="UTF-8">
+    <title>Red Datasets</title>
+    <style>
+      table {
+        margin-left: 20vw;
+        min-width: 50%;
+      }
+      th {
+        font-size: 30px;
+        padding: 20px;
+      }
+      td {
+        border-bottom: 1px solid #D9DCE0;
+        padding: 20px;
+        font-weight: bold;
+      }
+    </style>
+  </head>
+  <body>
+    <section>
+      <h1>Red Datasets</h1>
+      <table>
+        <thead>
+          <tr><th>Available datasets</th></tr>
+        </thead>
+        <tbody>
+    HTML
+    Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
+      index_html.puts(<<-HTML)
+          <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
+      HTML
+    end
+    index_html.puts(<<-HTML)
+        </tbody>
+      </table>
+    </section>
+  </body>
+</html>
+    HTML
+  end
+end
data/doc/text/news.md
CHANGED

@@ -1,5 +1,21 @@
 # News
 
+## 0.1.6 - 2023-05-24
+
+### Improvements
+
+  * Added support for lazy loading by `require "datasets/lazy"`.
+
+  * `Datasets::NagoyaUniversityConversationCorpus`: Added.
+    [GH-168](https://github.com/red-data-tools/red-datasets/issues/168)
+    [Patch by matsuura]
+
+  * `Datasets::Wikipedia`: Added support for downloading in background.
+
+### Thanks
+
+  * matsuura
+
 ## 0.1.5 - 2022-09-22
 
 ### Improvements
data/lib/datasets/dataset.rb
CHANGED

@@ -33,20 +33,59 @@ module Datasets
       @cache_path ||= CachePath.new(@metadata.id)
     end
 
-    def download(output_path, url)
+    def download(output_path, url, &block)
       downloader = Downloader.new(url)
-      downloader.download(output_path)
+      downloader.download(output_path, &block)
     end
 
-    def extract_bz2(
-
-
-
-
-
-
-
-
+    def extract_bz2(bz2)
+      case bz2
+      when Pathname, String
+        IO.pipe do |input, output|
+          pid = spawn("bzcat", bz2.to_s, {out: output})
+          begin
+            output.close
+            yield(input)
+          ensure
+            input.close
+            Process.waitpid(pid)
+          end
+        end
+      else
+        IO.pipe do |bz2_input, bz2_output|
+          IO.pipe do |plain_input, plain_output|
+            bz2_stop = false
+            bz2_thread = Thread.new do
+              begin
+                bz2.each do |chunk|
+                  bz2_output.write(chunk)
+                  bz2_output.flush
+                  break if bz2_stop
+                end
+              rescue => error
+                message = "Failed to read bzcat input: " +
+                          "#{error.class}: #{error.message}"
+                $stderr.puts(message)
+              ensure
+                bz2_output.close
+              end
+            end
+            begin
+              pid = spawn("bzcat", {in: bz2_input, out: plain_output})
+              begin
+                bz2_input.close
+                plain_output.close
+                yield(plain_input)
+              ensure
+                plain_input.close
+                Process.waitpid(pid)
+              end
+            ensure
+              bz2_stop = true
+              bz2_thread.join
+            end
+          end
+        end
+      end
     end
   end
 end
data/lib/datasets/downloader.rb
CHANGED

@@ -22,50 +22,115 @@ module Datasets
       end
     end
 
-    def download(output_path)
-
-
-
+    def download(output_path, &block)
+      if output_path.exist?
+        yield_chunks(output_path, &block) if block_given?
+        return
+      end
 
-      headers = {
-        "Accept-Encoding" => "identity",
-        "User-Agent" => "Red Datasets/#{VERSION}",
-      }
-      start = nil
       partial_output_path = Pathname.new("#{output_path}.partial")
-
-
-        headers["Range"] = "bytes=#{start}-"
-      end
+      synchronize(output_path, partial_output_path) do
+        output_path.parent.mkpath
 
-
-
-
-
+        n_retries = 0
+        n_max_retries = 5
+        begin
+          headers = {
+            "Accept-Encoding" => "identity",
+            "User-Agent" => "Red Datasets/#{VERSION}",
+          }
           start = nil
-
-
+          if partial_output_path.exist?
+            start = partial_output_path.size
+            headers["Range"] = "bytes=#{start}-"
+          end
+
+          start_http(@url, headers) do |response|
+            if response.is_a?(Net::HTTPPartialContent)
+              mode = "ab"
+            else
+              start = nil
+              mode = "wb"
+            end
 
-
-
-
-
-
-
+            base_name = @url.path.split("/").last
+            size_current = 0
+            size_max = response.content_length
+            if start
+              size_current += start
+              size_max += start
+              if block_given? and n_retries.zero?
+                yield_chunks(partial_output_path, &block)
+              end
+            end
+            progress_reporter = ProgressReporter.new(base_name, size_max)
+            partial_output_path.open(mode) do |output|
+              response.read_body do |chunk|
+                size_current += chunk.bytesize
+                progress_reporter.report(size_current)
+                output.write(chunk)
+                yield(chunk) if block_given?
+              end
+            end
+          end
+          FileUtils.mv(partial_output_path, output_path)
+        rescue Net::ReadTimeout => error
+          n_retries += 1
+          retry if n_retries < n_max_retries
+          raise
+        rescue TooManyRedirects => error
+          last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
+          raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
        end
-
-
-
-
-
-
+      end
+    end
+
+    private def synchronize(output_path, partial_output_path)
+      begin
+        Process.getpgid(Process.pid)
+      rescue NotImplementedError
+        return yield
+      end
+
+      lock_path = Pathname("#{output_path}.lock")
+      loop do
+        lock_path.parent.mkpath
+        begin
+          lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
+        rescue SystemCallError
+          valid_lock_path = true
+          begin
+            pid = Integer(lock_path.read.chomp, 10)
+          rescue ArgumentError
+            # The process that acquired the lock will be exited before
+            # it stores its process ID.
+            valid_lock_path = (lock_path.mtime > 10)
+          else
+            begin
+              Process.getpgid(pid)
+            rescue SystemCallError
+              # Process that acquired the lock doesn't exist
+              valid_lock_path = false
+            end
+          end
+          if valid_lock_path
+            sleep(1 + rand(10))
+          else
+            lock_path.delete
          end
+          retry
+        else
+          begin
+            lock.puts(Process.pid.to_s)
+            lock.flush
+            yield
+          ensure
+            lock.close
+            lock_path.delete
+          end
+          break
        end
      end
-      FileUtils.mv(partial_output_path, output_path)
-    rescue TooManyRedirects => error
-      last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
-      raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
    end
 
    private def start_http(url, headers, limit = 10, &block)

@@ -99,6 +164,16 @@ module Datasets
       end
     end
 
+    private def yield_chunks(path)
+      path.open("rb") do |output|
+        chunk_size = 1024 * 1024
+        chunk = ""
+        while output.read(chunk_size, chunk)
+          yield(chunk)
+        end
+      end
+    end
+
     class ProgressReporter
       def initialize(base_name, size_max)
         @base_name = base_name
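Two mechanisms above are worth calling out: resumable downloads via an HTTP `Range` header against a `.partial` file, and cross-process mutual exclusion via a lock file opened with `File::RDWR | File::CREAT | File::EXCL`, which fails atomically when the file already exists. A minimal sketch of that lock primitive follows; the `with_file_lock` helper and the lock path are illustrative, not part of the gem.

```ruby
require "pathname"

# O_EXCL-based file lock: creation is atomic at the filesystem level,
# so only one process can create the lock file; losers wait and retry.
def with_file_lock(lock_path)
  lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
  begin
    lock.puts(Process.pid)  # record the owner, as synchronize does
    lock.flush
    yield
  ensure
    lock.close
    lock_path.delete
  end
rescue Errno::EEXIST
  sleep(0.1)  # another process holds the lock
  retry
end

with_file_lock(Pathname("/tmp/red-datasets-example.lock")) do
  puts("only one process runs this at a time")
end
```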
data/lib/datasets/lazy.rb
ADDED

@@ -0,0 +1,90 @@
+require_relative "version"
+
+module Datasets
+  class LazyLoader
+    def initialize
+      @constants = {}
+    end
+
+    def exist?(constant_name)
+      @constants.key?(constant_name)
+    end
+
+    def load(constant_name)
+      feature = @constants[constant_name]
+      raise LoadError, "unknown dataset: #{constant_name}" unless feature
+      require feature
+    end
+
+    def load_all
+      @constants.each_value do |feature|
+        require feature
+      end
+    end
+
+    def register(constant_name, feature)
+      @constants[constant_name] = feature
+    end
+
+    def constant_names
+      @constants.keys
+    end
+  end
+
+  LAZY_LOADER = LazyLoader.new
+
+  class << self
+    def const_missing(name)
+      if LAZY_LOADER.exist?(name)
+        LAZY_LOADER.load(name)
+        const_get(name)
+      else
+        super
+      end
+    end
+  end
+
+  LAZY_LOADER.register(:Adult, "datasets/adult")
+  LAZY_LOADER.register(:AFINN, "datasets/afinn")
+  LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
+  LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
+  LAZY_LOADER.register(:CIFAR, "datasets/cifar")
+  LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
+  LAZY_LOADER.register(:Communities, "datasets/communities")
+  LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
+  LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
+  LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
+  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
+  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
+  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
+  LAZY_LOADER.register(:Iris, "datasets/iris")
+  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
+  LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
+  LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
+  LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
+  LAZY_LOADER.register(:MNIST, "datasets/mnist")
+  LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+  LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                       "datasets/nagoya-university-conversation-corpus")
+  LAZY_LOADER.register(:Penguins, "datasets/penguins")
+  LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
+  LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
+  LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
+  LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
+                       "datasets/quora-duplicate-question-pair")
+  LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
+  LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
+  # For backward compatibility
+  LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
+  LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
+  LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
+  LAZY_LOADER.register(:SudachiSynonymDictionary,
+                       "datasets/sudachi-synonym-dictionary")
+  LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
+  LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
+                       "datasets/wikipedia-kyoto-japanese-english")
+  LAZY_LOADER.register(:Wine, "datasets/wine")
+end
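The new file above registers every dataset class with a feature path and resolves it on first constant reference through `Datasets.const_missing`, so `require "datasets/lazy"` pays no load cost for unused datasets. A usage sketch (`Iris` is just one example of a registered constant):

```ruby
require "datasets/lazy"

# Nothing dataset-specific is loaded yet. Referencing the constant
# triggers Datasets.const_missing, which requires "datasets/iris"
# and then resolves Datasets::Iris normally.
Datasets::Iris.new.each do |record|
  p(record.to_h)
  break  # first record only
end
```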
data/lib/datasets/nagoya-university-conversation-corpus.rb
ADDED

@@ -0,0 +1,109 @@
+require_relative 'dataset'
+require_relative 'zip-extractor'
+
+module Datasets
+  class NagoyaUniversityConversationCorpus < Dataset
+    Data = Struct.new(
+      :name,
+      :date,
+      :place,
+      :participants,
+      :relationships,
+      :note,
+      :sentences
+    )
+
+    Participant = Struct.new(
+      :id,
+      :attribute,
+      :birthplace,
+      :residence
+    )
+
+    Sentence = Struct.new(:participant_id, :content) do
+      def end?
+        participant_id.nil? and content.nil?
+      end
+    end
+
+    def initialize
+      super()
+      @metadata.id = 'nagoya-university-conversation-curpus'
+      @metadata.name = 'Nagoya University Conversation Curpus'
+      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+      @metadata.licenses = ['CC-BY-NC-ND-4.0']
+      @metadata.description = <<~DESCRIPTION
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |input_stream|
+        yield(parse_file(input_stream))
+      end
+    end
+
+    private
+
+    def open_data
+      data_path = cache_dir_path + 'nucc.zip'
+      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+      download(data_path, data_url)
+
+      extractor = ZipExtractor.new(data_path)
+      extractor.extract_files do |input_stream|
+        yield(input_stream)
+      end
+    end
+
+    def parse_file(input_stream)
+      data = Data.new
+      participants = []
+      sentences = []
+
+      input_stream.each do |input|
+        input.each_line(chomp: true) do |line|
+          line.force_encoding('utf-8')
+          if line.start_with?('@データ')
+            data.name = line[4..]
+          elsif line.start_with?('@収集年月日')
+            # mixed cases with and without ':'
+            data.date = line[6..].delete_prefix(':')
+          elsif line.start_with?('@場所')
+            data.place = line[4..]
+          elsif line.start_with?('@参加者の関係')
+            data.relationships = line.split(':', 2)[1]
+          elsif line.start_with?('@参加者')
+            participant = Participant.new
+            participant.id, profiles = line[4..].split(':', 2)
+            participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+            participants << participant
+          elsif line.start_with?('%com')
+            data.note = line.split(':', 2)[1]
+          elsif line == '@END'
+            sentence = Sentence.new
+            sentence.participant_id = nil
+            sentence.content = nil
+
+            sentences << sentence
+          else
+            sentence = Sentence.new
+            sentence.participant_id, sentence.content = line.split(':', 2)
+
+            sentences << sentence
+          end
+        end
+      end
+
+      data.participants = participants
+      data.sentences = sentences
+
+      data
+    end
+  end
+end
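A usage sketch for the new dataset class above: each record is one conversation, with `participants` and `sentences` as the structs defined in the diff (the first call downloads nucc.zip into the cache directory).

```ruby
require "datasets"

corpus = Datasets::NagoyaUniversityConversationCorpus.new
corpus.each do |conversation|
  puts(conversation.name)
  conversation.sentences.each do |sentence|
    next if sentence.end?  # the @END marker rows carry nil fields
    puts("#{sentence.participant_id}: #{sentence.content}")
  end
  break  # first conversation only
end
```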
data/lib/datasets/penguins.rb
CHANGED
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED

@@ -53,13 +53,22 @@ module Datasets
     end
 
     private
+    def base_name
+      "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
+    end
+
+    def data_path
+      cache_dir_path + base_name
+    end
+
     def open_data(&block)
-      base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
-      data_path = cache_dir_path + base_name
       data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
-
-
-
+      bz2 = Enumerator.new do |yielder|
+        download(data_path, data_url) do |bz2_chunk|
+          yielder << bz2_chunk
+        end
+      end
+      extract_bz2(bz2, &block)
     end
 
     def type_in_path
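The change above is what the news entry calls "downloading in background": `download`'s chunk callback feeds an `Enumerator`, so `extract_bz2` starts decompressing (and the caller starts parsing) while the dump is still arriving, instead of waiting for the whole file. The producer-callback-to-lazy-stream trick in isolation; `fetch_chunks` is a hypothetical stand-in for `download`:

```ruby
# A block-yielding producer, standing in for Downloader#download.
def fetch_chunks
  3.times { |i| yield("chunk-#{i}") }
end

# Wrap the callback in an Enumerator so a consumer can pull lazily.
stream = Enumerator.new do |yielder|
  fetch_chunks do |chunk|
    yielder << chunk
  end
end

stream.each { |chunk| puts(chunk) }
```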
data/lib/datasets/zip-extractor.rb
CHANGED

@@ -32,5 +32,17 @@ module Datasets
       end
       nil
     end
+
+    def extract_files
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+
+          entry.get_input_stream do |input|
+            yield(input)
+          end
+        end
+      end
+    end
   end
 end
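The new `extract_files` above yields an input stream for every file entry in the archive; the Nagoya corpus loader relies on it to read each transcript without extracting to disk. The same iteration written directly against rubyzip, with a hypothetical local archive path:

```ruby
require "zip"

Zip::File.open("nucc.zip") do |zip_file|
  zip_file.each do |entry|
    next unless entry.file?  # skip directory entries

    entry.get_input_stream do |input|
      puts("#{entry.name}: #{input.read(16).inspect}")
    end
  end
end
```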
data/lib/datasets.rb
CHANGED

@@ -1,34 +1,2 @@
-require_relative "datasets/
-
-require_relative "datasets/adult"
-require_relative "datasets/afinn"
-require_relative "datasets/aozora-bunko"
-require_relative "datasets/california-housing"
-require_relative "datasets/cifar"
-require_relative "datasets/cldr-plurals"
-require_relative "datasets/communities"
-require_relative "datasets/diamonds"
-require_relative "datasets/e-stat-japan"
-require_relative "datasets/fashion-mnist"
-require_relative "datasets/fuel-economy"
-require_relative "datasets/geolonia"
-require_relative "datasets/hepatitis"
-require_relative "datasets/iris"
-require_relative "datasets/ita-corpus"
-require_relative "datasets/kuzushiji-mnist"
-require_relative "datasets/libsvm"
-require_relative "datasets/libsvm-dataset-list"
-require_relative "datasets/livedoor-news"
-require_relative "datasets/mnist"
-require_relative "datasets/mushroom"
-require_relative "datasets/penguins"
-require_relative "datasets/penn-treebank"
-require_relative "datasets/pmjt-dataset-list"
-require_relative "datasets/postal-code-japan"
-require_relative "datasets/quora-duplicate-question-pair"
-require_relative "datasets/rdataset"
-require_relative "datasets/seaborn"
-require_relative "datasets/sudachi-synonym-dictionary"
-require_relative "datasets/wikipedia"
-require_relative "datasets/wikipedia-kyoto-japanese-english"
-require_relative "datasets/wine"
+require_relative "datasets/lazy"
+Datasets::LAZY_LOADER.load_all
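After this rewrite, requiring the gem the old way keeps its eager behavior by delegating to the lazy registry, so the two entry points differ only in when the dataset files load:

```ruby
# Eager: the pre-0.1.6 behavior, still what plain require gives you.
require "datasets"        # registers everything, then LAZY_LOADER.load_all

# Lazy: new in 0.1.6; dataset files load on first constant reference.
# require "datasets/lazy"
```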
data/test/test-geolonia.rb
CHANGED

@@ -6,7 +6,7 @@ class GeoloniaTest < Test::Unit::TestCase
   test('#each') do
     records = @dataset.each.to_a
     assert_equal([
-
+                   277616,
                    {
                      :prefecture_code => "01",
                      :prefecture_name => "北海道",

@@ -28,16 +28,16 @@ class GeoloniaTest < Test::Unit::TestCase
                      :prefecture_name => "沖縄県",
                      :prefecture_kana => "オキナワケン",
                      :prefecture_romaji => "OKINAWA KEN",
-                     :municipality_code => "
-                     :municipality_name => "
-                     :municipality_kana => "
-                     :municipality_romaji => "
-                     :street_name => "
+                     :municipality_code => "47382",
+                     :municipality_name => "八重山郡与那国町",
+                     :municipality_kana => "ヤエヤマグンヨナグニチョウ",
+                     :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
+                     :street_name => "字与那国",
                      :street_kana => nil,
                      :street_romaji => nil,
-                     :alias =>
-                     :latitude => "
-                     :longitude => "
+                     :alias => nil,
+                     :latitude => "24.455925",
+                     :longitude => "122.987678",
                    },
                  ],
                  [

@@ -55,6 +55,7 @@ class GeoloniaTest < Test::Unit::TestCase
         "## 住所データ仕様",
         "### ファイルフォーマット",
         "### 列",
+        "### ソート順",
       ],
       description.scan(/^#.*$/),
       description)
data/test/test-nagoya-university-conversation-corpus.rb
ADDED

@@ -0,0 +1,132 @@
+class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::NagoyaUniversityConversationCorpus.new
+  end
+
+  sub_test_case("each") do
+    test("#sentences") do
+      records = @dataset.each.to_a
+      first_sentences = records[0].sentences
+      last_sentences = records[-1].sentences
+      assert_equal([
+                     856,
+                     {
+                       participant_id: 'F107',
+                       content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
+                     },
+                     {
+                       participant_id: nil,
+                       content: nil
+                     },
+                     603,
+                     {
+                       participant_id: 'F007',
+                       content: 'それでは話を始めまーす。'
+                     },
+                     {
+                       participant_id: nil,
+                       content: nil
+                     }
+                   ],
+                   [
+                     first_sentences.size,
+                     first_sentences[0].to_h,
+                     first_sentences[-1].to_h,
+                     last_sentences.size,
+                     last_sentences[0].to_h,
+                     last_sentences[-1].to_h,
+                   ])
+    end
+
+    test("#participants") do
+      records = @dataset.each.to_a
+      first_participants = records[0].participants
+      last_participants = records[-1].participants
+      assert_equal([
+                     4,
+                     {
+                       id: 'F107',
+                       attribute: '女性30代後半',
+                       birthplace: '愛知県幡豆郡出身',
+                       residence: '愛知県幡豆郡在住'
+                     },
+                     {
+                       id: 'F128',
+                       attribute: '女性20代前半',
+                       birthplace: '愛知県西尾市出身',
+                       residence: '西尾市在住'
+                     },
+                     2,
+                     {
+                       id: 'F007',
+                       attribute: '女性50代後半',
+                       birthplace: '東京都出身',
+                       residence: '東京都国分寺市在住'
+                     },
+                     {
+                       id: 'F003',
+                       attribute: '女性80代後半',
+                       birthplace: '栃木県宇都宮市出身',
+                       residence: '国分寺市在住'
+                     }
+                   ],
+                   [
+                     first_participants.size,
+                     first_participants[0].to_h,
+                     first_participants[-1].to_h,
+                     last_participants.size,
+                     last_participants[0].to_h,
+                     last_participants[-1].to_h
+                   ])
+    end
+
+    test("others") do
+      records = @dataset.each.to_a
+      assert_equal([
+                     129,
+                     [
+                       '1(約35分)',
+                       '2001年10月16日',
+                       'ファミリーレストラン',
+                       '英会話教室の友人',
+                       nil
+                     ],
+                     [
+                       '129(36分)',
+                       '2003年2月16日',
+                       '二人の自宅',
+                       '母と娘',
+                       'F007は東京に38年、F003は東京に60年居住。'
+                     ]
+                   ],
+                   [
+                     records.size,
+                     [
+                       records[0].name,
+                       records[0].date,
+                       records[0].place,
+                       records[0].relationships,
+                       records[0].note
+                     ],
+                     [
+                       records[-1].name,
+                       records[-1].date,
+                       records[-1].place,
+                       records[-1].relationships,
+                       records[-1].note
+                     ]
+                   ])
+    end
+  end
+
+  sub_test_case("#metadata") do
+    test("#description") do
+      description = @dataset.metadata.description
+      assert_equal(<<~DESCRIPTION, description)
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+  end
+end
data/test/test-rdataset.rb
CHANGED

@@ -8,7 +8,7 @@ class RdatasetTest < Test::Unit::TestCase
   test("with package_name") do
     records = @dataset.filter(package: "datasets").to_a
     assert_equal([
-
+                   102,
                    {
                      package: "datasets",
                      dataset: "ability.cov",

@@ -48,7 +48,7 @@ class RdatasetTest < Test::Unit::TestCase
   test("without package_name") do
     records = @dataset.each.to_a
     assert_equal([
-
+                   2142,
                    {
                      package: "AER",
                      dataset: "Affairs",
data/test/test-seaborn.rb
CHANGED

data/test/test-sudachi-synonym-dictionary.rb
CHANGED

@@ -6,7 +6,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
   test('#each') do
     records = @dataset.each.to_a
     assert_equal([
-
+                   65206,
                    {
                      group_id: "000001",
                      is_noun: true,

@@ -19,7 +19,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                      notation: "曖昧",
                    },
                    {
-                     group_id: "
+                     group_id: "024916",
                      is_noun: true,
                      expansion_type: :expanded,
                      lexeme_id: 1,

@@ -27,7 +27,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
                      acronym_type: :alphabet,
                      variant_type: :typical,
                      categories: ["ビジネス"],
-                     notation: "
+                     notation: "SCM",
                    },
                  ],
                  [
data/test/test-wikipedia.rb
CHANGED

@@ -1,100 +1,54 @@
 class WikipediaTest < Test::Unit::TestCase
-  sub_test_case("
+  sub_test_case("en") do
     sub_test_case("articles") do
-      include Helper::Sandbox
-
       def setup
-
-        @dataset = Datasets::Wikipedia.new(language: :ja,
+        @dataset = Datasets::Wikipedia.new(language: :en,
                                            type: :articles)
-        def @dataset.cache_dir_path
-          @cache_dir_path
-        end
-        def @dataset.cache_dir_path=(path)
-          @cache_dir_path = path
-        end
-        @dataset.cache_dir_path = @tmp_dir
-      end
-
-      def teardown
-        teardown_sandbox
       end
 
       test("#each") do
-
-        xml_path = output_path.sub_ext("")
-        xml_path.open("w") do |xml_file|
-          xml_file.puts(<<-XML)
-<mediawiki
-  xmlns="http://www.mediawiki.org/xml/export-0.10/"
-  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
-  version="0.10" xml:lang="ja">
-  <siteinfo>
-    <sitename>Wikipedia</sitename>
-  </siteinfo>
-  <page>
-    <title>タイトル</title>
-    <ns>4</ns>
-    <id>1</id>
-    <restrictions>sysop</restrictions>
-    <revision>
-      <id>3</id>
-      <parentid>2</parentid>
-      <timestamp>2004-04-30T14:46:00Z</timestamp>
-      <contributor>
-        <username>user</username>
-        <id>10</id>
-      </contributor>
-      <minor />
-      <comment>コメント</comment>
-      <model>wikitext</model>
-      <format>text/x-wiki</format>
-      <text xml:space="preserve">テキスト</text>
-      <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
-    </revision>
-  </page>
-</mediawiki>
-          XML
-        end
-        unless system("bzip2", xml_path.to_s)
-          raise "failed to run bzip2"
-        end
-
-        contributor = Datasets::Wikipedia::Contributor.new("user", 10)
+        contributor = Datasets::Wikipedia::Contributor.new("Elli", 20842734)
         revision = Datasets::Wikipedia::Revision.new
-        revision.id =
-        revision.parent_id =
-        revision.timestamp = Time.iso8601("
+        revision.id = 1002250816
+        revision.parent_id = 854851586
+        revision.timestamp = Time.iso8601("2021-01-23T15:15:01Z")
        revision.contributor = contributor
-        revision.comment = "
+        revision.comment = "shel"
        revision.model = "wikitext"
        revision.format = "text/x-wiki"
-        revision.text =
-
+        revision.text = <<-TEXT.chomp
+#REDIRECT [[Computer accessibility]]
+
+{{rcat shell|
+{{R from move}}
+{{R from CamelCase}}
+{{R unprintworthy}}
+}}
+        TEXT
+        revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
        page = Datasets::Wikipedia::Page.new
-        page.title = "
-        page.namespace =
-        page.id =
-        page.restrictions =
+        page.title = "AccessibleComputing"
+        page.namespace = 0
+        page.id = 10
+        page.restrictions = nil
+        page.redirect = "Computer accessibility"
        page.revision = revision
        assert_equal(page, @dataset.each.first)
      end
 
      sub_test_case("#metadata") do
        test("#id") do
-          assert_equal("wikipedia-
+          assert_equal("wikipedia-en-articles",
                       @dataset.metadata.id)
        end
 
        test("#name") do
-          assert_equal("Wikipedia articles (
+          assert_equal("Wikipedia articles (en)",
                       @dataset.metadata.name)
        end
 
        test("#description") do
-          assert_equal("Wikipedia articles in
+          assert_equal("Wikipedia articles in en",
                       @dataset.metadata.description)
        end
      end
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2023-05-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: csv
|
@@ -163,6 +163,7 @@ files:
|
|
163
163
|
- lib/datasets/iris.rb
|
164
164
|
- lib/datasets/ita-corpus.rb
|
165
165
|
- lib/datasets/kuzushiji-mnist.rb
|
166
|
+
- lib/datasets/lazy.rb
|
166
167
|
- lib/datasets/libsvm-dataset-list.rb
|
167
168
|
- lib/datasets/libsvm.rb
|
168
169
|
- lib/datasets/license.rb
|
@@ -170,6 +171,7 @@ files:
|
|
170
171
|
- lib/datasets/metadata.rb
|
171
172
|
- lib/datasets/mnist.rb
|
172
173
|
- lib/datasets/mushroom.rb
|
174
|
+
- lib/datasets/nagoya-university-conversation-corpus.rb
|
173
175
|
- lib/datasets/penguins.rb
|
174
176
|
- lib/datasets/penn-treebank.rb
|
175
177
|
- lib/datasets/pmjt-dataset-list.rb
|
@@ -214,6 +216,7 @@ files:
|
|
214
216
|
- test/test-metadata.rb
|
215
217
|
- test/test-mnist.rb
|
216
218
|
- test/test-mushroom.rb
|
219
|
+
- test/test-nagoya-university-conversation-corpus.rb
|
217
220
|
- test/test-penguins.rb
|
218
221
|
- test/test-penn-treebank.rb
|
219
222
|
- test/test-pmjt-dataset-list.rb
|
@@ -245,7 +248,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
245
248
|
- !ruby/object:Gem::Version
|
246
249
|
version: '0'
|
247
250
|
requirements: []
|
248
|
-
rubygems_version: 3.
|
251
|
+
rubygems_version: 3.5.0.dev
|
249
252
|
signing_key:
|
250
253
|
specification_version: 4
|
251
254
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
@@ -278,6 +281,7 @@ test_files:
|
|
278
281
|
- test/test-metadata.rb
|
279
282
|
- test/test-mnist.rb
|
280
283
|
- test/test-mushroom.rb
|
284
|
+
- test/test-nagoya-university-conversation-corpus.rb
|
281
285
|
- test/test-penguins.rb
|
282
286
|
- test/test-penn-treebank.rb
|
283
287
|
- test/test-pmjt-dataset-list.rb
|