red-datasets 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/doc/text/news.md +7 -1
- data/lib/datasets/dataset.rb +3 -14
- data/lib/datasets/downloader.rb +64 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +178 -0
- data/lib/datasets.rb +1 -0
- data/test/helper.rb +16 -0
- data/test/test-wikipedia.rb +98 -0
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 47a9f2cf4c17f8b64f0a88dc7738affbbcd316a0
|
4
|
+
data.tar.gz: 7255596f70ff903f9103b3d72e78799a622e626e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e4065c07b451443e7ea2ff395144ecce0d9fa09e2a48979ba3b35c0cd0f18e48b27858a608d9b52e42bb112072a8fee1bffabbad33572028e25ef4aa163fb4cd
|
7
|
+
data.tar.gz: 2e76c88a1bc14ffd4d4808d05a8c395415fa34dbb8b1189dbbe8e0afb8b573dd20b1f06338bd82f9cd01d413c1f0e83130748c7bffd3e623ac4e2bbe826bc9c2
|
data/doc/text/news.md
CHANGED
data/lib/datasets/dataset.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
|
-
require "fileutils"
|
2
|
-
require "open-uri"
|
3
1
|
require "pathname"
|
4
2
|
|
3
|
+
require_relative "downloader"
|
5
4
|
require_relative "metadata"
|
6
5
|
|
7
6
|
module Datasets
|
@@ -25,18 +24,8 @@ module Datasets
|
|
25
24
|
end
|
26
25
|
|
27
26
|
def download(output_path, url)
  # Fetch `url` into `output_path`, delegating the transfer, progress
  # display, and partial-file cleanup on failure to Downloader.
  Downloader.new(url).download(output_path)
end
|
41
30
|
end
|
42
31
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "fileutils"
require "open-uri"

module Datasets
  # Downloads a URL to a local file, rendering a progress bar on
  # standard error when it is attached to a terminal.
  class Downloader
    # @param url [String, URI::Generic] the resource to fetch;
    #   strings are parsed into URI objects.
    def initialize(url)
      @url = url.is_a?(URI::Generic) ? url : URI.parse(url)
    end

    # Fetches the URL into +output_path+ (a Pathname), creating parent
    # directories as needed. A partially written file is removed before
    # the exception is re-raised.
    def download(output_path)
      output_path.parent.mkpath

      begin
        @url.open(progress_options) do |input|
          output_path.open("wb") do |output|
            IO.copy_stream(input, output)
          end
        end
      rescue
        FileUtils.rm_f(output_path)
        raise
      end
    end

    private
    # open-uri callbacks that print "name - 042.00% [cur/total]" to
    # $stderr; returns {} when $stderr is redirected or not a TTY.
    def progress_options
      return {} unless $stderr == STDERR && $stderr.tty?

      total = nil
      file_name = @url.path.split("/").last
      {
        :content_length_proc => lambda do |content_length|
          total = content_length
        end,
        :progress_proc => lambda do |current|
          next unless total # server may not send Content-Length
          percent = (current / total.to_f) * 100
          formatted_size = "[%s/%s]" % [format_size(current),
                                        format_size(total)]
          $stderr.print("\r%s - %06.2f%% %s" %
                        [file_name, percent, formatted_size])
          $stderr.puts if current == total # finish the line at 100%
        end,
      }
    end

    # Human-readable byte count using binary (1024-based) units.
    def format_size(size)
      return "%d" % size if size < 1024
      kib = size.to_f / 1024
      return "%7.2fKiB" % kib if kib < 1024
      mib = kib / 1024
      return "%7.2fMiB" % mib if mib < 1024
      gib = mib / 1024
      return "%7.2fGiB" % gib if gib < 1024
      "%.2fTiB" % (gib / 1024)
    end
  end
end
|
data/lib/datasets/version.rb
CHANGED
@@ -0,0 +1,178 @@
|
|
1
|
+
require "rexml/streamlistener"
require "rexml/parsers/baseparser"
require "rexml/parsers/streamparser"
# Needed for Time.iso8601 below; previously this only worked because
# open-uri happens to require "time" transitively.
require "time"

require_relative "dataset"

module Datasets
  # Streams pages out of a Wikipedia XML dump
  # (https://dumps.wikimedia.org/) for a given language edition and
  # dump type, without loading the whole dump into memory.
  class Wikipedia < Dataset
    Contributor = Struct.new(:user_name,
                             :id)
    Revision = Struct.new(:id,
                          :parent_id,
                          :timestamp,
                          :contributor,
                          :minor,
                          :comment,
                          :model,
                          :format,
                          :text,
                          :sha1)
    Page = Struct.new(:title,
                      :namespace,
                      :id,
                      :restrictions,
                      :redirect,
                      :revision)

    # @param language [Symbol] Wikipedia language edition (e.g. :en, :ja)
    # @param type [Symbol] dump type; :articles maps to "pages-articles"
    #   in the dump file name, anything else is used verbatim
    def initialize(language: :en,
                   type: :articles)
      super()
      @language = language
      @type = type
      @metadata.name = "wikipedia-#{@language}-#{@type}"
      @metadata.url = "https://dumps.wikimedia.org/"
      @metadata.description = "Wikipedia #{@type} (#{@language})"
    end

    # Yields each parsed Page struct; returns an Enumerator when no
    # block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      open_data do |input|
        listener = ArticlesListener.new(block)
        parser = REXML::Parsers::StreamParser.new(input, listener)
        parser.parse
      end
    end

    private
    # Downloads the compressed dump on first use (cached under
    # cache_dir_path), then yields an IO streaming the decompressed XML
    # through an external "bzcat" process.
    def open_data
      base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
      data_path = cache_dir_path + base_name
      unless data_path.exist?
        data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
        download(data_path, data_url)
      end

      input, output = IO.pipe
      pid = spawn("bzcat", data_path.to_s, {:out => output})
      begin
        output.close # parent keeps only the read end of the pipe
        yield(input)
      ensure
        input.close
        Process.waitpid(pid) # reap bzcat to avoid a zombie process
      end
    end

    # Dump-type component of the dump file name.
    def type_in_path
      case @type
      when :articles
        "pages-articles"
      else
        @type.to_s
      end
    end

    # SAX-style listener that assembles Page/Revision/Contributor
    # structs from XML events and hands each completed page to a block.
    class ArticlesListener
      include REXML::StreamListener

      def initialize(block)
        @block = block
        @page = nil
        @revision = nil
        @contributor = nil
        # Parallel stacks: current element path and the accumulated
        # character data for each open element.
        @tag_stack = []
        @text_stack = [""]
      end

      def tag_start(name, attributes)
        push_stacks(name)
        case name
        when "page"
          @page = Page.new
        when "revision"
          @revision = Revision.new
        when "contributor"
          @contributor = Contributor.new
        when "redirect"
          @page.redirect = attributes["title"]
        end
      end

      def tag_end(name)
        case name
        when "page"
          on_page(@page)
          @page = nil
        when "title"
          @page.title = @text_stack.last
        when "ns"
          @page.namespace = Integer(@text_stack.last)
        when "id"
          # <id> appears under <page>, <revision> and <contributor>;
          # the enclosing tag decides which struct receives it.
          id = Integer(@text_stack.last)
          case @tag_stack[-2]
          when "page"
            @page.id = id
          when "revision"
            @revision.id = id
          when "contributor"
            @contributor.id = id
          end
        when "restrictions"
          @page.restrictions = @text_stack.last.split(":")
        when "revision"
          @page.revision = @revision
          @revision = nil
        when "parentid"
          @revision.parent_id = Integer(@text_stack.last)
        when "timestamp"
          @revision.timestamp = Time.iso8601(@text_stack.last)
        when "contributor"
          @revision.contributor = @contributor
          @contributor = nil
        when "username"
          @contributor.user_name = @text_stack.last
        when "minor"
          # TODO
        when "comment"
          @revision.comment = @text_stack.last
        when "model"
          @revision.model = @text_stack.last
        when "format"
          @revision.format = @text_stack.last
        when "text"
          @revision.text = @text_stack.last
        when "sha1"
          @revision.sha1 = @text_stack.last
        end
        pop_stacks
      end

      def text(data)
        @text_stack.last << data
      end

      # CDATA sections contribute to the current element's text as well.
      # Fixed: the parameter was misspelled "contnet" while the body
      # read "content", so any CDATA in a dump raised NameError.
      def cdata(content)
        @text_stack.last << content
      end

      private
      def on_page(page)
        @block.call(page)
      end

      def push_stacks(tag)
        @tag_stack << tag
        @text_stack << ""
      end

      def pop_stacks
        @text_stack.pop
        @tag_stack.pop
      end
    end
  end
end
|
data/lib/datasets.rb
CHANGED
data/test/helper.rb
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
require "pathname"
|
3
|
+
|
1
4
|
require "datasets"
|
2
5
|
|
3
6
|
require "test-unit"
|
7
|
+
|
8
|
+
module Helper
  # Mixin giving tests a throw-away working directory under test/tmp.
  module Sandbox
    # Creates the sandbox directory (absolute path, remembered in
    # @tmp_dir), creating intermediate directories as needed.
    def setup_sandbox
      @tmp_dir = Pathname.new(File.expand_path("tmp", __dir__))
      FileUtils.mkdir_p(@tmp_dir)
    end

    # Removes the sandbox directory and everything inside it.
    def teardown_sandbox
      FileUtils.rm_rf(@tmp_dir)
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
class WikipediaTest < Test::Unit::TestCase
  sub_test_case("ja") do
    sub_test_case("articles") do
      include Helper::Sandbox

      def setup
        setup_sandbox
        @dataset = Datasets::Wikipedia.new(language: :ja,
                                           type: :articles)
        # Point the dataset's cache directory into the sandbox so the
        # test never touches the user's real cache.
        def @dataset.cache_dir_path
          @cache_dir_path
        end
        def @dataset.cache_dir_path=(path)
          @cache_dir_path = path
        end
        @dataset.cache_dir_path = @tmp_dir
      end

      def teardown
        teardown_sandbox
      end

      test("#each") do
        # Stub the network download: write a one-page dump fixture and
        # compress it with bzip2, matching the expected .xml.bz2 name.
        def @dataset.download(output_path, url)
          xml_path = output_path.sub_ext("")
          xml_path.open("w") do |xml_file|
            xml_file.puts(<<-XML)
<mediawiki
    xmlns="http://www.mediawiki.org/xml/export-0.10/"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
    version="0.10" xml:lang="ja">
  <siteinfo>
    <sitename>Wikipedia</sitename>
  </siteinfo>
  <page>
    <title>タイトル</title>
    <ns>4</ns>
    <id>1</id>
    <restrictions>sysop</restrictions>
    <revision>
      <id>3</id>
      <parentid>2</parentid>
      <timestamp>2004-04-30T14:46:00Z</timestamp>
      <contributor>
        <username>user</username>
        <id>10</id>
      </contributor>
      <minor />
      <comment>コメント</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">テキスト</text>
      <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
    </revision>
  </page>
</mediawiki>
            XML
          end
          unless system("bzip2", xml_path.to_s)
            raise "failed to run bzip2"
          end
        end

        # Expected structs built positionally; unset fields (minor,
        # redirect) stay nil, matching what the parser produces.
        expected_contributor = Datasets::Wikipedia::Contributor.new("user", 10)
        expected_revision =
          Datasets::Wikipedia::Revision.new(3,
                                            2,
                                            Time.iso8601("2004-04-30T14:46:00Z"),
                                            expected_contributor,
                                            nil,
                                            "コメント",
                                            "wikitext",
                                            "text/x-wiki",
                                            "テキスト",
                                            "a9674b19f8c56f785c91a555d0a144522bb318e6")
        expected_page =
          Datasets::Wikipedia::Page.new("タイトル",
                                        4,
                                        1,
                                        ["sysop"],
                                        nil,
                                        expected_revision)
        assert_equal(expected_page, @dataset.each.first)
      end

      sub_test_case("#metadata") do
        test("#name") do
          assert_equal("wikipedia-ja-articles",
                       @dataset.metadata.name)
        end

        test("#description") do
          assert_equal("Wikipedia articles (ja)",
                       @dataset.metadata.description)
        end
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-02-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -100,13 +100,16 @@ files:
|
|
100
100
|
- doc/text/news.md
|
101
101
|
- lib/datasets.rb
|
102
102
|
- lib/datasets/dataset.rb
|
103
|
+
- lib/datasets/downloader.rb
|
103
104
|
- lib/datasets/iris.rb
|
104
105
|
- lib/datasets/metadata.rb
|
105
106
|
- lib/datasets/version.rb
|
107
|
+
- lib/datasets/wikipedia.rb
|
106
108
|
- red-datasets.gemspec
|
107
109
|
- test/helper.rb
|
108
110
|
- test/run-test.rb
|
109
111
|
- test/test-iris.rb
|
112
|
+
- test/test-wikipedia.rb
|
110
113
|
homepage: https://github.com/red-data-tools/red-datasets
|
111
114
|
licenses:
|
112
115
|
- MIT
|
@@ -127,11 +130,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
127
130
|
version: '0'
|
128
131
|
requirements: []
|
129
132
|
rubyforge_project:
|
130
|
-
rubygems_version: 2.
|
133
|
+
rubygems_version: 2.5.2.2
|
131
134
|
signing_key:
|
132
135
|
specification_version: 4
|
133
136
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
134
137
|
test_files:
|
135
138
|
- test/test-iris.rb
|
136
|
-
- test/
|
139
|
+
- test/test-wikipedia.rb
|
137
140
|
- test/helper.rb
|
141
|
+
- test/run-test.rb
|