red-datasets 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: 21e6ccf743e4603d55a8c628213d3e8278b4c9a29f4914887cc4f148065e6edc
4
- data.tar.gz: 5d45f1ed6e5e2ec6620243641322323b78d22bb52c1f06ed3b0a0fe2555a7dd7
2
+ SHA1:
3
+ metadata.gz: 47a9f2cf4c17f8b64f0a88dc7738affbbcd316a0
4
+ data.tar.gz: 7255596f70ff903f9103b3d72e78799a622e626e
5
5
  SHA512:
6
- metadata.gz: 791bb9a4953a6c7667e95ff8b479180e91ce25f4f090a55a119118884caf59482ec6cff3993486427e85156228041c2ece0734312210079a7a057067d709d277
7
- data.tar.gz: 8f76e7a5d781d85767d1b476dac26e9b954d556e9e54d689b1cdbe0f881a8f165b6188cce072857b19b3f7c5f57537b82b962fb7c74a35102aad396cb9617ddd
6
+ metadata.gz: e4065c07b451443e7ea2ff395144ecce0d9fa09e2a48979ba3b35c0cd0f18e48b27858a608d9b52e42bb112072a8fee1bffabbad33572028e25ef4aa163fb4cd
7
+ data.tar.gz: 2e76c88a1bc14ffd4d4808d05a8c395415fa34dbb8b1189dbbe8e0afb8b573dd20b1f06338bd82f9cd01d413c1f0e83130748c7bffd3e623ac4e2bbe826bc9c2
data/doc/text/news.md CHANGED
@@ -1,6 +1,12 @@
1
1
  # News
2
2
 
3
- ## 0.0.1 2018-01-08
3
+ ## 0.0.2 - 2018-02-06
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::Wikipedia`: Added a dataset for Wikipedia.
8
+
9
+ ## 0.0.1 - 2018-01-08
4
10
 
5
11
  ### Improvements
6
12
 
@@ -1,7 +1,6 @@
1
- require "fileutils"
2
- require "open-uri"
3
1
  require "pathname"
4
2
 
3
+ require_relative "downloader"
5
4
  require_relative "metadata"
6
5
 
7
6
  module Datasets
@@ -25,18 +24,8 @@ module Datasets
25
24
  end
26
25
 
27
26
  def download(output_path, url)
28
- url = URI.parse(url) unless url.is_a?(URI::Generic)
29
- output_path.parent.mkpath
30
- begin
31
- url.open do |input|
32
- output_path.open("wb") do |output|
33
- IO.copy_stream(input, output)
34
- end
35
- end
36
- rescue
37
- FileUtils.rm_f(output_path)
38
- raise
39
- end
27
+ downloader = Downloader.new(url)
28
+ downloader.download(output_path)
40
29
  end
41
30
  end
42
31
  end
@@ -0,0 +1,64 @@
1
+ require "fileutils"
2
+ require "open-uri"
3
+
4
module Datasets
  # Downloads one remote resource to a local file. When standard error is
  # the process's real STDERR and is attached to a terminal, a progress line
  # is printed there while the transfer runs.
  class Downloader
    # @param url [String, URI::Generic] location of the resource. A value
    #   that is not already a URI is parsed with URI.parse.
    def initialize(url)
      url = URI.parse(url) unless url.is_a?(URI::Generic)
      @url = url
    end

    # Fetches the URL and writes the body to +output_path+, creating the
    # parent directory first.
    #
    # @param output_path [Pathname] destination file.
    # @return [Integer] whatever IO.copy_stream returns (bytes copied).
    # @raise [Exception] anything raised while opening the URL or copying is
    #   propagated unchanged after the destination file has been removed.
    def download(output_path)
      output_path.parent.mkpath

      completed = false
      begin
        copied = @url.open(progress_options) do |input|
          output_path.open("wb") do |output|
            IO.copy_stream(input, output)
          end
        end
        completed = true
        copied
      ensure
        # A bare `rescue` only sees StandardError, so Ctrl-C (Interrupt) or a
        # timeout would leave a truncated file behind. Cleaning up in
        # `ensure` removes it for every kind of abnormal exit.
        FileUtils.rm_f(output_path) unless completed
      end
    end

    private
    # Builds the open-uri options. Returns an empty Hash unless progress can
    # be shown on an interactive STDERR.
    def progress_options
      return {} unless $stderr == STDERR && $stderr.tty?

      max = nil
      base_name = @url.path.split("/").last
      content_length_proc = lambda do |content_length|
        max = content_length
      end
      progress_proc = lambda do |current|
        # No total size reported means no percentage can be computed.
        next unless max

        # A zero-length body would otherwise divide by zero and print NaN.
        percent = max.zero? ? 100.0 : (current / max.to_f) * 100
        formatted_size = "[%s/%s]" % [format_size(current), format_size(max)]
        $stderr.print("\r%s - %06.2f%% %s" %
                      [base_name, percent, formatted_size])
        # Finish the in-place progress line once everything has arrived.
        $stderr.puts if current == max
      end
      {
        :content_length_proc => content_length_proc,
        :progress_proc => progress_proc,
      }
    end

    # Formats a byte count with a binary unit suffix; values below 1024 are
    # printed as a bare integer.
    def format_size(size)
      if size < 1024
        "%d" % size
      elsif size < (1024 ** 2)
        "%7.2fKiB" % (size.to_f / 1024)
      elsif size < (1024 ** 3)
        "%7.2fMiB" % (size.to_f / (1024 ** 2))
      elsif size < (1024 ** 4)
        "%7.2fGiB" % (size.to_f / (1024 ** 3))
      else
        "%.2fTiB" % (size.to_f / (1024 ** 4))
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,178 @@
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
4
+
5
+ require_relative "dataset"
6
+
7
+ module Datasets
8
+ class Wikipedia < Dataset
9
# Record types produced while parsing a dump. Field order is part of the
# interface because Struct instances can be built positionally.

# The user recorded on a revision.
Contributor = Struct.new(:user_name, :id)

# A single revision of a page.
Revision = Struct.new(
  :id, :parent_id, :timestamp, :contributor, :minor,
  :comment, :model, :format, :text, :sha1
)

# A page together with the revision found for it.
Page = Struct.new(
  :title, :namespace, :id, :restrictions, :redirect, :revision
)
27
+
28
# Selects which dump to read and fills in the dataset metadata.
#
# @param language [Symbol, String] language code interpolated into the
#   metadata name and description.
# @param type [Symbol, String] kind of dump.
#
# NOTE(review): @metadata is presumably created by the superclass
# initializer; confirm against Dataset.
def initialize(language: :en, type: :articles)
  super()
  @language, @type = language, type
  @metadata.name = "wikipedia-#{@language}-#{@type}"
  @metadata.url = "https://dumps.wikimedia.org/"
  @metadata.description = "Wikipedia #{@type} (#{@language})"
end
37
+
38
# Streams the dump through a REXML stream parser, handing each completed
# page to the block. Without a block an Enumerator for this method is
# returned instead.
def each(&block)
  if block
    open_data do |input|
      page_listener = ArticlesListener.new(block)
      REXML::Parsers::StreamParser.new(input, page_listener).parse
    end
  else
    to_enum(__method__)
  end
end
47
+
48
+ private
49
# Makes sure the compressed dump is present in the cache directory
# (downloading it when the file does not exist yet) and yields an IO that
# reads its decompressed contents.
#
# @yieldparam input [IO] read end of a pipe fed by `bzcat`.
# @return the value of the block.
def open_data
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
  data_path = cache_dir_path + base_name
  unless data_path.exist?
    data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
    download(data_path, data_url)
  end

  # The array form executes bzcat directly, without a shell. The block form
  # closes the pipe and waits for the child on every exit path, including
  # when bzcat cannot be started, so no pipe end or zombie process is left.
  IO.popen(["bzcat", data_path.to_s]) do |input|
    yield(input)
  end
end
67
+
68
# Maps the dataset type to the fragment used in dump file names. Only
# :articles has a special spelling; any other type is used as its string
# form.
def type_in_path
  special_names = {:articles => "pages-articles"}
  special_names.fetch(@type) { @type.to_s }
end
76
+
77
# REXML stream listener that assembles Page, Revision and Contributor
# records from the parser events and passes each finished page to a
# callable.
#
# Two parallel stacks are kept: @tag_stack holds the names of the currently
# open elements and @text_stack holds the character data collected so far
# for each of them (plus one bottom entry for text outside any element).
class ArticlesListener
  include REXML::StreamListener

  # @param block [#call] called with every completed Page.
  def initialize(block)
    @block = block
    @page = nil
    @revision = nil
    @contributor = nil
    @tag_stack = []
    @text_stack = [""]
  end

  # Opens an element: starts a fresh record for container elements and
  # captures the redirect target, which is carried in an attribute.
  def tag_start(name, attributes)
    push_stacks(name)
    case name
    when "page"
      @page = Page.new
    when "revision"
      @revision = Revision.new
    when "contributor"
      @contributor = Contributor.new
    when "redirect"
      @page.redirect = attributes["title"]
    end
  end

  # Closes an element: stores the text collected for it on the record that
  # is being built, or finishes that record.
  #
  # @raise [ArgumentError] when a numeric element does not hold a decimal
  #   integer, or a timestamp is not valid ISO 8601.
  def tag_end(name)
    text = @text_stack.last
    case name
    when "page"
      on_page(@page)
      @page = nil
    when "title"
      @page.title = text
    when "ns"
      @page.namespace = parse_integer(text)
    when "id"
      id = parse_integer(text)
      # <id> appears under several parents; the enclosing element decides
      # which record the value belongs to.
      case @tag_stack[-2]
      when "page"
        @page.id = id
      when "revision"
        @revision.id = id
      when "contributor"
        @contributor.id = id
      end
    when "restrictions"
      @page.restrictions = text.split(":")
    when "revision"
      @page.revision = @revision
      @revision = nil
    when "parentid"
      @revision.parent_id = parse_integer(text)
    when "timestamp"
      @revision.timestamp = Time.iso8601(text)
    when "contributor"
      @revision.contributor = @contributor
      @contributor = nil
    when "username"
      @contributor.user_name = text
    when "minor"
      # TODO: Revision#minor is not populated yet.
    when "comment"
      @revision.comment = text
    when "model"
      @revision.model = text
    when "format"
      @revision.format = text
    when "text"
      @revision.text = text
    when "sha1"
      @revision.sha1 = text
    end
    pop_stacks
  end

  # Appends character data to the innermost open element.
  def text(data)
    @text_stack.last << data
  end

  # Appends the contents of a CDATA section to the innermost open element.
  def cdata(content)
    @text_stack.last << content
  end

  private
  def on_page(page)
    @block.call(page)
  end

  # Explicit base 10 so that a leading zero is not read as an octal prefix.
  def parse_integer(text)
    Integer(text, 10)
  end

  def push_stacks(tag)
    @tag_stack << tag
    @text_stack << ""
  end

  def pop_stacks
    @text_stack.pop
    @tag_stack.pop
  end
end
177
+ end
178
+ end
data/lib/datasets.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  require "datasets/version"
2
2
 
3
3
  require "datasets/iris"
4
+ require "datasets/wikipedia"
data/test/helper.rb CHANGED
@@ -1,3 +1,19 @@
1
+ require "fileutils"
2
+ require "pathname"
3
+
1
4
  require "datasets"
2
5
 
3
6
  require "test-unit"
7
+
8
module Helper
  # Mix-in for test cases that need a scratch directory ("tmp" next to this
  # file) that is created before and removed after each test.
  module Sandbox
    # Creates the scratch directory and remembers its absolute path in
    # @tmp_dir.
    def setup_sandbox
      @tmp_dir = (Pathname.new(__dir__) + "tmp").expand_path
      FileUtils.mkdir_p(@tmp_dir)
    end

    # Removes the scratch directory and everything in it. Does nothing when
    # setup_sandbox never assigned @tmp_dir, so that a failed setup is not
    # masked by a TypeError from FileUtils.rm_rf(nil).
    def teardown_sandbox
      return if @tmp_dir.nil?

      FileUtils.rm_rf(@tmp_dir)
    end
  end
end
@@ -0,0 +1,98 @@
1
+ class WikipediaTest < Test::Unit::TestCase
2
+ sub_test_case("ja") do
3
+ sub_test_case("articles") do
4
+ include Helper::Sandbox
5
+
6
+ def setup
7
+ setup_sandbox
8
+ @dataset = Datasets::Wikipedia.new(language: :ja,
9
+ type: :articles)
10
+ def @dataset.cache_dir_path
11
+ @cache_dir_path
12
+ end
13
+ def @dataset.cache_dir_path=(path)
14
+ @cache_dir_path = path
15
+ end
16
+ @dataset.cache_dir_path = @tmp_dir
17
+ end
18
+
19
+ def teardown
20
+ teardown_sandbox
21
+ end
22
+
23
+ test("#each") do
24
+ def @dataset.download(output_path, url)
25
+ xml_path = output_path.sub_ext("")
26
+ xml_path.open("w") do |xml_file|
27
+ xml_file.puts(<<-XML)
28
+ <mediawiki
29
+ xmlns="http://www.mediawiki.org/xml/export-0.10/"
30
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
31
+ xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
32
+ version="0.10" xml:lang="ja">
33
+ <siteinfo>
34
+ <sitename>Wikipedia</sitename>
35
+ </siteinfo>
36
+ <page>
37
+ <title>タイトル</title>
38
+ <ns>4</ns>
39
+ <id>1</id>
40
+ <restrictions>sysop</restrictions>
41
+ <revision>
42
+ <id>3</id>
43
+ <parentid>2</parentid>
44
+ <timestamp>2004-04-30T14:46:00Z</timestamp>
45
+ <contributor>
46
+ <username>user</username>
47
+ <id>10</id>
48
+ </contributor>
49
+ <minor />
50
+ <comment>コメント</comment>
51
+ <model>wikitext</model>
52
+ <format>text/x-wiki</format>
53
+ <text xml:space="preserve">テキスト</text>
54
+ <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
55
+ </revision>
56
+ </page>
57
+ </mediawiki>
58
+ XML
59
+ end
60
+ unless system("bzip2", xml_path.to_s)
61
+ raise "failed to run bzip2"
62
+ end
63
+ end
64
+
65
+ contributor = Datasets::Wikipedia::Contributor.new("user", 10)
66
+ revision = Datasets::Wikipedia::Revision.new
67
+ revision.id = 3
68
+ revision.parent_id = 2
69
+ revision.timestamp = Time.iso8601("2004-04-30T14:46:00Z")
70
+ revision.contributor = contributor
71
+ revision.comment = "コメント"
72
+ revision.model = "wikitext"
73
+ revision.format = "text/x-wiki"
74
+ revision.text = "テキスト"
75
+ revision.sha1 = "a9674b19f8c56f785c91a555d0a144522bb318e6"
76
+ page = Datasets::Wikipedia::Page.new
77
+ page.title = "タイトル"
78
+ page.namespace = 4
79
+ page.id = 1
80
+ page.restrictions = ["sysop"]
81
+ page.revision = revision
82
+ assert_equal(page, @dataset.each.first)
83
+ end
84
+
85
+ sub_test_case("#metadata") do
86
+ test("#name") do
87
+ assert_equal("wikipedia-ja-articles",
88
+ @dataset.metadata.name)
89
+ end
90
+
91
+ test("#description") do
92
+ assert_equal("Wikipedia articles (ja)",
93
+ @dataset.metadata.description)
94
+ end
95
+ end
96
+ end
97
+ end
98
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-01-08 00:00:00.000000000 Z
12
+ date: 2018-02-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -100,13 +100,16 @@ files:
100
100
  - doc/text/news.md
101
101
  - lib/datasets.rb
102
102
  - lib/datasets/dataset.rb
103
+ - lib/datasets/downloader.rb
103
104
  - lib/datasets/iris.rb
104
105
  - lib/datasets/metadata.rb
105
106
  - lib/datasets/version.rb
107
+ - lib/datasets/wikipedia.rb
106
108
  - red-datasets.gemspec
107
109
  - test/helper.rb
108
110
  - test/run-test.rb
109
111
  - test/test-iris.rb
112
+ - test/test-wikipedia.rb
110
113
  homepage: https://github.com/red-data-tools/red-datasets
111
114
  licenses:
112
115
  - MIT
@@ -127,11 +130,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
127
130
  version: '0'
128
131
  requirements: []
129
132
  rubyforge_project:
130
- rubygems_version: 2.7.4
133
+ rubygems_version: 2.5.2.2
131
134
  signing_key:
132
135
  specification_version: 4
133
136
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
134
137
  test_files:
135
138
  - test/test-iris.rb
136
- - test/run-test.rb
139
+ - test/test-wikipedia.rb
137
140
  - test/helper.rb
141
+ - test/run-test.rb