red-datasets 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: 21e6ccf743e4603d55a8c628213d3e8278b4c9a29f4914887cc4f148065e6edc
4
- data.tar.gz: 5d45f1ed6e5e2ec6620243641322323b78d22bb52c1f06ed3b0a0fe2555a7dd7
2
+ SHA1:
3
+ metadata.gz: 47a9f2cf4c17f8b64f0a88dc7738affbbcd316a0
4
+ data.tar.gz: 7255596f70ff903f9103b3d72e78799a622e626e
5
5
  SHA512:
6
- metadata.gz: 791bb9a4953a6c7667e95ff8b479180e91ce25f4f090a55a119118884caf59482ec6cff3993486427e85156228041c2ece0734312210079a7a057067d709d277
7
- data.tar.gz: 8f76e7a5d781d85767d1b476dac26e9b954d556e9e54d689b1cdbe0f881a8f165b6188cce072857b19b3f7c5f57537b82b962fb7c74a35102aad396cb9617ddd
6
+ metadata.gz: e4065c07b451443e7ea2ff395144ecce0d9fa09e2a48979ba3b35c0cd0f18e48b27858a608d9b52e42bb112072a8fee1bffabbad33572028e25ef4aa163fb4cd
7
+ data.tar.gz: 2e76c88a1bc14ffd4d4808d05a8c395415fa34dbb8b1189dbbe8e0afb8b573dd20b1f06338bd82f9cd01d413c1f0e83130748c7bffd3e623ac4e2bbe826bc9c2
data/doc/text/news.md CHANGED
@@ -1,6 +1,12 @@
1
1
  # News
2
2
 
3
- ## 0.0.1 2018-01-08
3
+ ## 0.0.2 - 2018-02-06
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::Wikipedia`: Added a dataset for Wikipedia.
8
+
9
+ ## 0.0.1 - 2018-01-08
4
10
 
5
11
  ### Improvements
6
12
 
@@ -1,7 +1,6 @@
1
- require "fileutils"
2
- require "open-uri"
3
1
  require "pathname"
4
2
 
3
+ require_relative "downloader"
5
4
  require_relative "metadata"
6
5
 
7
6
  module Datasets
@@ -25,18 +24,8 @@ module Datasets
25
24
  end
26
25
 
27
26
  def download(output_path, url)
28
- url = URI.parse(url) unless url.is_a?(URI::Generic)
29
- output_path.parent.mkpath
30
- begin
31
- url.open do |input|
32
- output_path.open("wb") do |output|
33
- IO.copy_stream(input, output)
34
- end
35
- end
36
- rescue
37
- FileUtils.rm_f(output_path)
38
- raise
39
- end
27
+ downloader = Downloader.new(url)
28
+ downloader.download(output_path)
40
29
  end
41
30
  end
42
31
  end
@@ -0,0 +1,64 @@
1
+ require "fileutils"
2
+ require "open-uri"
3
+
4
+ module Datasets
5
+ class Downloader
6
+ def initialize(url)
7
+ url = URI.parse(url) unless url.is_a?(URI::Generic)
8
+ @url = url
9
+ end
10
+
11
+ def download(output_path)
12
+ output_path.parent.mkpath
13
+
14
+ if $stderr == STDERR and $stderr.tty?
15
+ max = nil
16
+ base_name = @url.path.split("/").last
17
+ content_length_proc = lambda do |content_length|
18
+ max = content_length
19
+ end
20
+ progress_proc = lambda do |current|
21
+ if max
22
+ percent = (current / max.to_f) * 100
23
+ formatted_size = "[%s/%s]" % [format_size(current), format_size(max)]
24
+ $stderr.print("\r%s - %06.2f%% %s" %
25
+ [base_name, percent, formatted_size])
26
+ $stderr.puts if current == max
27
+ end
28
+ end
29
+ options = {
30
+ :content_length_proc => content_length_proc,
31
+ :progress_proc => progress_proc,
32
+ }
33
+ else
34
+ options = {}
35
+ end
36
+
37
+ begin
38
+ @url.open(options) do |input|
39
+ output_path.open("wb") do |output|
40
+ IO.copy_stream(input, output)
41
+ end
42
+ end
43
+ rescue
44
+ FileUtils.rm_f(output_path)
45
+ raise
46
+ end
47
+ end
48
+
49
+ private
50
+ def format_size(size)
51
+ if size < 1024
52
+ "%d" % size
53
+ elsif size < (1024 ** 2)
54
+ "%7.2fKiB" % (size.to_f / 1024)
55
+ elsif size < (1024 ** 3)
56
+ "%7.2fMiB" % (size.to_f / (1024 ** 2))
57
+ elsif size < (1024 ** 4)
58
+ "%7.2fGiB" % (size.to_f / (1024 ** 3))
59
+ else
60
+ "%.2fTiB" % (size.to_f / (1024 ** 4))
61
+ end
62
+ end
63
+ end
64
+ end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,178 @@
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
4
+
5
+ require_relative "dataset"
6
+
7
+ module Datasets
8
+ class Wikipedia < Dataset
9
+ Contributor = Struct.new(:user_name,
10
+ :id)
11
+ Revision = Struct.new(:id,
12
+ :parent_id,
13
+ :timestamp,
14
+ :contributor,
15
+ :minor,
16
+ :comment,
17
+ :model,
18
+ :format,
19
+ :text,
20
+ :sha1)
21
+ Page = Struct.new(:title,
22
+ :namespace,
23
+ :id,
24
+ :restrictions,
25
+ :redirect,
26
+ :revision)
27
+
28
+ def initialize(language: :en,
29
+ type: :articles)
30
+ super()
31
+ @language = language
32
+ @type = type
33
+ @metadata.name = "wikipedia-#{@language}-#{@type}"
34
+ @metadata.url = "https://dumps.wikimedia.org/"
35
+ @metadata.description = "Wikipedia #{@type} (#{@language})"
36
+ end
37
+
38
+ def each(&block)
39
+ return to_enum(__method__) unless block_given?
40
+
41
+ open_data do |input|
42
+ listener = ArticlesListener.new(block)
43
+ parser = REXML::Parsers::StreamParser.new(input, listener)
44
+ parser.parse
45
+ end
46
+ end
47
+
48
+ private
49
+ def open_data
50
+ base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
51
+ data_path = cache_dir_path + base_name
52
+ unless data_path.exist?
53
+ data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
54
+ download(data_path, data_url)
55
+ end
56
+
57
+ input, output = IO.pipe
58
+ pid = spawn("bzcat", data_path.to_s, {:out => output})
59
+ begin
60
+ output.close
61
+ yield(input)
62
+ ensure
63
+ input.close
64
+ Process.waitpid(pid)
65
+ end
66
+ end
67
+
68
+ def type_in_path
69
+ case @type
70
+ when :articles
71
+ "pages-articles"
72
+ else
73
+ @type.to_s
74
+ end
75
+ end
76
+
77
+ class ArticlesListener
78
+ include REXML::StreamListener
79
+
80
+ def initialize(block)
81
+ @block = block
82
+ @page = nil
83
+ @revision = nil
84
+ @contributor = nil
85
+ @current_tag = nil
86
+ @tag_stack = []
87
+ @text_stack = [""]
88
+ @first_page = true
89
+ end
90
+
91
+ def tag_start(name, attributes)
92
+ push_stacks(name)
93
+ case name
94
+ when "page"
95
+ @page = Page.new
96
+ when "revision"
97
+ @revision = Revision.new
98
+ when "contributor"
99
+ @contributor = Contributor.new
100
+ when "redirect"
101
+ @page.redirect = attributes["title"]
102
+ end
103
+ end
104
+
105
+ def tag_end(name)
106
+ case name
107
+ when "page"
108
+ on_page(@page)
109
+ @page = nil
110
+ when "title"
111
+ @page.title = @text_stack.last
112
+ when "ns"
113
+ @page.namespace = Integer(@text_stack.last)
114
+ when "id"
115
+ id = Integer(@text_stack.last)
116
+ case @tag_stack[-2]
117
+ when "page"
118
+ @page.id = id
119
+ when "revision"
120
+ @revision.id = id
121
+ when "contributor"
122
+ @contributor.id = id
123
+ end
124
+ when "restrictions"
125
+ @page.restrictions = @text_stack.last.split(":")
126
+ when "revision"
127
+ @page.revision = @revision
128
+ @revision = nil
129
+ when "parentid"
130
+ @revision.parent_id = Integer(@text_stack.last)
131
+ when "timestamp"
132
+ @revision.timestamp = Time.iso8601(@text_stack.last)
133
+ when "contributor"
134
+ @revision.contributor = @contributor
135
+ @contributor = nil
136
+ when "username"
137
+ @contributor.user_name = @text_stack.last
138
+ when "minor"
139
+ # TODO
140
+ when "comment"
141
+ @revision.comment = @text_stack.last
142
+ when "model"
143
+ @revision.model = @text_stack.last
144
+ when "format"
145
+ @revision.format = @text_stack.last
146
+ when "text"
147
+ @revision.text = @text_stack.last
148
+ when "sha1"
149
+ @revision.sha1 = @text_stack.last
150
+ end
151
+ pop_stacks
152
+ end
153
+
154
+ def text(data)
155
+ @text_stack.last << data
156
+ end
157
+
158
+ def cdata(content)
159
+ @text_stack.last << content
160
+ end
161
+
162
+ private
163
+ def on_page(page)
164
+ @block.call(page)
165
+ end
166
+
167
+ def push_stacks(tag)
168
+ @tag_stack << tag
169
+ @text_stack << ""
170
+ end
171
+
172
+ def pop_stacks
173
+ @text_stack.pop
174
+ @tag_stack.pop
175
+ end
176
+ end
177
+ end
178
+ end
data/lib/datasets.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  require "datasets/version"
2
2
 
3
3
  require "datasets/iris"
4
+ require "datasets/wikipedia"
data/test/helper.rb CHANGED
@@ -1,3 +1,19 @@
1
+ require "fileutils"
2
+ require "pathname"
3
+
1
4
  require "datasets"
2
5
 
3
6
  require "test-unit"
7
+
8
+ module Helper
9
+ module Sandbox
10
+ def setup_sandbox
11
+ @tmp_dir = (Pathname.new(__dir__) + "tmp").expand_path
12
+ FileUtils.mkdir_p(@tmp_dir)
13
+ end
14
+
15
+ def teardown_sandbox
16
+ FileUtils.rm_rf(@tmp_dir)
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,98 @@
1
+ class WikipediaTest < Test::Unit::TestCase
2
+ sub_test_case("ja") do
3
+ sub_test_case("articles") do
4
+ include Helper::Sandbox
5
+
6
+ def setup
7
+ setup_sandbox
8
+ @dataset = Datasets::Wikipedia.new(language: :ja,
9
+ type: :articles)
10
+ def @dataset.cache_dir_path
11
+ @cache_dir_path
12
+ end
13
+ def @dataset.cache_dir_path=(path)
14
+ @cache_dir_path = path
15
+ end
16
+ @dataset.cache_dir_path = @tmp_dir
17
+ end
18
+
19
+ def teardown
20
+ teardown_sandbox
21
+ end
22
+
23
+ test("#each") do
24
+ def @dataset.download(output_path, url)
25
+ xml_path = output_path.sub_ext("")
26
+ xml_path.open("w") do |xml_file|
27
+ xml_file.puts(<<-XML)
28
+ <mediawiki
29
+ xmlns="http://www.mediawiki.org/xml/export-0.10/"
30
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
31
+ xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
32
+ version="0.10" xml:lang="ja">
33
+ <siteinfo>
34
+ <sitename>Wikipedia</sitename>
35
+ </siteinfo>
36
+ <page>
37
+ <title>タイトル</title>
38
+ <ns>4</ns>
39
+ <id>1</id>
40
+ <restrictions>sysop</restrictions>
41
+ <revision>
42
+ <id>3</id>
43
+ <parentid>2</parentid>
44
+ <timestamp>2004-04-30T14:46:00Z</timestamp>
45
+ <contributor>
46
+ <username>user</username>
47
+ <id>10</id>
48
+ </contributor>
49
+ <minor />
50
+ <comment>コメント</comment>
51
+ <model>wikitext</model>
52
+ <format>text/x-wiki</format>
53
+ <text xml:space="preserve">テキスト</text>
54
+ <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
55
+ </revision>
56
+ </page>
57
+ </mediawiki>
58
+ XML
59
+ end
60
+ unless system("bzip2", xml_path.to_s)
61
+ raise "failed to run bzip2"
62
+ end
63
+ end
64
+
65
+ contributor = Datasets::Wikipedia::Contributor.new("user", 10)
66
+ revision = Datasets::Wikipedia::Revision.new
67
+ revision.id = 3
68
+ revision.parent_id = 2
69
+ revision.timestamp = Time.iso8601("2004-04-30T14:46:00Z")
70
+ revision.contributor = contributor
71
+ revision.comment = "コメント"
72
+ revision.model = "wikitext"
73
+ revision.format = "text/x-wiki"
74
+ revision.text = "テキスト"
75
+ revision.sha1 = "a9674b19f8c56f785c91a555d0a144522bb318e6"
76
+ page = Datasets::Wikipedia::Page.new
77
+ page.title = "タイトル"
78
+ page.namespace = 4
79
+ page.id = 1
80
+ page.restrictions = ["sysop"]
81
+ page.revision = revision
82
+ assert_equal(page, @dataset.each.first)
83
+ end
84
+
85
+ sub_test_case("#metadata") do
86
+ test("#name") do
87
+ assert_equal("wikipedia-ja-articles",
88
+ @dataset.metadata.name)
89
+ end
90
+
91
+ test("#description") do
92
+ assert_equal("Wikipedia articles (ja)",
93
+ @dataset.metadata.description)
94
+ end
95
+ end
96
+ end
97
+ end
98
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-01-08 00:00:00.000000000 Z
12
+ date: 2018-02-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -100,13 +100,16 @@ files:
100
100
  - doc/text/news.md
101
101
  - lib/datasets.rb
102
102
  - lib/datasets/dataset.rb
103
+ - lib/datasets/downloader.rb
103
104
  - lib/datasets/iris.rb
104
105
  - lib/datasets/metadata.rb
105
106
  - lib/datasets/version.rb
107
+ - lib/datasets/wikipedia.rb
106
108
  - red-datasets.gemspec
107
109
  - test/helper.rb
108
110
  - test/run-test.rb
109
111
  - test/test-iris.rb
112
+ - test/test-wikipedia.rb
110
113
  homepage: https://github.com/red-data-tools/red-datasets
111
114
  licenses:
112
115
  - MIT
@@ -127,11 +130,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
127
130
  version: '0'
128
131
  requirements: []
129
132
  rubyforge_project:
130
- rubygems_version: 2.7.4
133
+ rubygems_version: 2.5.2.2
131
134
  signing_key:
132
135
  specification_version: 4
133
136
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
134
137
  test_files:
135
138
  - test/test-iris.rb
136
- - test/run-test.rb
139
+ - test/test-wikipedia.rb
137
140
  - test/helper.rb
141
+ - test/run-test.rb