red-datasets 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/doc/text/news.md +7 -1
- data/lib/datasets/dataset.rb +3 -14
- data/lib/datasets/downloader.rb +64 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +178 -0
- data/lib/datasets.rb +1 -0
- data/test/helper.rb +16 -0
- data/test/test-wikipedia.rb +98 -0
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 47a9f2cf4c17f8b64f0a88dc7738affbbcd316a0
|
4
|
+
data.tar.gz: 7255596f70ff903f9103b3d72e78799a622e626e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e4065c07b451443e7ea2ff395144ecce0d9fa09e2a48979ba3b35c0cd0f18e48b27858a608d9b52e42bb112072a8fee1bffabbad33572028e25ef4aa163fb4cd
|
7
|
+
data.tar.gz: 2e76c88a1bc14ffd4d4808d05a8c395415fa34dbb8b1189dbbe8e0afb8b573dd20b1f06338bd82f9cd01d413c1f0e83130748c7bffd3e623ac4e2bbe826bc9c2
|
data/doc/text/news.md
CHANGED
data/lib/datasets/dataset.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
|
-
require "fileutils"
|
2
|
-
require "open-uri"
|
3
1
|
require "pathname"
|
4
2
|
|
3
|
+
require_relative "downloader"
|
5
4
|
require_relative "metadata"
|
6
5
|
|
7
6
|
module Datasets
|
@@ -25,18 +24,8 @@ module Datasets
|
|
25
24
|
end
|
26
25
|
|
27
26
|
def download(output_path, url)
  # Fetch `url` into `output_path`, delegating the transfer, progress
  # display, and partial-file cleanup on failure to Downloader.
  Downloader.new(url).download(output_path)
end
|
41
30
|
end
|
42
31
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "fileutils"
require "open-uri"

module Datasets
  # Downloads a URL to a local file, rendering a progress bar on
  # standard error when it is attached to a terminal.
  class Downloader
    # @param url [String, URI::Generic] the resource to fetch;
    #   strings are parsed into URI objects.
    def initialize(url)
      @url = url.is_a?(URI::Generic) ? url : URI.parse(url)
    end

    # Fetches the URL into +output_path+ (a Pathname), creating parent
    # directories as needed. A partially written file is removed before
    # the exception is re-raised.
    def download(output_path)
      output_path.parent.mkpath

      begin
        @url.open(progress_options) do |input|
          output_path.open("wb") do |output|
            IO.copy_stream(input, output)
          end
        end
      rescue
        FileUtils.rm_f(output_path)
        raise
      end
    end

    private
    # open-uri callbacks that print "name - 042.00% [cur/total]" to
    # $stderr; returns {} when $stderr is redirected or not a TTY.
    def progress_options
      return {} unless $stderr == STDERR && $stderr.tty?

      total = nil
      file_name = @url.path.split("/").last
      {
        :content_length_proc => lambda do |content_length|
          total = content_length
        end,
        :progress_proc => lambda do |current|
          next unless total # server may not send Content-Length
          percent = (current / total.to_f) * 100
          formatted_size = "[%s/%s]" % [format_size(current),
                                        format_size(total)]
          $stderr.print("\r%s - %06.2f%% %s" %
                        [file_name, percent, formatted_size])
          $stderr.puts if current == total # finish the line at 100%
        end,
      }
    end

    # Human-readable byte count using binary (1024-based) units.
    def format_size(size)
      return "%d" % size if size < 1024
      kib = size.to_f / 1024
      return "%7.2fKiB" % kib if kib < 1024
      mib = kib / 1024
      return "%7.2fMiB" % mib if mib < 1024
      gib = mib / 1024
      return "%7.2fGiB" % gib if gib < 1024
      "%.2fTiB" % (gib / 1024)
    end
  end
end
|
data/lib/datasets/version.rb
CHANGED
@@ -0,0 +1,178 @@
|
|
1
|
+
require "rexml/streamlistener"
require "rexml/parsers/baseparser"
require "rexml/parsers/streamparser"
# Needed for Time.iso8601 below; previously this only worked because
# open-uri happens to require "time" transitively.
require "time"

require_relative "dataset"

module Datasets
  # Streams pages out of a Wikipedia XML dump
  # (https://dumps.wikimedia.org/) for a given language edition and
  # dump type, without loading the whole dump into memory.
  class Wikipedia < Dataset
    Contributor = Struct.new(:user_name,
                             :id)
    Revision = Struct.new(:id,
                          :parent_id,
                          :timestamp,
                          :contributor,
                          :minor,
                          :comment,
                          :model,
                          :format,
                          :text,
                          :sha1)
    Page = Struct.new(:title,
                      :namespace,
                      :id,
                      :restrictions,
                      :redirect,
                      :revision)

    # @param language [Symbol] Wikipedia language edition (e.g. :en, :ja)
    # @param type [Symbol] dump type; :articles maps to "pages-articles"
    #   in the dump file name, anything else is used verbatim
    def initialize(language: :en,
                   type: :articles)
      super()
      @language = language
      @type = type
      @metadata.name = "wikipedia-#{@language}-#{@type}"
      @metadata.url = "https://dumps.wikimedia.org/"
      @metadata.description = "Wikipedia #{@type} (#{@language})"
    end

    # Yields each parsed Page struct; returns an Enumerator when no
    # block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      open_data do |input|
        listener = ArticlesListener.new(block)
        parser = REXML::Parsers::StreamParser.new(input, listener)
        parser.parse
      end
    end

    private
    # Downloads the compressed dump on first use (cached under
    # cache_dir_path), then yields an IO streaming the decompressed XML
    # through an external "bzcat" process.
    def open_data
      base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
      data_path = cache_dir_path + base_name
      unless data_path.exist?
        data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
        download(data_path, data_url)
      end

      input, output = IO.pipe
      pid = spawn("bzcat", data_path.to_s, {:out => output})
      begin
        output.close # parent keeps only the read end of the pipe
        yield(input)
      ensure
        input.close
        Process.waitpid(pid) # reap bzcat to avoid a zombie process
      end
    end

    # Dump-type component of the dump file name.
    def type_in_path
      case @type
      when :articles
        "pages-articles"
      else
        @type.to_s
      end
    end

    # SAX-style listener that assembles Page/Revision/Contributor
    # structs from XML events and hands each completed page to a block.
    class ArticlesListener
      include REXML::StreamListener

      def initialize(block)
        @block = block
        @page = nil
        @revision = nil
        @contributor = nil
        # Parallel stacks: current element path and the accumulated
        # character data for each open element.
        @tag_stack = []
        @text_stack = [""]
      end

      def tag_start(name, attributes)
        push_stacks(name)
        case name
        when "page"
          @page = Page.new
        when "revision"
          @revision = Revision.new
        when "contributor"
          @contributor = Contributor.new
        when "redirect"
          @page.redirect = attributes["title"]
        end
      end

      def tag_end(name)
        case name
        when "page"
          on_page(@page)
          @page = nil
        when "title"
          @page.title = @text_stack.last
        when "ns"
          @page.namespace = Integer(@text_stack.last)
        when "id"
          # <id> appears under <page>, <revision> and <contributor>;
          # the enclosing tag decides which struct receives it.
          id = Integer(@text_stack.last)
          case @tag_stack[-2]
          when "page"
            @page.id = id
          when "revision"
            @revision.id = id
          when "contributor"
            @contributor.id = id
          end
        when "restrictions"
          @page.restrictions = @text_stack.last.split(":")
        when "revision"
          @page.revision = @revision
          @revision = nil
        when "parentid"
          @revision.parent_id = Integer(@text_stack.last)
        when "timestamp"
          @revision.timestamp = Time.iso8601(@text_stack.last)
        when "contributor"
          @revision.contributor = @contributor
          @contributor = nil
        when "username"
          @contributor.user_name = @text_stack.last
        when "minor"
          # TODO
        when "comment"
          @revision.comment = @text_stack.last
        when "model"
          @revision.model = @text_stack.last
        when "format"
          @revision.format = @text_stack.last
        when "text"
          @revision.text = @text_stack.last
        when "sha1"
          @revision.sha1 = @text_stack.last
        end
        pop_stacks
      end

      def text(data)
        @text_stack.last << data
      end

      # CDATA sections contribute to the current element's text as well.
      # Fixed: the parameter was misspelled "contnet" while the body
      # read "content", so any CDATA in a dump raised NameError.
      def cdata(content)
        @text_stack.last << content
      end

      private
      def on_page(page)
        @block.call(page)
      end

      def push_stacks(tag)
        @tag_stack << tag
        @text_stack << ""
      end

      def pop_stacks
        @text_stack.pop
        @tag_stack.pop
      end
    end
  end
end
|
data/lib/datasets.rb
CHANGED
data/test/helper.rb
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
require "pathname"
|
3
|
+
|
1
4
|
require "datasets"
|
2
5
|
|
3
6
|
require "test-unit"
|
7
|
+
|
8
|
+
module Helper
  # Mixin giving tests a throw-away working directory under test/tmp.
  module Sandbox
    # Creates the sandbox directory (absolute path, remembered in
    # @tmp_dir), creating intermediate directories as needed.
    def setup_sandbox
      @tmp_dir = Pathname.new(File.expand_path("tmp", __dir__))
      FileUtils.mkdir_p(@tmp_dir)
    end

    # Removes the sandbox directory and everything inside it.
    def teardown_sandbox
      FileUtils.rm_rf(@tmp_dir)
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
class WikipediaTest < Test::Unit::TestCase
  sub_test_case("ja") do
    sub_test_case("articles") do
      include Helper::Sandbox

      def setup
        setup_sandbox
        @dataset = Datasets::Wikipedia.new(language: :ja,
                                           type: :articles)
        # Point the dataset's cache directory into the sandbox so the
        # test never touches the user's real cache.
        def @dataset.cache_dir_path
          @cache_dir_path
        end
        def @dataset.cache_dir_path=(path)
          @cache_dir_path = path
        end
        @dataset.cache_dir_path = @tmp_dir
      end

      def teardown
        teardown_sandbox
      end

      test("#each") do
        # Stub the network download: write a one-page dump fixture and
        # compress it with bzip2, matching the expected .xml.bz2 name.
        def @dataset.download(output_path, url)
          xml_path = output_path.sub_ext("")
          xml_path.open("w") do |xml_file|
            xml_file.puts(<<-XML)
<mediawiki
    xmlns="http://www.mediawiki.org/xml/export-0.10/"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
    version="0.10" xml:lang="ja">
  <siteinfo>
    <sitename>Wikipedia</sitename>
  </siteinfo>
  <page>
    <title>タイトル</title>
    <ns>4</ns>
    <id>1</id>
    <restrictions>sysop</restrictions>
    <revision>
      <id>3</id>
      <parentid>2</parentid>
      <timestamp>2004-04-30T14:46:00Z</timestamp>
      <contributor>
        <username>user</username>
        <id>10</id>
      </contributor>
      <minor />
      <comment>コメント</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">テキスト</text>
      <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
    </revision>
  </page>
</mediawiki>
            XML
          end
          unless system("bzip2", xml_path.to_s)
            raise "failed to run bzip2"
          end
        end

        # Expected structs built positionally; unset fields (minor,
        # redirect) stay nil, matching what the parser produces.
        expected_contributor = Datasets::Wikipedia::Contributor.new("user", 10)
        expected_revision =
          Datasets::Wikipedia::Revision.new(3,
                                            2,
                                            Time.iso8601("2004-04-30T14:46:00Z"),
                                            expected_contributor,
                                            nil,
                                            "コメント",
                                            "wikitext",
                                            "text/x-wiki",
                                            "テキスト",
                                            "a9674b19f8c56f785c91a555d0a144522bb318e6")
        expected_page =
          Datasets::Wikipedia::Page.new("タイトル",
                                        4,
                                        1,
                                        ["sysop"],
                                        nil,
                                        expected_revision)
        assert_equal(expected_page, @dataset.each.first)
      end

      sub_test_case("#metadata") do
        test("#name") do
          assert_equal("wikipedia-ja-articles",
                       @dataset.metadata.name)
        end

        test("#description") do
          assert_equal("Wikipedia articles (ja)",
                       @dataset.metadata.description)
        end
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-02-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -100,13 +100,16 @@ files:
|
|
100
100
|
- doc/text/news.md
|
101
101
|
- lib/datasets.rb
|
102
102
|
- lib/datasets/dataset.rb
|
103
|
+
- lib/datasets/downloader.rb
|
103
104
|
- lib/datasets/iris.rb
|
104
105
|
- lib/datasets/metadata.rb
|
105
106
|
- lib/datasets/version.rb
|
107
|
+
- lib/datasets/wikipedia.rb
|
106
108
|
- red-datasets.gemspec
|
107
109
|
- test/helper.rb
|
108
110
|
- test/run-test.rb
|
109
111
|
- test/test-iris.rb
|
112
|
+
- test/test-wikipedia.rb
|
110
113
|
homepage: https://github.com/red-data-tools/red-datasets
|
111
114
|
licenses:
|
112
115
|
- MIT
|
@@ -127,11 +130,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
127
130
|
version: '0'
|
128
131
|
requirements: []
|
129
132
|
rubyforge_project:
|
130
|
-
rubygems_version: 2.
|
133
|
+
rubygems_version: 2.5.2.2
|
131
134
|
signing_key:
|
132
135
|
specification_version: 4
|
133
136
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
134
137
|
test_files:
|
135
138
|
- test/test-iris.rb
|
136
|
-
- test/
|
139
|
+
- test/test-wikipedia.rb
|
137
140
|
- test/helper.rb
|
141
|
+
- test/run-test.rb
|