wikiavro 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/wikiavro +90 -0
- data/lib/wikiavro.rb +6 -0
- data/lib/wikiavro/avro.rb +199 -0
- data/lib/wikiavro/mediawiki.rb +671 -0
- data/lib/wikiavro/xml.rb +282 -0
- metadata +77 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c64ef5f767535d9432d8f589411f76dedb6a4b04
|
4
|
+
data.tar.gz: b846af138fe797fa09e9f248bcdb3298e988b5fb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b5f7cba59679bc913ceaddf95e97f8ed7c63d9f54660d81680db419da09abf2c61ddadd2f2a75b0e199fbfe9f710363edcd889550f76f44acdd6a34e591f69a2
|
7
|
+
data.tar.gz: 8b63a6ae13ce31cc31eb2f5698a0bd586c9645dfbce1be6ae52e45f90315af3015ed1e1f17badaa4f06425483369a369af6adefd0dae2c8ef0555d8f05193f01
|
data/bin/wikiavro
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line front end: parses a MediaWiki XML dump read through ARGF
# (INFILE or standard input) and writes namespaces, pages, revisions and
# LiquidThreads records to four separate Avro files.

require 'wikiavro'
require 'xml'
require 'optparse'

# Top-level constant; presumably picked up by OptionParser's built-in
# --version handling -- confirm before renaming.
Version = WikiAvro::Version

logger = WikiAvro::MediaWiki::RevisionProgress.new 10000
overwrite = false
deflate = false
ns = nil
page = nil
rev = nil
lqt = nil

opt_parser = OptionParser.new do |opts|
  opts.banner = 'Usage: wikiavro [options] [INFILE]'

  opts.on('-q', '--quiet', "Don't report progress") do
    logger = WikiAvro::MediaWiki::NoProgress.new
  end

  # The Integer acceptor already converts LEVEL, so no further coercion.
  opts.on('-v LEVEL', '--verbose=LEVEL', Integer,
          'Report progress after every LEVEL revisions parsed') do |lvl|
    logger = WikiAvro::MediaWiki::RevisionProgress.new lvl
  end

  opts.on('-o', '--overwrite') { |flag| overwrite = flag }

  opts.on('-d', '--deflate', 'Enable Avro internal compression') do |flag|
    deflate = flag
  end

  opts.on('-n OUTFILE', '--namespaces=OUTFILE') { |path| ns = path }
  opts.on('-p OUTFILE', '--pages=OUTFILE') { |path| page = path }
  opts.on('-r OUTFILE', '--revisions=OUTFILE') { |path| rev = path }
  opts.on('-l OUTFILE', '--liquidthreads=OUTFILE') { |path| lqt = path }
end

opt_parser.parse!

abort('You must specify --namespaces') if ns.nil?
abort('You must specify --pages') if page.nil?
abort('You must specify --revisions') if rev.nil?
abort('You must specify --liquidthreads') if lqt.nil?

unless overwrite
  # File.exist? replaces File.exists?, which newer Rubies no longer provide.
  existing = [ns, page, rev, lqt].find { |path| File.exist?(path) }
  if existing
    abort("#{existing} already exists! pass --overwrite to proceed anyway")
  end
end

# Writers are collected as they are opened so that every file opened so
# far is closed even when a later step raises.
opened = []
begin
  opened << (ns_writer = WikiAvro::Avro::NamespaceWriter.new(ns, deflate))
  opened << (page_writer = WikiAvro::Avro::PageWriter.new(page, deflate))
  opened << (rev_writer = WikiAvro::Avro::RevisionWriter.new(rev, deflate))
  opened << (lqt_writer = WikiAvro::Avro::LqtWriter.new(lqt, deflate))

  writer = WikiAvro::MediaWiki::WikiWriter.new :logger => logger,
                                               :namespace => ns_writer,
                                               :page => page_writer,
                                               :revision => rev_writer,
                                               :lqt => lqt_writer

  mw = WikiAvro::MediaWiki::WikiDump.new

  xml = XML::Reader.io(ARGF)
  # Advance to the first node before handing the reader to the parser.
  xml.read

  mw.parse(writer, nil, xml)
ensure
  opened.each { |w| w.close }
end
|
data/lib/wikiavro/avro.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
require 'avro'
|
2
|
+
|
3
|
+
module WikiAvro::Avro
|
4
|
+
NAMESPACE_SCHEMA = <<-EOS
|
5
|
+
{
|
6
|
+
"namespace": "org.rationalwiki",
|
7
|
+
"name": "Namespace",
|
8
|
+
"type": "record",
|
9
|
+
"fields": [
|
10
|
+
{"name": "key", "type": "int"},
|
11
|
+
{"name": "case", "type": "string"},
|
12
|
+
{"name": "name", "type": "string"}
|
13
|
+
]
|
14
|
+
}
|
15
|
+
EOS
|
16
|
+
|
17
|
+
PAGE_SCHEMA = <<-EOS
|
18
|
+
{
|
19
|
+
"namespace": "org.rationalwiki",
|
20
|
+
"name": "Page",
|
21
|
+
"type": "record",
|
22
|
+
"fields": [
|
23
|
+
{"name": "id", "type": "long"},
|
24
|
+
{"name": "ns", "type": "long"},
|
25
|
+
{"name": "title", "type": "string"},
|
26
|
+
{"name": "redirect", "type": ["null", "string"]},
|
27
|
+
{"name": "sha1", "type": ["null", "string"]}
|
28
|
+
]
|
29
|
+
}
|
30
|
+
EOS
|
31
|
+
|
32
|
+
REVISION_SCHEMA = <<-EOS
|
33
|
+
{
|
34
|
+
"namespace": "org.rationalwiki",
|
35
|
+
"name": "Revision",
|
36
|
+
"type": "record",
|
37
|
+
"fields": [
|
38
|
+
{"name": "id", "type": "long"},
|
39
|
+
{"name": "page_id", "type": "long"},
|
40
|
+
{"name": "n", "type": "long"},
|
41
|
+
{"name": "timestamp", "type": "string"},
|
42
|
+
{"name": "contributor", "type": ["null", {
|
43
|
+
"namespace": "org.rationalwiki",
|
44
|
+
"name": "Contributor",
|
45
|
+
"type": "record",
|
46
|
+
"fields": [
|
47
|
+
{"name": "id", "type": ["null", "long"]},
|
48
|
+
{"name": "username", "type": ["null", "string"]},
|
49
|
+
{"name": "ip", "type": ["null", "string"]}
|
50
|
+
]
|
51
|
+
}]},
|
52
|
+
{"name": "minor", "type": "boolean"},
|
53
|
+
{"name": "comment", "type": ["null", "string"]},
|
54
|
+
{"name": "bytes", "type": "long"},
|
55
|
+
{"name": "textid", "type": ["null", "string"]},
|
56
|
+
{"name": "text", "type": ["null", "string"]}
|
57
|
+
]
|
58
|
+
}
|
59
|
+
EOS
|
60
|
+
|
61
|
+
LQT_SCHEMA = <<-EOS
|
62
|
+
{
|
63
|
+
"namespace": "org.rationalwiki",
|
64
|
+
"name": "Threading",
|
65
|
+
"type": "record",
|
66
|
+
"fields": [
|
67
|
+
{"name": "subject", "type": "string"},
|
68
|
+
{"name": "parent", "type": ["null", "long"]},
|
69
|
+
{"name": "ancestor", "type": ["null", "long"]},
|
70
|
+
{"name": "page", "type": "string"},
|
71
|
+
{"name": "id", "type": "long"},
|
72
|
+
{"name": "summary_page", "type": ["null", "string"]},
|
73
|
+
{"name": "author", "type": "string"},
|
74
|
+
{"name": "edit_status", "type": "string"},
|
75
|
+
{"name": "type", "type": "string"},
|
76
|
+
{"name": "signature", "type": ["null", "string"]}
|
77
|
+
]
|
78
|
+
}
|
79
|
+
EOS
|
80
|
+
|
81
|
+
# Base class for the Avro output files. Subclasses provide the schema
# through #schema and append records through #encode.
class AvroWriter
  # Schema handed to Avro::DataFile.open. Must be overridden.
  def schema
    raise NotImplementedError
  end

  # Closes the underlying data file writer.
  def close
    @writer.close
  end

  protected

  # Appends +data+ to the underlying data file writer.
  def encode(data)
    @writer << data
  end

  # Opens +path+ for writing with this writer's schema. The 'deflate'
  # codec argument is only passed when +deflate+ is truthy.
  def initialize(path, deflate=false)
    open_args = [path, 'w', schema]
    open_args << 'deflate' if deflate
    @writer = Avro::DataFile.open(*open_args)
  end
end
|
104
|
+
|
105
|
+
# Writes namespace records using NAMESPACE_SCHEMA.
class NamespaceWriter < AvroWriter
  def schema
    NAMESPACE_SCHEMA
  end

  # Encodes one namespace. +key+ is converted with to_i and a nil +name+
  # is stored as the empty string.
  def write(key, casetype, name)
    record = {
      'key' => key.to_i,
      'case' => casetype,
      'name' => name || ''
    }
    encode record
  end
end
|
116
|
+
|
117
|
+
# Writes page records using PAGE_SCHEMA.
class PageWriter < AvroWriter
  def schema
    PAGE_SCHEMA
  end

  # Encodes one page. +id+ and +ns+ are converted with to_i; the other
  # values are stored as given.
  def write(ns, id, title, redirect, sha1)
    record = {
      'id' => id.to_i,
      'ns' => ns.to_i,
      'title' => title,
      'redirect' => redirect,
      'sha1' => sha1
    }
    encode record
  end
end
|
130
|
+
|
131
|
+
class RevisionWriter < AvroWriter
|
132
|
+
def schema
|
133
|
+
REVISION_SCHEMA
|
134
|
+
end
|
135
|
+
|
136
|
+
# Encodes one revision record.
#
# contributor  - Hash with :deleted, :id, :username and :ip keys.
# comment      - Hash with :deleted and :comment keys, or nil when the
#                revision has no comment at all.
# text_deleted - when non-nil the text is stored as null.
#
# Raises RuntimeError when a contributor or comment is flagged deleted
# but still carries content.
def write(id, page_id, n, timestamp, contributor, minor,
          comment, text_deleted, bytes, textid, text)
  if contributor[:deleted].nil?
    # A missing id stays null instead of becoming 0 through nil.to_i. A
    # new hash is built; the caller's hash is left untouched.
    raw_id = contributor[:id]
    contributor = {
      'username' => contributor[:username],
      'id' => raw_id.nil? ? nil : raw_id.to_i,
      'ip' => contributor[:ip]
    }
  else
    has_content = !(contributor[:id].nil? &&
                    contributor[:username].nil? &&
                    contributor[:ip].nil?)
    raise 'deleted contributor has content' if has_content
    contributor = nil
  end

  if comment.nil?
    # No comment was supplied; it is stored as null below.
  elsif comment[:deleted].nil?
    comment = comment[:comment]
  else
    raise 'deleted comment has content' if comment[:comment]
    comment = nil
  end

  text = nil unless text_deleted.nil?

  # NOTE(review): 'minor' is true when the minor argument is nil, which
  # looks inverted for a flag named "minor" -- confirm against what the
  # parser passes before changing it.
  encode 'id' => id.to_i,
         'page_id' => page_id.to_i,
         'n' => n.to_i,
         'timestamp' => timestamp,
         'contributor' => contributor,
         'minor' => minor.nil?,
         'comment' => comment,
         'bytes' => bytes.to_i,
         'textid' => textid,
         'text' => text
end
|
175
|
+
end
|
176
|
+
|
177
|
+
# Writes LiquidThreads records using LQT_SCHEMA.
class LqtWriter < AvroWriter
  def schema
    LQT_SCHEMA
  end

  # Encodes one thread record. +parent+ and +ancestor+ are converted with
  # to_i only when they are truthy; +id+ is always converted.
  def write(subject, parent, ancestor, page, id, summary_page,
            author, edit_status, type, signature)
    record = {
      'subject' => subject,
      'parent' => parent && parent.to_i,
      'ancestor' => ancestor && ancestor.to_i,
      'page' => page,
      'id' => id.to_i,
      'summary_page' => summary_page,
      'author' => author,
      'edit_status' => edit_status,
      'type' => type,
      'signature' => signature
    }
    encode record
  end
end
|
199
|
+
end
|
data/lib/wikiavro/mediawiki.rb
ADDED
@@ -0,0 +1,671 @@
|
|
1
|
+
require 'wikiavro/xml'
|
2
|
+
|
3
|
+
# RW declares schema 0.6 but has redirects as in 0.7
|
4
|
+
#
|
5
|
+
# RW has <sha1> tags right after <redirect> - they are indented to a
|
6
|
+
# different level too. The <sha1>s are missing within <revision>s
|
7
|
+
# where they should be.
|
8
|
+
#
|
9
|
+
# Schema claims discussionthreadinginfo, but actual tag is
|
10
|
+
# DiscussionThreading. Schema does not describe ThreadSummaryPage or
|
11
|
+
# ThreadSignature. Schema does not describe which LQT tags are
|
12
|
+
# omissible. Schema says thread info should always come after
|
13
|
+
# revisions, but it does not.
|
14
|
+
|
15
|
+
module WikiAvro::MediaWiki
|
16
|
+
# Writer that prints each namespace to standard output.
class NamespacePrinter
  # Prints one line containing the key, the quoted name and the case type.
  def write(key, casetype, name)
    line = "namespace #{key}: \"#{name}\" #{casetype}"
    puts line
  end
end
|
21
|
+
|
22
|
+
# Writer that prints each page to standard output.
class PagePrinter
  # Prints one line: quoted title, then id, namespace, redirect and sha1.
  def write(ns, id, title, redirect, sha1)
    line = "page \"#{title}\": #{id} #{ns} #{redirect} #{sha1}"
    puts line
  end
end
|
27
|
+
|
28
|
+
# Writer that prints a one-line summary of each revision.
class RevisionPrinter
  # Prints page id, revision number, timestamp, byte count and the
  # contributor's :username; the remaining arguments are ignored.
  def write(id, page_id, n, timestamp, contributor, minor,
            comment, text_deleted, bytes, textid, text)
    who = contributor[:username]
    puts "rev #{page_id} #{n}: #{timestamp} " + "#{bytes} #{who}"
  end
end
|
35
|
+
|
36
|
+
# Writer that prints a one-line summary of each LiquidThreads record.
class LqtPrinter
  # Prints subject, parent, ancestor, author, edit status and type; the
  # remaining arguments are ignored.
  def write(threadSubject, threadParent, threadAncestor,
            threadPage, threadID, threadSummaryPage,
            threadAuthor, threadEditStatus, threadType,
            threadSignature)
    head = "thread #{threadSubject} #{threadParent} #{threadAncestor} "
    tail = "#{threadAuthor} #{threadEditStatus} #{threadType}"
    puts head + tail
  end
end
|
45
|
+
|
46
|
+
# Writer that accepts any message and does nothing.
class NullWriter
  # Every unknown method is accepted and returns nil.
  def method_missing(target, *args, &block)
    nil
  end

  # Keeps respond_to? consistent with method_missing, which accepts every
  # message.
  def respond_to_missing?(target, include_private = false)
    true
  end
end
|
54
|
+
|
55
|
+
# Fans parsed records out to per-record-type writers and reports progress
# to a logger.
class WikiWriter
  # writers - Hash that may contain :namespace, :logger, :page, :revision
  #           and :lqt. A missing or nil entry falls back to
  #           NamespacePrinter for :namespace, NoProgress for :logger and
  #           NullWriter for the rest. Defaults are only built when needed.
  def initialize(writers)
    @namespace = writers[:namespace] || NamespacePrinter.new
    @logger = writers[:logger] || NoProgress.new
    @page = writers[:page] || NullWriter.new
    @revision = writers[:revision] || NullWriter.new
    @lqt = writers[:lqt] || NullWriter.new
  end

  # Forwards one namespace to the namespace writer.
  def namespace(key, casetype, name)
    @namespace.write(key, casetype, name)
  end

  # Reports one page to the logger, then forwards it to the page writer.
  def page(ns, id, title, redirect, sha1)
    @logger.report_pages(1)
    @page.write(ns, id, title, redirect, sha1)
  end

  # Reports one revision to the logger, then forwards it to the revision
  # writer.
  def revision(id, page_id, n, timestamp, contributor, minor,
               comment, text_deleted, bytes, textid, text)
    @logger.report_revisions(1)
    @revision.write(id, page_id, n, timestamp, contributor, minor,
                    comment, text_deleted, bytes, textid, text)
  end

  # Forwards one LiquidThreads record to the lqt writer.
  def lqt(threadSubject, threadParent, threadAncestor,
          threadPage, threadID, threadSummaryPage,
          threadAuthor, threadEditStatus, threadType,
          threadSignature)
    @lqt.write(threadSubject, threadParent, threadAncestor,
               threadPage, threadID, threadSummaryPage,
               threadAuthor, threadEditStatus, threadType,
               threadSignature)
  end

  # Tells the logger that parsing has finished.
  def done
    @logger.report_done
  end

  # Tells the logger that an element named +name+ was skipped.
  def skipped(name)
    @logger.report_skipped_element(name)
  end
end
|
101
|
+
|
102
|
+
# Logger that ignores every report.
class NoProgress
  def report_pages(n); end

  def report_revisions(n); end

  def report_done; end

  def report_skipped_element(name); end
end
|
115
|
+
|
116
|
+
# Logger that counts pages, revisions and skipped elements and prints a
# summary when parsing is done.
class FinalProgress
  def initialize
    @start_time = Time.now
    @pages = 0
    @revisions = 0
    # Skipped-element tally by element name; unseen names read as 0.
    @skipped_counts = Hash.new 0
  end

  # Formats +n+ with a comma between groups of three digits, e.g.
  # 1234567 => "1,234,567". Always returns a String (zero is "0"); a
  # fractional part is dropped.
  def f(n)
    whole = n.to_i
    return whole.to_s if whole < 1000
    whole.to_s.reverse.scan(/\d{1,3}/).join(',').reverse
  end

  # Total number of skipped elements over all names.
  def total_skipped
    @skipped_counts.values.reduce(0, :+)
  end

  def report_pages(n)
    @pages += n
  end

  def report_revisions(n)
    @revisions += n
  end

  def report_skipped_element(name)
    @skipped_counts[name] += 1
  end

  # Prints one "name: count" line per skipped element name.
  def show_skipped
    @skipped_counts.each do |name, count|
      puts "#{name}: #{count}"
    end
  end

  # Prints the skipped-element breakdown (if any) and the total duration
  # with the average revisions per second.
  def report_done
    duration = Time.now - @start_time
    # A zero duration would make the rate Infinity or NaN, and rounding
    # those raises FloatDomainError.
    avg_rate = duration > 0 ? @revisions / duration : 0
    h = (duration / 60 / 60).floor
    m = (duration % (60 * 60) / 60).floor
    s = (duration % 60).floor
    # FIXME: Print to STDERR or some log
    skipped = total_skipped
    if skipped > 0
      puts "Couldn't process #{skipped} elements! Detailed breakdown:"
      show_skipped
    end
    puts "Done! Took #{h}h#{m}m#{s}s. Averaged #{f avg_rate.round(0)} rps."
  end
end
|
179
|
+
|
180
|
+
# Logger that also prints intermediate progress each time at least
# +interval+ revisions have been reported since the last announcement.
class RevisionProgress < FinalProgress
  # interval - number of revisions between progress announcements.
  def initialize(interval)
    super()
    @interval = interval
    @previous_time = @start_time
    @previous_revisions = 0
  end

  # Prints page and revision totals, the rate since the previous
  # announcement and the skipped-element breakdown, then starts a new
  # measuring window.
  def announce_progress
    now = Time.now
    elapsed = now - @previous_time
    # With no elapsed time the rate would be Infinity or NaN, and rounding
    # those raises FloatDomainError.
    rps = elapsed > 0 ? (@revisions - @previous_revisions) / elapsed : 0
    puts "Page #{f @pages}, rev #{f @revisions} (#{f rps.round(0)} rps)"
    skipped = total_skipped
    puts "#{f skipped} unprocessable elements so far."
    show_skipped
    @previous_time = now
    @previous_revisions = @revisions
  end

  def report_revisions(n)
    super(n)

    if @revisions - @previous_revisions >= @interval
      announce_progress
    end
  end

  # Announces the final state, then prints the inherited summary.
  def report_done
    announce_progress
    super
  end
end
|
212
|
+
|
213
|
+
# Parser for one <namespace> element; forwards it to the writer.
class Namespace < WikiAvro::XML::Leaf
  def name
    'namespace'
  end

  # Nothing to clear: @key and @case are overwritten on every parse.
  def reset
  end

  # Remembers the key and case attributes of the current element.
  def parse_attributes(w, p, r)
    @key = r['key']
    @case = r['case']
  end

  # Reads the element text, moves past the element and hands key, case
  # and text to the writer.
  def parse_content(w, p, r)
    label = r.read_string
    WikiAvro::XML.skip_tag(w, r, false)
    w.namespace(@key, @case, label)
  end
end
|
233
|
+
|
234
|
+
# Stream built around a single Namespace parser.
class NamespaceStream < WikiAvro::XML::Stream
  def initialize; super([Namespace.new]); end
end

# Inserter for 'sitename'.
class Sitename < WikiAvro::XML::Inserter
  def initialize; super('sitename'); end
end

# Inserter for 'base'.
class Base < WikiAvro::XML::Inserter
  def initialize; super('base'); end
end

# Inserter for 'generator'.
class Generator < WikiAvro::XML::Inserter
  def initialize; super('generator'); end
end

# Inserter for 'case'.
class Case < WikiAvro::XML::Inserter
  def initialize; super('case'); end
end

# Element named 'namespaces' whose only child is a NamespaceStream.
class Namespaces < WikiAvro::XML::Element
  def name; 'namespaces'; end

  def initialize; super([NamespaceStream.new]); end
end
|
273
|
+
|
274
|
+
# Element named 'siteinfo'. Its child inserters store their text in the
# accessors declared here.
class SiteInfo < WikiAvro::XML::Element
  attr_accessor :sitename, :base, :generator, :case

  def name
    'siteinfo'
  end

  # Clears all values captured from a previous parse.
  def reset
    @sitename = @base = @generator = @case = nil
  end

  def initialize
    children = [Sitename.new, Base.new, Generator.new,
                Case.new, Namespaces.new]
    super(children)
  end
end
|
296
|
+
|
297
|
+
# Inserter for 'title'.
class Title < WikiAvro::XML::Inserter
  def initialize; super('title'); end
end

# Inserter for 'ns'.
class Ns < WikiAvro::XML::Inserter
  def initialize; super('ns'); end
end

# Inserter for 'id'.
class Id < WikiAvro::XML::Inserter
  def initialize; super('id'); end
end
|
314
|
+
|
315
|
+
# Leaf named 'redirect'; copies its title attribute to the parent.
class Redirect < WikiAvro::XML::Leaf
  def name; 'redirect'; end

  # Stores the element's title attribute through the parent's redirect=.
  def parse_attributes(w, p, r)
    p.redirect = r['title']
  end
end
|
325
|
+
|
326
|
+
# Inserter for 'sha1'.
class Sha1 < WikiAvro::XML::Inserter
  def initialize; super('sha1'); end
end

# Group of Redirect and Sha1, each with :min 0 and :max 1.
class PageFlags < WikiAvro::XML::Group
  def initialize
    members = [{:element => Redirect.new, :min => 0, :max => 1},
               {:element => Sha1.new, :min => 0, :max => 1}]
    super members
  end
end

# Inserter for 'timestamp'.
class Timestamp < WikiAvro::XML::Inserter
  def initialize; super('timestamp'); end
end

# Inserter for 'username'.
class Username < WikiAvro::XML::Inserter
  def initialize; super('username'); end
end

# Inserter for 'ip'.
class Ip < WikiAvro::XML::Inserter
  def initialize; super('ip'); end
end

# Optional group of Username, Id and Ip, each with :min 0 and :max 1.
class ContributorGroup < WikiAvro::XML::Group
  def optional?; true; end

  def initialize
    members = [{:element => Username.new, :min => 0, :max => 1},
               {:element => Id.new, :min => 0, :max => 1},
               {:element => Ip.new, :min => 0, :max => 1}]
    super members
  end
end
|
368
|
+
|
369
|
+
# Element named 'contributor'. Collects id, username, ip and the deleted
# attribute, then hands them to the parent as one Hash.
class Contributor < WikiAvro::XML::Element
  def name
    'contributor'
  end

  attr_accessor :id, :username, :ip

  # Clears everything captured from a previous parse.
  def reset
    @id = @username = @ip = @deleted = nil
  end

  # Remembers the element's deleted attribute.
  def parse_attributes(w, p, r)
    @deleted = r['deleted']
  end

  # Stores the collected values through the parent's contributor=.
  def handle_content(w, p, r)
    summary = {:deleted => @deleted, :id => id,
               :username => username, :ip => ip}
    p.contributor = summary
  end

  def initialize
    super([ContributorGroup.new])
  end
end
|
398
|
+
|
399
|
+
# Inserter for 'minor'.
class Minor < WikiAvro::XML::Inserter
  def initialize; super('minor'); end
end
|
404
|
+
|
405
|
+
# Leaf named 'comment'; hands its deleted attribute and text to the
# parent as one Hash.
class Comment < WikiAvro::XML::Leaf
  def name; 'comment'; end

  # Reads the deleted attribute first, then the element text, and stores
  # both through the parent's comment=.
  def parse_attributes(w, p, r)
    flag = r['deleted']
    body = r.read_string
    p.comment = {:deleted => flag, :comment => body}
  end
end
|
417
|
+
|
418
|
+
# Group of Minor and Comment, each with :min 0 and :max 1.
class RevisionFlags < WikiAvro::XML::Group
  def initialize
    members = [{:element => Minor.new, :min => 0, :max => 1},
               {:element => Comment.new, :min => 0, :max => 1}]
    super members
  end
end
|
424
|
+
|
425
|
+
# Inserter for 'text' that also copies three attributes to the parent.
class Text < WikiAvro::XML::Inserter
  def initialize
    super('text')
  end

  # Copies the deleted, id and bytes attributes to the parent's
  # text_deleted, textid and bytes.
  def parse_attributes(w, p, r)
    p.text_deleted = r['deleted']
    p.textid = r['id']
    p.bytes = r['bytes']
  end
end
|
436
|
+
|
437
|
+
class Revision < WikiAvro::XML::Element
|
438
|
+
attr_accessor :id
|
439
|
+
attr_accessor :timestamp
|
440
|
+
attr_accessor :contributor
|
441
|
+
attr_accessor :minor
|
442
|
+
attr_accessor :comment
|
443
|
+
attr_accessor :text_deleted
|
444
|
+
attr_accessor :bytes
|
445
|
+
attr_accessor :textid
|
446
|
+
attr_accessor :text
|
447
|
+
|
448
|
+
def name
|
449
|
+
'revision'
|
450
|
+
end
|
451
|
+
|
452
|
+
# Clears everything captured from the previous revision. Instance
# variables are assigned directly: a bare `id = nil` would only create a
# local variable and leave the old value in place.
def reset
  @id = nil
  @timestamp = nil
  @contributor = nil
  @minor = nil
  # Kept as a Hash so consumers that index comment[:deleted] and
  # comment[:comment] still work when the revision has no comment.
  @comment = {:deleted => nil, :comment => nil}
  @text_deleted = nil
  @bytes = nil
  @textid = nil
  @text = nil
end
|
463
|
+
|
464
|
+
def handle_content(w, p, r)
|
465
|
+
p.revision_count += 1
|
466
|
+
n = p.revision_count
|
467
|
+
w.revision(id, p.id, n, timestamp, contributor, minor,
|
468
|
+
comment, text_deleted, bytes, textid, text)
|
469
|
+
end
|
470
|
+
|
471
|
+
def initialize
|
472
|
+
super([Id.new, Timestamp.new, Contributor.new,
|
473
|
+
RevisionFlags.new, Text.new])
|
474
|
+
end
|
475
|
+
end
|
476
|
+
|
477
|
+
# Stream built around a single Revision parser.
class RevStream < WikiAvro::XML::Stream
  def initialize; super([Revision.new]); end
end

# Inserter for 'ThreadSubject', targeting threadSubject.
class ThreadSubject < WikiAvro::XML::Inserter
  def initialize; super('ThreadSubject', 'threadSubject'); end
end

# Inserter for 'ThreadParent', targeting threadParent.
class ThreadParent < WikiAvro::XML::Inserter
  def initialize; super('ThreadParent', 'threadParent'); end
end

# Inserter for 'ThreadAncestor', targeting threadAncestor.
class ThreadAncestor < WikiAvro::XML::Inserter
  def initialize; super('ThreadAncestor', 'threadAncestor'); end
end

# Optional group of ThreadParent and ThreadAncestor, each with :min 0
# and :max 1.
class ThreadParentGroup < WikiAvro::XML::Group
  def optional?; true; end

  def initialize
    members = [{:element => ThreadParent.new, :min => 0, :max => 1},
               {:element => ThreadAncestor.new, :min => 0, :max => 1}]
    super members
  end
end

# Inserter for 'ThreadPage', targeting threadPage.
class ThreadPage < WikiAvro::XML::Inserter
  def initialize; super('ThreadPage', 'threadPage'); end
end

# Inserter for 'ThreadID', targeting threadID.
class ThreadID < WikiAvro::XML::Inserter
  def initialize; super('ThreadID', 'threadID'); end
end

# Inserter for 'ThreadSummaryPage', targeting threadSummaryPage.
class ThreadSummaryPage < WikiAvro::XML::Inserter
  def initialize; super('ThreadSummaryPage', 'threadSummaryPage'); end
end

# Optional group holding ThreadSummaryPage with :min 0 and :max 1.
class ThreadSummaryPageGroup < WikiAvro::XML::Group
  def optional?; true; end

  def initialize
    members = [{:element => ThreadSummaryPage.new, :min => 0, :max => 1}]
    super members
  end
end

# Inserter for 'ThreadAuthor', targeting threadAuthor.
class ThreadAuthor < WikiAvro::XML::Inserter
  def initialize; super('ThreadAuthor', 'threadAuthor'); end
end

# Inserter for 'ThreadEditStatus', targeting threadEditStatus.
class ThreadEditStatus < WikiAvro::XML::Inserter
  def initialize; super('ThreadEditStatus', 'threadEditStatus'); end
end

# Inserter for 'ThreadType', targeting threadType.
class ThreadType < WikiAvro::XML::Inserter
  def initialize; super('ThreadType', 'threadType'); end
end

# Inserter for 'ThreadSignature', targeting threadSignature.
class ThreadSignature < WikiAvro::XML::Inserter
  def initialize; super('ThreadSignature', 'threadSignature'); end
end
|
563
|
+
|
564
|
+
class DiscussionThreading < WikiAvro::XML::Element
|
565
|
+
attr_accessor :threadSubject, :threadParent, :threadAncestor,
|
566
|
+
:threadPage, :threadID, :threadSummaryPage,
|
567
|
+
:threadAuthor, :threadEditStatus, :threadType,
|
568
|
+
:threadSignature
|
569
|
+
def name
|
570
|
+
'DiscussionThreading'
|
571
|
+
end
|
572
|
+
|
573
|
+
# Clears every thread field captured from the previous element. Instance
# variables are assigned directly: a bare `threadSubject = nil` would only
# create a local variable and leave the old value in place.
def reset
  @threadSubject = nil
  @threadParent = nil
  @threadAncestor = nil
  @threadPage = nil
  @threadID = nil
  @threadSummaryPage = nil
  @threadAuthor = nil
  @threadEditStatus = nil
  @threadType = nil
  @threadSignature = nil
end
|
585
|
+
|
586
|
+
def handle_content(w, p, r)
|
587
|
+
w.lqt(threadSubject, threadParent, threadAncestor,
|
588
|
+
threadPage, threadID, threadSummaryPage,
|
589
|
+
threadAuthor, threadEditStatus, threadType,
|
590
|
+
threadSignature)
|
591
|
+
end
|
592
|
+
|
593
|
+
def initialize
|
594
|
+
super([ThreadSubject.new, ThreadParentGroup.new, ThreadPage.new,
|
595
|
+
ThreadID.new, ThreadSummaryPageGroup.new, ThreadAuthor.new,
|
596
|
+
ThreadEditStatus.new, ThreadType.new, ThreadSignature.new])
|
597
|
+
end
|
598
|
+
end
|
599
|
+
|
600
|
+
# Optional group holding DiscussionThreading with :min 0 and :max 1.
class DiscussionThreadingGroup < WikiAvro::XML::Group
  def optional?; true; end

  def initialize
    members = [{:element => DiscussionThreading.new, :min => 0, :max => 1}]
    super members
  end
end
|
609
|
+
|
610
|
+
class Page < WikiAvro::XML::Element
|
611
|
+
attr_accessor :title
|
612
|
+
attr_accessor :ns
|
613
|
+
attr_accessor :id
|
614
|
+
attr_accessor :redirect
|
615
|
+
attr_accessor :sha1
|
616
|
+
attr_accessor :revision_count
|
617
|
+
|
618
|
+
def name
|
619
|
+
'page'
|
620
|
+
end
|
621
|
+
|
622
|
+
# Clears everything captured from the previous page and restarts the
# revision counter at 0. Instance variables are assigned directly: a bare
# `redirect = nil` would only create a local variable, so for example a
# redirect target would carry over to later pages.
def reset
  @title = nil
  @ns = nil
  @id = nil
  @redirect = nil
  @sha1 = nil
  @revision_count = 0
end
|
631
|
+
|
632
|
+
def handle_content(w, p, r)
|
633
|
+
w.page(ns, id, title, redirect, sha1)
|
634
|
+
end
|
635
|
+
|
636
|
+
def initialize
|
637
|
+
super([Title.new, Ns.new, Id.new, PageFlags.new,
|
638
|
+
RevStream.new, DiscussionThreadingGroup.new,
|
639
|
+
RevStream.new])
|
640
|
+
end
|
641
|
+
end
|
642
|
+
|
643
|
+
# Stream built around a single Page parser.
class PageStream < WikiAvro::XML::Stream
  def initialize; super([Page.new]); end
end
|
648
|
+
|
649
|
+
# Root element named 'mediawiki', with SiteInfo and PageStream children.
class WikiDump < WikiAvro::XML::Element
  attr_reader :version

  def name
    'mediawiki'
  end

  protected

  # Records the version attribute and warns unless it is '0.6'.
  def parse_attributes(w, p, r)
    @version = r['version']
    warn 'dump version != 0.6' unless @version == '0.6'
  end

  # Signals the writer that the dump has been processed.
  def handle_content(w, p, r)
    w.done
  end

  def initialize
    children = [SiteInfo.new, PageStream.new]
    super(children)
  end
end
|
671
|
+
end
|
data/lib/wikiavro/xml.rb
ADDED
@@ -0,0 +1,282 @@
|
|
1
|
+
# Parser functions will assume to possibly start on their opening tag,
|
2
|
+
# and stop parsing right after their end tag. There're probably loads
|
3
|
+
# of bugs waiting for when you nest tags of the same name.
|
4
|
+
|
5
|
+
module WikiAvro::XML
|
6
|
+
# Advances +reader+ until it is on an opening or closing tag, starting
# with the current node.
#
# Returns true on an opening tag and false on a closing tag.
# Raises EOFError when the reader runs out of nodes first.
def self.to_tag(reader)
  loop do
    kind = reader.node_type
    return true if kind == XML::Reader::TYPE_ELEMENT
    return false if kind == XML::Reader::TYPE_END_ELEMENT

    break unless reader.read
  end

  # XML::Reader will probably raise its own exception before we ever
  # could get here
  raise EOFError.new('no opening tag')
end
|
25
|
+
|
26
|
+
# Do not call this while you are on the opening tag
|
27
|
+
# Moves +reader+ past the closing tag of the element called +name+,
# starting from a node inside that element. Every opening tag met on the
# way is reported through writer.skipped.
#
# Raises EOFError when the reader runs out of nodes before the closing
# tag is found; without this check the loop would never end.
def self.exit_tag(writer, reader, name)
  nest = 1

  loop do
    case reader.node_type
    when XML::Reader::TYPE_ELEMENT
      writer.skipped(reader.name)
      nest += 1 if reader.name == name
    when XML::Reader::TYPE_END_ELEMENT
      nest -= 1 if reader.name == name
    end
    more = reader.read
    break if nest == 0
    raise EOFError.new("no closing tag for #{name}") unless more
  end
end
|
49
|
+
|
50
|
+
# Call this to skip when reader is on the opening tag
|
51
|
+
# Skips the element +reader+ is currently on, leaving the reader on the
# node after it. When +skipping+ is truthy, the element (if empty) or
# every opening tag inside it is reported through writer.skipped.
# Returns nil.
def self.skip_tag(writer, reader, skipping)
  name = reader.name

  if reader.empty_element?
    writer.skipped(name) if skipping
    reader.read
    return
  end

  # Depth of open elements that share the skipped element's name.
  depth = 1
  while depth > 0 && reader.read
    case reader.node_type
    when XML::Reader::TYPE_ELEMENT
      writer.skipped(reader.name) if skipping
      depth += 1 if reader.name == name
    when XML::Reader::TYPE_END_ELEMENT
      depth -= 1 if reader.name == name
    end
  end

  # Step past the closing tag once it has been found.
  reader.read if depth == 0
  nil
end
|
80
|
+
|
81
|
+
# Raised when an expected element cannot be found.
# NOTE(review): inherits from Exception rather than StandardError, so a
# bare `rescue` does not catch it -- confirm this is intended.
class MissingElement < Exception; end
|
83
|
+
|
84
|
+
# Advances +reader+ to the next opening tag called +name+, skipping (and
# reporting as skipped) every other element met on the way.
#
# Raises MissingElement when a closing tag or the end of input is reached
# first.
def self.to_element(writer, reader, name)
  while WikiAvro::XML::to_tag(reader)
    return if reader.name == name

    WikiAvro::XML.skip_tag(writer, reader, true)

    # NOTE(review): skip_tag has already moved past the skipped element,
    # so this read advances one node further -- confirm that this cannot
    # step over an opening tag that directly follows.
    break unless reader.read
  end

  raise MissingElement.new(name)
end
|
100
|
+
|
101
|
+
# Base parser for one XML element with an ordered list of child parsers.
class Element
  # Value returned by parse_attributes during the most recent parse.
  attr_reader :attr

  # Tag name handled by this parser. Must be overridden.
  def name
    raise NotImplementedError.new('name')
  end

  def optional?
    false
  end

  # Parses one occurrence of this element.
  #
  # output - writer handed on to every hook and child.
  # parent - enclosing parser, or nil for the root element.
  # reader - XML reader.
  #
  # Raises RuntimeError when this is the root (+parent+ is nil) and the
  # reader is not on a node with this element's name.
  def parse(output, parent, reader)
    if parent.nil? && reader.name != self.name
      raise RuntimeError.new('reader.name != self.name')
    end
    WikiAvro::XML::to_element(output, reader, self.name)

    reset
    @attr = parse_attributes(output, parent, reader)
    parse_content(output, parent, reader)
    handle_content(output, parent, reader)
  end

  protected

  # Instances will be reused. Subclasses that keep state which needs
  # to be discarded after each parse should implement this.
  def reset
  end

  # Hook called while the reader is on the opening tag; parses nothing
  # by default.
  def parse_attributes(w, p, r)
  end

  # parse_content should move the reader away from the children's
  # parent's opening tag. It should leave reader positioned after
  # the closing tag.
  def parse_content(w, p, r)
    if r.empty_element?
      # An empty element cannot contain any required child.
      required = @children.find { |child| !child.optional? }
      raise MissingElement.new(required.name) if required
      r.read
      return
    end

    # Move away from our opening tag
    r.read
    @children.each { |child| child.parse(w, self, r) }

    if r.empty_element? && r.name == self.name
      r.read
    else
      WikiAvro::XML.exit_tag(w, r, self.name)
    end
  end

  # Hook called after the content has been parsed; does nothing by
  # default.
  def handle_content(w, p, r)
  end

  private

  # children - child parsers, in the order they are applied.
  def initialize(children)
    @children = children
  end
end
|
176
|
+
|
177
|
+
# An Element that has no child parsers of its own.
class Leaf < Element
  # Hands an empty child list to Element#initialize.
  def initialize
    super([])
  end
end
|
182
|
+
|
183
|
+
# Leaf element that hands its text to the parent through a setter.
class Inserter < Leaf
  attr_reader :name

  # name:   tag name this inserter matches.
  # target: base name of the parent's setter; "#{target}=" is what gets
  #         called. Defaults to +name+.
  def initialize(name, target=name)
    super()
    @name = name
    @writer = (target + '=').to_sym
  end

  # Sends the reader's read_string result to the parent's setter, then
  # lets skip_tag (with skipping disabled) move past this element.
  def parse_content(w, p, r)
    text = r.read_string
    p.send(@writer, text)
    WikiAvro::XML.skip_tag(w, r, false)
  end
end
|
199
|
+
|
200
|
+
# Parses any number of known elements, in any order, dispatching each
# tag to the parser registered under that tag's name.
class Stream
  # elements: parsers to register; each is keyed by its own +name+.
  def initialize(elements)
    @elements = {}
    elements.each do |element|
      @elements[element.name] = element
    end
  end

  # A stream may match nothing at all, so it is always optional.
  def optional?
    true
  end

  # Keeps handing tags found by to_tag to their registered parser and
  # advancing the reader one node after each. Stops, leaving the reader
  # where it is, at the first tag with no registered parser or when
  # to_tag reports nothing more. Always returns nil.
  def parse(output, parent, reader)
    while WikiAvro::XML.to_tag(reader)
      handler = @elements[reader.name]
      return if handler.nil?

      handler.parse(output, parent, reader)
      reader.read
    end
  end
end
|
231
|
+
|
232
|
+
# Raised when an element occurs more often than its declared maximum.
# The message is the name of the offending element.
#
# Derives from StandardError (not Exception) so that a plain `rescue`
# can handle it; `rescue TooManyElements` and `rescue Exception` keep
# working as before.
class TooManyElements < StandardError
end
|
234
|
+
|
235
|
+
# Raised when an element occurs less often than its declared minimum.
# The message is the name of the element that fell short.
#
# Derives from StandardError (not Exception) so that a plain `rescue`
# can handle it; `rescue TooFewElements` and `rescue Exception` keep
# working as before.
class TooFewElements < StandardError
end
|
237
|
+
|
238
|
+
# Parses a run of known elements in any order while enforcing, per
# element name, a minimum and maximum number of occurrences.
class Group
  # elements: list of hashes, each with
  #   :element - the parser (must answer +name+ and +parse+),
  #   :min     - fewest occurrences allowed (missing/nil counts as 0),
  #   :max     - most occurrences allowed (missing/nil means unbounded).
  def initialize(elements)
    @elements = {}
    @n = Hash.new 0
    elements.each do |e|
      name = e[:element].name
      @elements[name] = e
      @n[name] = 0
    end
  end

  # remember to override this if untrue, especially if it might be
  # within an empty element
  def optional?
    false
  end

  # Consumes tags found by to_tag for as long as they belong to this
  # group, delegating each to its parser and advancing the reader one
  # node afterwards.
  #
  # The occurrence counts are validated however the run ends: on the
  # first tag that is not part of the group, and also when to_tag
  # reports that there is nothing further. The latter case previously
  # skipped validation, letting a missing mandatory element through.
  #
  # Raises TooManyElements as soon as a name exceeds its maximum and
  # TooFewElements if a name ends up below its minimum. Returns nil.
  def parse(output, parent, reader)
    # Instances are reused, so start every parse from zero.
    @elements.each_key { |k| @n[k] = 0 }

    while WikiAvro::XML.to_tag(reader)
      name = reader.name
      e = @elements[name]
      break if e.nil?

      @n[name] += 1
      raise TooManyElements, name if @n[name] > upper_bound(e)

      e[:element].parse(output, parent, reader)
      reader.read
    end

    verify_counts
    nil
  end

  private

  # Checks every registered name against its bounds.
  def verify_counts
    @elements.each do |k, v|
      raise TooFewElements, k if @n[k] < (v[:min] || 0)
      # this ought to be a redundant check
      raise TooManyElements, k if @n[k] > upper_bound(v)
    end
  end

  # Maximum for one spec; an absent :max places no limit.
  def upper_bound(spec)
    spec[:max] || Float::INFINITY
  end
end
|
282
|
+
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wikiavro
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Someon
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-03-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: libxml-ruby
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: avro
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.7'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.7'
|
41
|
+
description:
|
42
|
+
email: someon@openmailbox.org
|
43
|
+
executables:
|
44
|
+
- wikiavro
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- bin/wikiavro
|
49
|
+
- lib/wikiavro.rb
|
50
|
+
- lib/wikiavro/avro.rb
|
51
|
+
- lib/wikiavro/mediawiki.rb
|
52
|
+
- lib/wikiavro/xml.rb
|
53
|
+
homepage:
|
54
|
+
licenses:
|
55
|
+
- GPL-3.0+
|
56
|
+
metadata: {}
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
requirements: []
|
72
|
+
rubyforge_project:
|
73
|
+
rubygems_version: 2.2.2
|
74
|
+
signing_key:
|
75
|
+
specification_version: 4
|
76
|
+
summary: Convert MediaWiki XML dumps to Avro
|
77
|
+
test_files: []
|