wikiavro 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/wikiavro +90 -0
- data/lib/wikiavro.rb +6 -0
- data/lib/wikiavro/avro.rb +199 -0
- data/lib/wikiavro/mediawiki.rb +671 -0
- data/lib/wikiavro/xml.rb +282 -0
- metadata +77 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c64ef5f767535d9432d8f589411f76dedb6a4b04
|
4
|
+
data.tar.gz: b846af138fe797fa09e9f248bcdb3298e988b5fb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b5f7cba59679bc913ceaddf95e97f8ed7c63d9f54660d81680db419da09abf2c61ddadd2f2a75b0e199fbfe9f710363edcd889550f76f44acdd6a34e591f69a2
|
7
|
+
data.tar.gz: 8b63a6ae13ce31cc31eb2f5698a0bd586c9645dfbce1be6ae52e45f90315af3015ed1e1f17badaa4f06425483369a369af6adefd0dae2c8ef0555d8f05193f01
|
data/bin/wikiavro
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Converts a MediaWiki XML dump (INFILE argument, or stdin via ARGF) into four
# Avro files: namespaces, pages, revisions and LiquidThreads records.

require 'wikiavro'
require 'xml'
require 'optparse'

# Top-level Version constant; OptionParser consults it for --version.
Version = WikiAvro::Version

# Defaults, overridden by the command-line switches below.
logger = WikiAvro::MediaWiki::RevisionProgress.new 10000
overwrite = false
deflate = false
ns = nil
page = nil
rev = nil
lqt = nil

opt_parser = OptionParser.new do |opts|
  opts.banner = 'Usage: wikiavro [options] [INFILE]'

  opts.on('-q', '--quiet', "Don't report progress") do |v|
    logger = WikiAvro::MediaWiki::NoProgress.new
  end

  opts.on('-v LEVEL', '--verbose=LEVEL', Integer,
          'Report progress after every LEVEL revisions parsed') do |lvl|
    logger = WikiAvro::MediaWiki::RevisionProgress.new lvl.to_i
  end

  opts.on('-o', '--overwrite') do |o|
    overwrite = o
  end

  opts.on('-d', '--deflate', 'Enable Avro internal compression') do |d|
    deflate = d
  end

  opts.on('-n OUTFILE', '--namespaces=OUTFILE') do |path|
    ns = path
  end

  opts.on('-p OUTFILE', '--pages=OUTFILE') do |path|
    page = path
  end

  opts.on('-r OUTFILE', '--revisions=OUTFILE') do |path|
    rev = path
  end

  opts.on('-l OUTFILE', '--liquidthreads=OUTFILE') do |path|
    lqt = path
  end
end

opt_parser.parse!

# All four output paths are mandatory.
abort('You must specify --namespaces') if ns.nil?
abort('You must specify --pages') if page.nil?
abort('You must specify --revisions') if rev.nil?
abort('You must specify --liquidthreads') if lqt.nil?

# Refuse to clobber existing files unless --overwrite was given.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless overwrite
  [ns, page, rev, lqt].each do |path|
    if File.exist? path
      abort("#{path} already exists! pass --overwrite to proceed anyway")
    end
  end
end

# Writers are tracked as they are opened so that the ensure clause closes
# every one of them even when opening a later file or parsing fails.
open_writers = []
begin
  ns_writer = WikiAvro::Avro::NamespaceWriter.new ns, deflate
  open_writers << ns_writer
  page_writer = WikiAvro::Avro::PageWriter.new page, deflate
  open_writers << page_writer
  rev_writer = WikiAvro::Avro::RevisionWriter.new rev, deflate
  open_writers << rev_writer
  lqt_writer = WikiAvro::Avro::LqtWriter.new lqt, deflate
  open_writers << lqt_writer

  writer = WikiAvro::MediaWiki::WikiWriter.new :logger => logger,
                                               :namespace => ns_writer,
                                               :page => page_writer,
                                               :revision => rev_writer,
                                               :lqt => lqt_writer

  mw = WikiAvro::MediaWiki::WikiDump.new

  xml = XML::Reader.io(ARGF)
  # Advance to the first node so the dump parser starts on the root element.
  xml.read

  mw.parse(writer, nil, xml)
ensure
  open_writers.each(&:close)
end
|
data/lib/wikiavro/avro.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
require 'avro'
|
2
|
+
|
3
|
+
module WikiAvro::Avro
|
4
|
+
NAMESPACE_SCHEMA = <<-EOS
|
5
|
+
{
|
6
|
+
"namespace": "org.rationalwiki",
|
7
|
+
"name": "Namespace",
|
8
|
+
"type": "record",
|
9
|
+
"fields": [
|
10
|
+
{"name": "key", "type": "int"},
|
11
|
+
{"name": "case", "type": "string"},
|
12
|
+
{"name": "name", "type": "string"}
|
13
|
+
]
|
14
|
+
}
|
15
|
+
EOS
|
16
|
+
|
17
|
+
PAGE_SCHEMA = <<-EOS
|
18
|
+
{
|
19
|
+
"namespace": "org.rationalwiki",
|
20
|
+
"name": "Page",
|
21
|
+
"type": "record",
|
22
|
+
"fields": [
|
23
|
+
{"name": "id", "type": "long"},
|
24
|
+
{"name": "ns", "type": "long"},
|
25
|
+
{"name": "title", "type": "string"},
|
26
|
+
{"name": "redirect", "type": ["null", "string"]},
|
27
|
+
{"name": "sha1", "type": ["null", "string"]}
|
28
|
+
]
|
29
|
+
}
|
30
|
+
EOS
|
31
|
+
|
32
|
+
REVISION_SCHEMA = <<-EOS
|
33
|
+
{
|
34
|
+
"namespace": "org.rationalwiki",
|
35
|
+
"name": "Revision",
|
36
|
+
"type": "record",
|
37
|
+
"fields": [
|
38
|
+
{"name": "id", "type": "long"},
|
39
|
+
{"name": "page_id", "type": "long"},
|
40
|
+
{"name": "n", "type": "long"},
|
41
|
+
{"name": "timestamp", "type": "string"},
|
42
|
+
{"name": "contributor", "type": ["null", {
|
43
|
+
"namespace": "org.rationalwiki",
|
44
|
+
"name": "Contributor",
|
45
|
+
"type": "record",
|
46
|
+
"fields": [
|
47
|
+
{"name": "id", "type": ["null", "long"]},
|
48
|
+
{"name": "username", "type": ["null", "string"]},
|
49
|
+
{"name": "ip", "type": ["null", "string"]}
|
50
|
+
]
|
51
|
+
}]},
|
52
|
+
{"name": "minor", "type": "boolean"},
|
53
|
+
{"name": "comment", "type": ["null", "string"]},
|
54
|
+
{"name": "bytes", "type": "long"},
|
55
|
+
{"name": "textid", "type": ["null", "string"]},
|
56
|
+
{"name": "text", "type": ["null", "string"]}
|
57
|
+
]
|
58
|
+
}
|
59
|
+
EOS
|
60
|
+
|
61
|
+
LQT_SCHEMA = <<-EOS
|
62
|
+
{
|
63
|
+
"namespace": "org.rationalwiki",
|
64
|
+
"name": "Threading",
|
65
|
+
"type": "record",
|
66
|
+
"fields": [
|
67
|
+
{"name": "subject", "type": "string"},
|
68
|
+
{"name": "parent", "type": ["null", "long"]},
|
69
|
+
{"name": "ancestor", "type": ["null", "long"]},
|
70
|
+
{"name": "page", "type": "string"},
|
71
|
+
{"name": "id", "type": "long"},
|
72
|
+
{"name": "summary_page", "type": ["null", "string"]},
|
73
|
+
{"name": "author", "type": "string"},
|
74
|
+
{"name": "edit_status", "type": "string"},
|
75
|
+
{"name": "type", "type": "string"},
|
76
|
+
{"name": "signature", "type": ["null", "string"]}
|
77
|
+
]
|
78
|
+
}
|
79
|
+
EOS
|
80
|
+
|
81
|
+
# Base class for the Avro output writers. Subclasses provide the JSON schema
# through #schema and pass complete records to #encode.
class AvroWriter
  # Returns the Avro schema (JSON text); subclasses must override.
  def schema
    raise NotImplementedError
  end

  # Closes the underlying Avro data file.
  def close
    @writer.close
  end

  protected

  # Appends one record to the data file.
  def encode(data)
    @writer << data
  end

  # Opens +path+ for writing with this writer's schema; when +deflate+ is
  # truthy the 'deflate' codec is requested.
  def initialize(path, deflate=false)
    @writer = if deflate
                Avro::DataFile.open(path, 'w', schema, 'deflate')
              else
                Avro::DataFile.open(path, 'w', schema)
              end
  end
end
|
104
|
+
|
105
|
+
# Writes namespace records using NAMESPACE_SCHEMA.
class NamespaceWriter < AvroWriter
  def schema
    NAMESPACE_SCHEMA
  end

  # Encodes one namespace; +key+ is coerced to an integer and a nil +name+
  # is stored as the empty string.
  def write(key, casetype, name)
    record = {
      'key' => key.to_i,
      'case' => casetype,
      'name' => name || ''
    }
    encode record
  end
end
|
116
|
+
|
117
|
+
# Writes page records using PAGE_SCHEMA.
class PageWriter < AvroWriter
  def schema
    PAGE_SCHEMA
  end

  # Encodes one page; +id+ and +ns+ are coerced to integers, the remaining
  # values are stored as given.
  def write(ns, id, title, redirect, sha1)
    record = {
      'id' => id.to_i,
      'ns' => ns.to_i,
      'title' => title,
      'redirect' => redirect,
      'sha1' => sha1
    }
    encode record
  end
end
|
130
|
+
|
131
|
+
# Writes revision records using REVISION_SCHEMA.
class RevisionWriter < AvroWriter
  def schema
    REVISION_SCHEMA
  end

  # Encodes one revision.
  #
  # +contributor+ is a Hash with :deleted, :id, :username and :ip keys;
  # +comment+ is a Hash with :deleted and :comment keys, or nil when the
  # revision carried no comment. Numeric fields are coerced with to_i.
  #
  # Raises RuntimeError when a contributor or comment is flagged as deleted
  # but still carries content.
  def write(id, page_id, n, timestamp, contributor, minor,
            comment, text_deleted, bytes, textid, text)
    encode 'id' => id.to_i,
           'page_id' => page_id.to_i,
           'n' => n.to_i,
           'timestamp' => timestamp,
           'contributor' => contributor_record(contributor),
           # NOTE(review): this is true when no value was captured for
           # <minor>, which looks inverted - confirm what the reader yields
           # for an empty <minor/> element before changing it.
           'minor' => minor.nil?,
           'comment' => comment_text(comment),
           'bytes' => bytes.to_i,
           'textid' => textid,
           # Text flagged as deleted is never written out.
           'text' => (text_deleted.nil? ? text : nil)
  end

  private

  # Builds the nested Contributor record, or nil for a deleted contributor.
  # The caller's hash is left untouched, and a missing id stays nil (the
  # schema declares it nullable) instead of being coerced to 0.
  def contributor_record(contributor)
    user_id = contributor[:id]
    username = contributor[:username]
    ip = contributor[:ip]

    unless contributor[:deleted].nil?
      unless user_id.nil? && username.nil? && ip.nil?
        raise 'deleted contributor has content'
      end
      return nil
    end

    {
      'username' => username,
      'id' => user_id.nil? ? nil : user_id.to_i,
      'ip' => ip
    }
  end

  # Extracts the comment text; nil when there is no comment or it is deleted.
  def comment_text(comment)
    return nil if comment.nil?
    return comment[:comment] if comment[:deleted].nil?

    raise 'deleted comment has content' if comment[:comment]
    nil
  end
end
|
176
|
+
|
177
|
+
# Writes LiquidThreads records using LQT_SCHEMA.
class LqtWriter < AvroWriter
  def schema
    LQT_SCHEMA
  end

  # Encodes one thread record. +id+ is always coerced to an integer;
  # +parent+ and +ancestor+ are coerced only when present.
  def write(subject, parent, ancestor, page, id, summary_page,
            author, edit_status, type, signature)
    record = {
      'subject' => subject,
      'parent' => parent && parent.to_i,
      'ancestor' => ancestor && ancestor.to_i,
      'page' => page,
      'id' => id.to_i,
      'summary_page' => summary_page,
      'author' => author,
      'edit_status' => edit_status,
      'type' => type,
      'signature' => signature
    }
    encode record
  end
end
|
199
|
+
end
|
data/lib/wikiavro/mediawiki.rb
ADDED
@@ -0,0 +1,671 @@
|
|
1
|
+
require 'wikiavro/xml'
|
2
|
+
|
3
|
+
# RW declares schema 0.6 but has redirects as in 0.7
|
4
|
+
#
|
5
|
+
# RW has <sha1> tags right after <redirect> - they are indented to a
|
6
|
+
# different level too. The <sha1>s are missing within <revision>s
|
7
|
+
# where they should be.
|
8
|
+
#
|
9
|
+
# Schema claims discussionthreadinginfo, but actual tag is
|
10
|
+
# DiscussionThreading. Schema does not describe ThreadSummaryPage or
|
11
|
+
# ThreadSignature. Schema does not describe which LQT tags are
|
12
|
+
# omissible. Schema says thread info should always come after
|
13
|
+
# revisions, but it does not.
|
14
|
+
|
15
|
+
module WikiAvro::MediaWiki
|
16
|
+
# Debug writer: prints each namespace to standard output.
class NamespacePrinter
  def write(key, casetype, name)
    line = format('namespace %s: "%s" %s', key, name, casetype)
    puts line
  end
end
|
21
|
+
|
22
|
+
# Debug writer: prints each page to standard output.
class PagePrinter
  def write(ns, id, title, redirect, sha1)
    line = format('page "%s": %s %s %s %s', title, id, ns, redirect, sha1)
    puts line
  end
end
|
27
|
+
|
28
|
+
# Debug writer: prints a one-line summary of each revision.
class RevisionPrinter
  def write(id, page_id, n, timestamp, contributor, minor,
            comment, text_deleted, bytes, textid, text)
    author = contributor[:username]
    puts "rev #{page_id} #{n}: #{timestamp} #{bytes} #{author}"
  end
end
|
35
|
+
|
36
|
+
# Debug writer: prints a one-line summary of each LiquidThreads record.
class LqtPrinter
  def write(threadSubject, threadParent, threadAncestor,
            threadPage, threadID, threadSummaryPage,
            threadAuthor, threadEditStatus, threadType,
            threadSignature)
    relation = "#{threadSubject} #{threadParent} #{threadAncestor}"
    status = "#{threadAuthor} #{threadEditStatus} #{threadType}"
    puts "thread #{relation} #{status}"
  end
end
|
45
|
+
|
46
|
+
# Null object used when no real writer is configured: it accepts any
# message and returns nil.
class NullWriter
  def method_missing(target, *args, &block)
    # All methods return nil
  end

  # Keeps respond_to? consistent with method_missing, which accepts
  # every message.
  def respond_to_missing?(name, include_private = false)
    true
  end

  def initialize
  end
end
|
54
|
+
|
55
|
+
# Facade handed to the parser: forwards parsed records to the configured
# per-type writers and reports progress to the logger.
class WikiWriter
  # +writers+ is a Hash with optional :namespace, :logger, :page, :revision
  # and :lqt entries. Missing entries fall back to NamespacePrinter,
  # NoProgress and NullWriter respectively.
  def initialize(writers)
    @namespace = writers[:namespace] || NamespacePrinter.new
    @logger = writers[:logger] || NoProgress.new
    @page = writers[:page] || NullWriter.new
    @revision = writers[:revision] || NullWriter.new
    @lqt = writers[:lqt] || NullWriter.new
  end

  # Forwards a namespace record.
  def namespace(key, casetype, name)
    @namespace.write(key, casetype, name)
  end

  # Counts one page with the logger, then forwards the page record.
  def page(ns, id, title, redirect, sha1)
    @logger.report_pages(1)
    @page.write(ns, id, title, redirect, sha1)
  end

  # Counts one revision with the logger, then forwards the revision record.
  def revision(id, page_id, n, timestamp, contributor, minor,
               comment, text_deleted, bytes, textid, text)
    @logger.report_revisions(1)
    @revision.write(id, page_id, n, timestamp, contributor, minor,
                    comment, text_deleted, bytes, textid, text)
  end

  # Forwards a LiquidThreads record.
  def lqt(threadSubject, threadParent, threadAncestor,
          threadPage, threadID, threadSummaryPage,
          threadAuthor, threadEditStatus, threadType,
          threadSignature)
    @lqt.write(threadSubject, threadParent, threadAncestor,
               threadPage, threadID, threadSummaryPage,
               threadAuthor, threadEditStatus, threadType,
               threadSignature)
  end

  # Tells the logger that parsing has finished.
  def done
    @logger.report_done
  end

  # Tells the logger that the element called +name+ was skipped.
  def skipped(name)
    @logger.report_skipped_element(name)
  end
end
|
101
|
+
|
102
|
+
# Silent logger: implements the progress-reporting interface with no-ops.
class NoProgress
  def report_pages(n); end

  def report_revisions(n); end

  def report_done; end

  def report_skipped_element(name); end
end
|
115
|
+
|
116
|
+
# Logger that counts pages, revisions and skipped elements, and prints a
# summary once parsing is done.
class FinalProgress
  # Formats a count with comma thousands separators, for example
  # 1234567 -> "1,234,567". Always returns a String; values below 1 are
  # returned in their plain string form.
  def f(n)
    return n.to_s if n < 1

    # Group digits in threes from the right by working on the reversed text.
    n.to_i.to_s.reverse.scan(/\d{1,3}/).join(',').reverse
  end

  # Total number of skipped elements across all element names.
  def total_skipped
    @skipped_counts.values.reduce(0, :+)
  end

  def report_pages(n)
    @pages += n
  end

  def report_revisions(n)
    @revisions += n
  end

  def report_skipped_element(name)
    @skipped_counts[name] += 1
  end

  # Prints one "name: count" line per skipped element name.
  def show_skipped
    @skipped_counts.each do |name, count|
      puts "#{name}: #{count}"
    end
  end

  # Prints the skipped-element breakdown (if any) and the total run time
  # with the average revisions-per-second rate.
  def report_done
    duration = Time.now - @start_time
    # A zero duration would make the rate infinite or NaN, and rounding
    # that raises FloatDomainError; report 0 in that case.
    avg_rate = duration > 0 ? (@revisions / duration).round(0) : 0
    h = (duration / 60 / 60).floor
    m = (duration % (60 * 60) / 60).floor
    s = (duration % 60).floor
    # FIXME: Print to STDERR or some log
    skipped = total_skipped
    if skipped > 0
      puts "Couldn't process #{skipped} elements! Detailed breakdown:"
      show_skipped
    end
    puts "Done! Took #{h}h#{m}m#{s}s. Averaged #{f avg_rate} rps."
  end

  def initialize
    @start_time = Time.now
    @pages = 0
    @revisions = 0
    @skipped_counts = Hash.new 0
  end
end
|
179
|
+
|
180
|
+
# Logger that additionally prints a progress line every +interval+ revisions.
class RevisionProgress < FinalProgress
  # Prints the current page/revision counts, the rate since the previous
  # announcement and the skipped-element breakdown, then starts a new
  # measuring window.
  def announce_progress
    now = Time.now
    elapsed = now - @previous_time
    # Guard against a zero-length window: the division would yield
    # Infinity or NaN and rounding that raises FloatDomainError.
    rps = elapsed > 0 ? ((@revisions - @previous_revisions) / elapsed).round(0) : 0
    puts "Page #{f @pages}, rev #{f @revisions} (#{f rps} rps)"
    skipped = total_skipped
    puts "#{f skipped} unprocessable elements so far."
    show_skipped
    @previous_time = now
    @previous_revisions = @revisions
  end

  # Counts the revisions and announces progress once a full interval has
  # accumulated since the last announcement.
  def report_revisions(n)
    super(n)

    if @revisions - @previous_revisions >= @interval
      announce_progress
    end
  end

  # Prints a final progress line before the inherited summary.
  def report_done
    announce_progress
    super
  end

  def initialize(interval)
    super()
    @interval = interval
    @previous_time = @start_time
    @previous_revisions = 0
  end
end
|
212
|
+
|
213
|
+
# Parses one <namespace> element and reports it to the writer.
class Namespace < WikiAvro::XML::Leaf
  def name
    'namespace'
  end

  # Nothing to clear: @key and @case are overwritten on every parse.
  def reset
  end

  # Remembers the element's 'key' and 'case' attributes.
  def parse_attributes(w, p, r)
    @key = r['key']
    @case = r['case']
  end

  # Reads the element text as the namespace name, moves past the element
  # and emits the namespace record.
  def parse_content(w, p, r)
    label = r.read_string
    WikiAvro::XML.skip_tag(w, r, false)
    w.namespace(@key, @case, label)
  end
end
|
233
|
+
|
234
|
+
# Stream built around a single Namespace parser.
class NamespaceStream < WikiAvro::XML::Stream
  def initialize
    super [Namespace.new]
  end
end
|
239
|
+
|
240
|
+
# Stores the text of <sitename> through the parent's sitename= writer.
class Sitename < WikiAvro::XML::Inserter
  def initialize
    super 'sitename'
  end
end
|
245
|
+
|
246
|
+
# Stores the text of <base> through the parent's base= writer.
class Base < WikiAvro::XML::Inserter
  def initialize
    super 'base'
  end
end
|
251
|
+
|
252
|
+
# Stores the text of <generator> through the parent's generator= writer.
class Generator < WikiAvro::XML::Inserter
  def initialize
    super 'generator'
  end
end
|
257
|
+
|
258
|
+
# Stores the text of <case> through the parent's case= writer.
class Case < WikiAvro::XML::Inserter
  def initialize
    super 'case'
  end
end
|
263
|
+
|
264
|
+
# The <namespaces> container; its only child is a NamespaceStream.
class Namespaces < WikiAvro::XML::Element
  def name
    'namespaces'
  end

  def initialize
    super [NamespaceStream.new]
  end
end
|
273
|
+
|
274
|
+
# The <siteinfo> element: collects sitename, base, generator and case from
# its children and contains the namespace list.
class SiteInfo < WikiAvro::XML::Element
  attr_accessor :sitename, :base, :generator, :case

  def name
    'siteinfo'
  end

  # Clears everything collected during the previous parse.
  def reset
    @sitename = @base = @generator = @case = nil
  end

  def initialize
    children = [Sitename.new, Base.new, Generator.new,
                Case.new, Namespaces.new]
    super(children)
  end
end
|
296
|
+
|
297
|
+
# Stores the text of <title> through the parent's title= writer.
class Title < WikiAvro::XML::Inserter
  def initialize
    super 'title'
  end
end
|
302
|
+
|
303
|
+
# Stores the text of <ns> through the parent's ns= writer.
class Ns < WikiAvro::XML::Inserter
  def initialize
    super 'ns'
  end
end
|
308
|
+
|
309
|
+
# Stores the text of <id> through the parent's id= writer.
class Id < WikiAvro::XML::Inserter
  def initialize
    super 'id'
  end
end
|
314
|
+
|
315
|
+
# Parses <redirect>: its 'title' attribute becomes the parent's redirect.
class Redirect < WikiAvro::XML::Leaf
  def name
    'redirect'
  end

  def parse_attributes(w, p, r)
    target = r['title']
    p.redirect = target
  end
end
|
325
|
+
|
326
|
+
# Stores the text of <sha1> through the parent's sha1= writer.
class Sha1 < WikiAvro::XML::Inserter
  def initialize
    super 'sha1'
  end
end
|
331
|
+
|
332
|
+
# Group holding the page-level <redirect> and <sha1> elements.
# NOTE(review): Group is defined outside this view; :min/:max presumably
# bound how often each element may occur - confirm.
class PageFlags < WikiAvro::XML::Group
  def initialize
    specs = [{:element => Redirect.new, :min => 0, :max => 1},
             {:element => Sha1.new, :min => 0, :max => 1}]
    super specs
  end
end
|
338
|
+
|
339
|
+
# Stores the text of <timestamp> through the parent's timestamp= writer.
class Timestamp < WikiAvro::XML::Inserter
  def initialize
    super 'timestamp'
  end
end
|
344
|
+
|
345
|
+
# Stores the text of <username> through the parent's username= writer.
class Username < WikiAvro::XML::Inserter
  def initialize
    super 'username'
  end
end
|
350
|
+
|
351
|
+
# Stores the text of <ip> through the parent's ip= writer.
class Ip < WikiAvro::XML::Inserter
  def initialize
    super 'ip'
  end
end
|
356
|
+
|
357
|
+
# Optional group holding the <username>, <id> and <ip> elements of a
# contributor.
# NOTE(review): Group is defined outside this view; :min/:max presumably
# bound how often each element may occur - confirm.
class ContributorGroup < WikiAvro::XML::Group
  def optional?
    true
  end

  def initialize
    specs = [{:element => Username.new, :min => 0, :max => 1},
             {:element => Id.new, :min => 0, :max => 1},
             {:element => Ip.new, :min => 0, :max => 1}]
    super specs
  end
end
|
368
|
+
|
369
|
+
# Parses <contributor> and hands the collected details to the parent as a
# Hash with :deleted, :id, :username and :ip keys.
class Contributor < WikiAvro::XML::Element
  attr_accessor :id, :username, :ip

  def name
    'contributor'
  end

  # Clears everything collected during the previous parse.
  def reset
    @id = @username = @ip = @deleted = nil
  end

  # Remembers the element's 'deleted' attribute.
  def parse_attributes(w, p, r)
    @deleted = r['deleted']
  end

  def handle_content(w, p, r)
    details = {:deleted => @deleted, :id => id,
               :username => username, :ip => ip}
    p.contributor = details
  end

  def initialize
    super [ContributorGroup.new]
  end
end
|
398
|
+
|
399
|
+
# Stores the text of <minor> through the parent's minor= writer.
class Minor < WikiAvro::XML::Inserter
  def initialize
    super 'minor'
  end
end
|
404
|
+
|
405
|
+
# Parses <comment> and hands the parent a Hash with the 'deleted'
# attribute (:deleted) and the element text (:comment).
class Comment < WikiAvro::XML::Leaf
  def name
    'comment'
  end

  # The text is read here, while the reader is still on the opening tag.
  def parse_attributes(w, p, r)
    p.comment = {:deleted => r['deleted'],
                 :comment => r.read_string}
  end
end
|
417
|
+
|
418
|
+
# Group holding the revision-level <minor> and <comment> elements.
# NOTE(review): Group is defined outside this view; :min/:max presumably
# bound how often each element may occur - confirm.
class RevisionFlags < WikiAvro::XML::Group
  def initialize
    specs = [{:element => Minor.new, :min => 0, :max => 1},
             {:element => Comment.new, :min => 0, :max => 1}]
    super specs
  end
end
|
424
|
+
|
425
|
+
# Stores the text of <text> through the parent's text= writer and copies
# the element's 'deleted', 'id' and 'bytes' attributes onto the parent.
class Text < WikiAvro::XML::Inserter
  def parse_attributes(w, p, r)
    deleted_flag = r['deleted']
    p.text_deleted = deleted_flag
    text_id = r['id']
    p.textid = text_id
    byte_count = r['bytes']
    p.bytes = byte_count
  end

  def initialize
    super 'text'
  end
end
|
436
|
+
|
437
|
+
# Parses one <revision> element and reports it to the writer together with
# its position within the enclosing page.
class Revision < WikiAvro::XML::Element
  attr_accessor :id, :timestamp, :contributor, :minor, :comment,
                :text_deleted, :bytes, :textid, :text

  def name
    'revision'
  end

  # Clears the state left over from the previous revision. The instance
  # variables are assigned directly: a bare `id = nil` would only create a
  # local variable and leave the attribute untouched.
  def reset
    @id = nil
    @timestamp = nil
    @contributor = nil
    @minor = nil
    # Same Hash shape that Comment assigns, so a revision without a
    # <comment> element reads as "no comment" instead of nil.
    @comment = {:deleted => nil, :comment => nil}
    @text_deleted = nil
    @bytes = nil
    @textid = nil
    @text = nil
  end

  # Increments the parent's revision counter and emits the revision, using
  # the new count as its sequence number.
  def handle_content(w, p, r)
    p.revision_count += 1
    n = p.revision_count
    w.revision(id, p.id, n, timestamp, contributor, minor,
               comment, text_deleted, bytes, textid, text)
  end

  def initialize
    super([Id.new, Timestamp.new, Contributor.new,
           RevisionFlags.new, Text.new])
  end
end
|
476
|
+
|
477
|
+
# Stream built around a single Revision parser.
class RevStream < WikiAvro::XML::Stream
  def initialize
    super [Revision.new]
  end
end
|
482
|
+
|
483
|
+
# Stores the text of <ThreadSubject> through the parent's threadSubject= writer.
class ThreadSubject < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadSubject', 'threadSubject'
  end
end
|
488
|
+
|
489
|
+
# Stores the text of <ThreadParent> through the parent's threadParent= writer.
class ThreadParent < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadParent', 'threadParent'
  end
end
|
494
|
+
|
495
|
+
# Stores the text of <ThreadAncestor> through the parent's threadAncestor= writer.
class ThreadAncestor < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadAncestor', 'threadAncestor'
  end
end
|
500
|
+
|
501
|
+
# Optional group holding <ThreadParent> and <ThreadAncestor>.
# NOTE(review): Group is defined outside this view; :min/:max presumably
# bound how often each element may occur - confirm.
class ThreadParentGroup < WikiAvro::XML::Group
  def optional?
    true
  end

  def initialize
    specs = [{:element => ThreadParent.new, :min => 0, :max => 1},
             {:element => ThreadAncestor.new, :min => 0, :max => 1}]
    super specs
  end
end
|
511
|
+
|
512
|
+
# Stores the text of <ThreadPage> through the parent's threadPage= writer.
class ThreadPage < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadPage', 'threadPage'
  end
end
|
517
|
+
|
518
|
+
# Stores the text of <ThreadID> through the parent's threadID= writer.
class ThreadID < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadID', 'threadID'
  end
end
|
523
|
+
|
524
|
+
# Stores the text of <ThreadSummaryPage> through the parent's
# threadSummaryPage= writer.
class ThreadSummaryPage < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadSummaryPage', 'threadSummaryPage'
  end
end
|
529
|
+
|
530
|
+
# Optional group holding <ThreadSummaryPage>.
# NOTE(review): Group is defined outside this view; :min/:max presumably
# bound how often the element may occur - confirm.
class ThreadSummaryPageGroup < WikiAvro::XML::Group
  def optional?
    true
  end

  def initialize
    specs = [{:element => ThreadSummaryPage.new, :min => 0, :max => 1}]
    super specs
  end
end
|
539
|
+
|
540
|
+
# Stores the text of <ThreadAuthor> through the parent's threadAuthor= writer.
class ThreadAuthor < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadAuthor', 'threadAuthor'
  end
end
|
545
|
+
|
546
|
+
# Stores the text of <ThreadEditStatus> through the parent's
# threadEditStatus= writer.
class ThreadEditStatus < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadEditStatus', 'threadEditStatus'
  end
end
|
551
|
+
|
552
|
+
# Stores the text of <ThreadType> through the parent's threadType= writer.
class ThreadType < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadType', 'threadType'
  end
end
|
557
|
+
|
558
|
+
# Stores the text of <ThreadSignature> through the parent's
# threadSignature= writer.
class ThreadSignature < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadSignature', 'threadSignature'
  end
end
|
563
|
+
|
564
|
+
# Parses one <DiscussionThreading> element (LiquidThreads metadata) and
# reports it to the writer.
class DiscussionThreading < WikiAvro::XML::Element
  attr_accessor :threadSubject, :threadParent, :threadAncestor,
                :threadPage, :threadID, :threadSummaryPage,
                :threadAuthor, :threadEditStatus, :threadType,
                :threadSignature

  def name
    'DiscussionThreading'
  end

  # Clears the state left over from the previous thread. The instance
  # variables are assigned directly: a bare `threadParent = nil` would only
  # create a local variable, so optional values such as the parent, ancestor
  # or summary page would otherwise carry over into the next thread.
  def reset
    @threadSubject = nil
    @threadParent = nil
    @threadAncestor = nil
    @threadPage = nil
    @threadID = nil
    @threadSummaryPage = nil
    @threadAuthor = nil
    @threadEditStatus = nil
    @threadType = nil
    @threadSignature = nil
  end

  # Emits the collected thread record.
  def handle_content(w, p, r)
    w.lqt(threadSubject, threadParent, threadAncestor,
          threadPage, threadID, threadSummaryPage,
          threadAuthor, threadEditStatus, threadType,
          threadSignature)
  end

  def initialize
    super([ThreadSubject.new, ThreadParentGroup.new, ThreadPage.new,
           ThreadID.new, ThreadSummaryPageGroup.new, ThreadAuthor.new,
           ThreadEditStatus.new, ThreadType.new, ThreadSignature.new])
  end
end
|
599
|
+
|
600
|
+
# Optional group holding <DiscussionThreading>.
# NOTE(review): Group is defined outside this view; :min/:max presumably
# bound how often the element may occur - confirm.
class DiscussionThreadingGroup < WikiAvro::XML::Group
  def optional?
    true
  end

  def initialize
    specs = [{:element => DiscussionThreading.new, :min => 0, :max => 1}]
    super specs
  end
end
|
609
|
+
|
610
|
+
# Parses one <page> element: its header fields, its revisions and the
# optional LiquidThreads block, then reports the page to the writer.
class Page < WikiAvro::XML::Element
  attr_accessor :title, :ns, :id, :redirect, :sha1, :revision_count

  def name
    'page'
  end

  # Clears the state left over from the previous page. The instance
  # variables are assigned directly: a bare `redirect = nil` would only
  # create a local variable, so a page without <redirect> or <sha1> would
  # otherwise inherit the previous page's values.
  def reset
    @title = nil
    @ns = nil
    @id = nil
    @redirect = nil
    @sha1 = nil
    # Revisions are numbered per page, starting again from zero.
    @revision_count = 0
  end

  # Emits the page record once its content has been parsed.
  def handle_content(w, p, r)
    w.page(ns, id, title, redirect, sha1)
  end

  # Revisions are accepted both before and after the threading block.
  def initialize
    super([Title.new, Ns.new, Id.new, PageFlags.new,
           RevStream.new, DiscussionThreadingGroup.new,
           RevStream.new])
  end
end
|
642
|
+
|
643
|
+
# Stream built around a single Page parser.
class PageStream < WikiAvro::XML::Stream
  def initialize
    super [Page.new]
  end
end
|
648
|
+
|
649
|
+
# Root <mediawiki> element of a dump: site info followed by the pages.
class WikiDump < WikiAvro::XML::Element
  attr_reader :version

  def name
    'mediawiki'
  end

  protected

  # Records the dump's 'version' attribute and warns unless it is 0.6.
  def parse_attributes(w, p, r)
    @version = r['version']
    warn 'dump version != 0.6' unless @version == '0.6'
  end

  # Signals the writer that the whole dump has been processed.
  def handle_content(w, p, r)
    w.done
  end

  def initialize
    super [SiteInfo.new, PageStream.new]
  end
end
|
671
|
+
end
|
data/lib/wikiavro/xml.rb
ADDED
@@ -0,0 +1,282 @@
|
|
1
|
+
# Parser functions will assume to possibly start on their opening tag,
|
2
|
+
# and stop parsing right after their end tag. There're probably loads
|
3
|
+
# of bugs waiting for when you nest tags of the same name.
|
4
|
+
|
5
|
+
module WikiAvro::XML
|
6
|
+
# Advances +reader+ until it sits on an element boundary.
# Returns true on an opening tag and false on a closing tag; raises
# EOFError when the input runs out first.
def self.to_tag(reader)
  loop do
    kind = reader.node_type
    return true if kind == XML::Reader::TYPE_ELEMENT
    return false if kind == XML::Reader::TYPE_END_ELEMENT

    break unless reader.read
  end

  # XML::Reader will probably raise its own exception before we ever
  # could get here
  raise EOFError.new('no opening tag')
end
|
25
|
+
|
26
|
+
# Do not call this while you are on the opening tag
|
27
|
+
# Consumes the remainder of the element called +name+ and leaves the
# reader just past its closing tag. Every opening tag met on the way is
# reported through writer.skipped. Must not be called while the reader is
# still on the element's own opening tag.
def self.exit_tag(writer, reader, name)
  depth = 1

  until depth == 0
    kind = reader.node_type
    if kind == XML::Reader::TYPE_ELEMENT
      writer.skipped(reader.name)
      depth += 1 if reader.name == name
    elsif kind == XML::Reader::TYPE_END_ELEMENT
      depth -= 1 if reader.name == name
    end
    reader.read
  end
end
|
49
|
+
|
50
|
+
# Call this to skip when reader is on the opening tag
|
51
|
+
# Skips the element the reader is currently positioned on (its opening
# tag) and leaves the reader just past it. When +skipping+ is truthy,
# nested opening tags - or the element itself if it is empty - are
# reported through writer.skipped.
def self.skip_tag(writer, reader, skipping)
  target = reader.name

  if reader.empty_element?
    writer.skipped(target) if skipping
    reader.read
    return
  end

  depth = 1
  while reader.read
    kind = reader.node_type
    if kind == XML::Reader::TYPE_ELEMENT
      writer.skipped(reader.name) if skipping
      depth += 1 if reader.name == target
    elsif kind == XML::Reader::TYPE_END_ELEMENT
      depth -= 1 if reader.name == target
    end

    next unless depth == 0

    # Step past the matching closing tag before stopping.
    reader.read
    break
  end
end
|
80
|
+
|
81
|
+
# Raised when a required element is absent from the input. Derives from
# StandardError (not Exception) so that a plain `rescue` can handle it.
class MissingElement < StandardError
end
|
83
|
+
|
84
|
+
# Moves the reader forward to the next opening tag called +name+,
# skipping (and reporting) any other elements found before it. Raises
# MissingElement when a closing tag or the end of input is reached first.
def self.to_element(writer, reader, name)
  while WikiAvro::XML.to_tag(reader)
    return if reader.name == name

    WikiAvro::XML.skip_tag(writer, reader, true)

    break unless reader.read
  end

  raise MissingElement.new(name)
end
|
100
|
+
|
101
|
+
# Base class for element parsers. An instance describes one element by
# name and by the child parsers applied to its content, and is reused for
# every occurrence of that element.
class Element
  attr_reader :attr

  # Tag name handled by this parser; subclasses must override.
  def name
    raise NotImplementedError.new('name')
  end

  # Whether the enclosing element may legally lack this one.
  def optional?
    false
  end

  # Parses one occurrence of the element from +reader+, sending results to
  # +output+. +parent+ is the enclosing Element, or nil for the root, in
  # which case the reader must already be on this element.
  # Returns whatever handle_content returns.
  def parse(output, parent, reader)
    if parent.nil? && reader.name != self.name
      raise RuntimeError.new('reader.name != self.name')
    end
    WikiAvro::XML.to_element(output, reader, self.name)

    reset
    @attr = parse_attributes(output, parent, reader)
    parse_content(output, parent, reader)
    handle_content(output, parent, reader)
  end

  protected

  # Instances will be reused. Subclasses that keep state which needs
  # to be discarded after each parse should implement this.
  def reset
  end

  def parse_attributes(w, p, r)
    # no attributes parsed
  end

  # parse_content should move the reader away from the children's
  # parent's opening tag. It should leave reader positioned after
  # the closing tag.
  def parse_content(w, p, r)
    if r.empty_element?
      # An empty element is acceptable only if every child is optional.
      required = @children.find { |c| !c.optional? }
      raise MissingElement.new(required.name) if required
      r.read
      return
    end

    # Move away from our opening tag
    r.read
    @children.each { |c| c.parse(w, self, r) }

    if r.empty_element? && r.name == self.name
      r.read
    else
      WikiAvro::XML.exit_tag(w, r, self.name)
    end
  end

  def handle_content(w, p, r)
    # nothing done
  end

  private

  def initialize(children)
    @children = children
  end
end
|
176
|
+
|
177
|
+
# An Element that has no child parsers.
class Leaf < Element
  def initialize
    no_children = []
    super(no_children)
  end
end
|
182
|
+
|
183
|
+
# A Leaf that hands the element's string content to its parent by
# calling a setter on the parent object.
class Inserter < Leaf
  # Tag name this inserter handles.
  attr_reader :name

  # name   - tag name to match
  # target - attribute on the parent to assign; the setter invoked is
  #          "<target>=" (defaults to the tag name)
  def initialize(name, target=name)
    super()
    @name = name
    setter = target + '='
    @writer = setter.to_sym
  end

  # Sends the reader's string content to the parent's setter, then lets
  # WikiAvro::XML.skip_tag move the reader on.
  def parse_content(w, p, r)
    content = r.read_string
    p.send(@writer, content)
    WikiAvro::XML.skip_tag(w, r, false)
  end
end
|
199
|
+
|
200
|
+
# Parses a run of sibling elements drawn from a fixed set of element
# parsers, in any order and any number of times.
class Stream
  # elements - the parsers to dispatch to, indexed here by their #name.
  def initialize(elements)
    @elements = elements.each_with_object({}) do |element, by_name|
      by_name[element.name] = element
    end
  end

  # A stream never has to match anything.
  def optional?
    true
  end

  # Keeps dispatching the current tag to the parser registered under its
  # name, advancing the reader after each one. Stops, returning nil, at
  # the first tag with no registered parser or once
  # WikiAvro::XML.to_tag(reader) is falsy.
  def parse(output, parent, reader)
    while WikiAvro::XML.to_tag(reader)
      handler = @elements[reader.name]
      return if handler.nil?

      handler.parse(output, parent, reader)
      reader.read
    end
  end
end
|
231
|
+
|
232
|
+
# Raised by Group#parse when an element occurs more often than its :max
# allows. The message is the element name.
#
# Derives from StandardError (not Exception) so that an ordinary
# `rescue` / `rescue StandardError` is able to handle it.
class TooManyElements < StandardError
end
|
234
|
+
|
235
|
+
# Raised by Group#parse when an element occurs fewer times than its :min
# requires. The message is the element name.
#
# Derives from StandardError (not Exception) so that an ordinary
# `rescue` / `rescue StandardError` is able to handle it.
class TooFewElements < StandardError
end
|
237
|
+
|
238
|
+
# Parses a run of sibling elements drawn from a fixed set, in any order,
# while enforcing how many times each one may occur.
class Group
  # elements - an enumerable of hashes, one per permitted element:
  #            :element - the parser (responds to #name and #parse)
  #            :min     - fewest occurrences allowed
  #            :max     - most occurrences allowed
  def initialize(elements)
    @elements = {}
    @n = Hash.new(0) # occurrences seen per element name in the current parse
    elements.each do |e|
      name = e[:element].name
      @elements[name] = e
      @n[name] = 0
    end
  end

  # remember to override this if untrue, especially if it might be
  # within an empty element
  def optional?
    false
  end

  # Dispatches each tag to the parser registered under its name,
  # counting occurrences as it goes. Parsing stops at the first tag that
  # is not part of the group, or once WikiAvro::XML.to_tag(reader) is
  # falsy. In both cases the occurrence counts are validated before
  # returning, so a missing required element is reported even when the
  # group runs to the end of its parent without meeting a foreign tag.
  #
  # Returns nil.
  # Raises TooManyElements as soon as an element exceeds its :max.
  # Raises TooFewElements if, when the group ends, an element has been
  # seen fewer than :min times.
  def parse(output, parent, reader)
    # Instances are reused, so start every parse from zero.
    @n.each_key { |k| @n[k] = 0 }

    while WikiAvro::XML.to_tag(reader)
      name = reader.name
      e = @elements[name]

      # A tag outside the group ends it.
      break if e.nil?

      @n[name] += 1
      raise TooManyElements, name if @n[name] > e[:max]

      e[:element].parse(output, parent, reader)
      reader.read
    end

    verify_counts
    nil
  end

  private

  # Checks every element's occurrence count against its :min and :max.
  def verify_counts
    @elements.each do |k, v|
      raise TooFewElements, k if @n[k] < v[:min]
      # this ought to be a redundant check
      raise TooManyElements, k if @n[k] > v[:max]
    end
  end
end
|
282
|
+
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wikiavro
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Someon
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-03-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: libxml-ruby
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: avro
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.7'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.7'
|
41
|
+
description:
|
42
|
+
email: someon@openmailbox.org
|
43
|
+
executables:
|
44
|
+
- wikiavro
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- bin/wikiavro
|
49
|
+
- lib/wikiavro.rb
|
50
|
+
- lib/wikiavro/avro.rb
|
51
|
+
- lib/wikiavro/mediawiki.rb
|
52
|
+
- lib/wikiavro/xml.rb
|
53
|
+
homepage:
|
54
|
+
licenses:
|
55
|
+
- GPL-3.0+
|
56
|
+
metadata: {}
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
requirements: []
|
72
|
+
rubyforge_project:
|
73
|
+
rubygems_version: 2.2.2
|
74
|
+
signing_key:
|
75
|
+
specification_version: 4
|
76
|
+
summary: Convert MediaWiki XML dumps to Avro
|
77
|
+
test_files: []
|