wikiavro 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c64ef5f767535d9432d8f589411f76dedb6a4b04
4
+ data.tar.gz: b846af138fe797fa09e9f248bcdb3298e988b5fb
5
+ SHA512:
6
+ metadata.gz: b5f7cba59679bc913ceaddf95e97f8ed7c63d9f54660d81680db419da09abf2c61ddadd2f2a75b0e199fbfe9f710363edcd889550f76f44acdd6a34e591f69a2
7
+ data.tar.gz: 8b63a6ae13ce31cc31eb2f5698a0bd586c9645dfbce1be6ae52e45f90315af3015ed1e1f17badaa4f06425483369a369af6adefd0dae2c8ef0555d8f05193f01
data/bin/wikiavro ADDED
@@ -0,0 +1,90 @@
1
#!/usr/bin/env ruby

# Command-line entry point: parses a MediaWiki XML dump read through ARGF
# (INFILE argument or standard input) and writes namespaces, pages,
# revisions and LiquidThreads records to four separate Avro files.

require 'wikiavro'
require 'xml'
require 'optparse'

# OptionParser's built-in --version output picks up this top-level constant.
Version = WikiAvro::Version

logger = WikiAvro::MediaWiki::RevisionProgress.new 10000
overwrite = false
deflate = false
ns = nil
page = nil
rev = nil
lqt = nil

opt_parser = OptionParser.new do |opts|
  opts.banner = 'Usage: wikiavro [options] [INFILE]'

  opts.on('-q', '--quiet', "Don't report progress") do |v|
    logger = WikiAvro::MediaWiki::NoProgress.new
  end

  opts.on('-v LEVEL', '--verbose=LEVEL', Integer,
          'Report progress after every LEVEL revisions parsed') do |lvl|
    logger = WikiAvro::MediaWiki::RevisionProgress.new lvl.to_i
  end

  opts.on('-o', '--overwrite') do |o|
    overwrite = o
  end

  opts.on('-d', '--deflate', 'Enable Avro internal compression') do |d|
    deflate = d
  end

  opts.on('-n OUTFILE', '--namespaces=OUTFILE') do |path|
    ns = path
  end

  opts.on('-p OUTFILE', '--pages=OUTFILE') do |path|
    page = path
  end

  opts.on('-r OUTFILE', '--revisions=OUTFILE') do |path|
    rev = path
  end

  opts.on('-l OUTFILE', '--liquidthreads=OUTFILE') do |path|
    lqt = path
  end
end

opt_parser.parse!

# All four output files are mandatory.
abort('You must specify --namespaces') if ns.nil?
abort('You must specify --pages') if page.nil?
abort('You must specify --revisions') if rev.nil?
abort('You must specify --liquidthreads') if lqt.nil?

# Refuse to clobber existing output unless --overwrite was given.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless overwrite
  [ns, page, rev, lqt].each do |path|
    if File.exist? path
      abort("#{path} already exists! pass --overwrite to proceed anyway")
    end
  end
end

# Track every writer that was opened so that all of them get closed even
# when opening a later file or parsing the dump raises.
opened = []
begin
  opened << (ns = WikiAvro::Avro::NamespaceWriter.new(ns, deflate))
  opened << (page = WikiAvro::Avro::PageWriter.new(page, deflate))
  opened << (rev = WikiAvro::Avro::RevisionWriter.new(rev, deflate))
  opened << (lqt = WikiAvro::Avro::LqtWriter.new(lqt, deflate))

  writer = WikiAvro::MediaWiki::WikiWriter.new :logger => logger,
                                               :namespace => ns,
                                               :page => page,
                                               :revision => rev,
                                               :lqt => lqt

  mw = WikiAvro::MediaWiki::WikiDump.new

  # Advance the reader to its first node before handing it to the parser.
  xml = XML::Reader.io(ARGF)
  xml.read

  mw.parse(writer, nil, xml)
ensure
  opened.each(&:close)
end
data/lib/wikiavro.rb ADDED
@@ -0,0 +1,6 @@
1
+ module WikiAvro
2
+ Version = '0.0.1'
3
+ end
4
+
5
+ require 'wikiavro/mediawiki'
6
+ require 'wikiavro/avro'
@@ -0,0 +1,199 @@
1
+ require 'avro'
2
+
3
+ module WikiAvro::Avro
4
+ NAMESPACE_SCHEMA = <<-EOS
5
+ {
6
+ "namespace": "org.rationalwiki",
7
+ "name": "Namespace",
8
+ "type": "record",
9
+ "fields": [
10
+ {"name": "key", "type": "int"},
11
+ {"name": "case", "type": "string"},
12
+ {"name": "name", "type": "string"}
13
+ ]
14
+ }
15
+ EOS
16
+
17
+ PAGE_SCHEMA = <<-EOS
18
+ {
19
+ "namespace": "org.rationalwiki",
20
+ "name": "Page",
21
+ "type": "record",
22
+ "fields": [
23
+ {"name": "id", "type": "long"},
24
+ {"name": "ns", "type": "long"},
25
+ {"name": "title", "type": "string"},
26
+ {"name": "redirect", "type": ["null", "string"]},
27
+ {"name": "sha1", "type": ["null", "string"]}
28
+ ]
29
+ }
30
+ EOS
31
+
32
+ REVISION_SCHEMA = <<-EOS
33
+ {
34
+ "namespace": "org.rationalwiki",
35
+ "name": "Revision",
36
+ "type": "record",
37
+ "fields": [
38
+ {"name": "id", "type": "long"},
39
+ {"name": "page_id", "type": "long"},
40
+ {"name": "n", "type": "long"},
41
+ {"name": "timestamp", "type": "string"},
42
+ {"name": "contributor", "type": ["null", {
43
+ "namespace": "org.rationalwiki",
44
+ "name": "Contributor",
45
+ "type": "record",
46
+ "fields": [
47
+ {"name": "id", "type": ["null", "long"]},
48
+ {"name": "username", "type": ["null", "string"]},
49
+ {"name": "ip", "type": ["null", "string"]}
50
+ ]
51
+ }]},
52
+ {"name": "minor", "type": "boolean"},
53
+ {"name": "comment", "type": ["null", "string"]},
54
+ {"name": "bytes", "type": "long"},
55
+ {"name": "textid", "type": ["null", "string"]},
56
+ {"name": "text", "type": ["null", "string"]}
57
+ ]
58
+ }
59
+ EOS
60
+
61
+ LQT_SCHEMA = <<-EOS
62
+ {
63
+ "namespace": "org.rationalwiki",
64
+ "name": "Threading",
65
+ "type": "record",
66
+ "fields": [
67
+ {"name": "subject", "type": "string"},
68
+ {"name": "parent", "type": ["null", "long"]},
69
+ {"name": "ancestor", "type": ["null", "long"]},
70
+ {"name": "page", "type": "string"},
71
+ {"name": "id", "type": "long"},
72
+ {"name": "summary_page", "type": ["null", "string"]},
73
+ {"name": "author", "type": "string"},
74
+ {"name": "edit_status", "type": "string"},
75
+ {"name": "type", "type": "string"},
76
+ {"name": "signature", "type": ["null", "string"]}
77
+ ]
78
+ }
79
+ EOS
80
+
81
# Common base for the Avro output files. A subclass provides the schema
# text through #schema and pushes records through #encode.
class AvroWriter
  # Must be overridden to return the schema passed to Avro::DataFile.open.
  def schema
    raise NotImplementedError
  end

  # Closes the underlying Avro data file writer.
  def close
    @writer.close
  end

  protected

  # Appends +data+ to the open data file.
  def encode(data)
    @writer << data
  end

  # Opens +path+ for writing using this writer's schema. The 'deflate'
  # codec argument is only passed when +deflate+ is truthy.
  def initialize(path, deflate=false)
    open_args = [path, 'w', schema]
    open_args << 'deflate' if deflate
    @writer = Avro::DataFile.open(*open_args)
  end
end
104
+
105
# Writes namespace records using NAMESPACE_SCHEMA.
class NamespaceWriter < AvroWriter
  def schema
    NAMESPACE_SCHEMA
  end

  # Encodes one namespace. +key+ is coerced with to_i and a nil/false
  # +name+ is stored as the empty string.
  def write(key, casetype, name)
    record = {
      'key' => key.to_i,
      'case' => casetype,
      'name' => name || ''
    }
    encode record
  end
end
116
+
117
# Writes page records using PAGE_SCHEMA.
class PageWriter < AvroWriter
  def schema
    PAGE_SCHEMA
  end

  # Encodes one page. +id+ and +ns+ are coerced with to_i; the remaining
  # values are stored as given.
  def write(ns, id, title, redirect, sha1)
    record = {
      'id' => id.to_i,
      'ns' => ns.to_i,
      'title' => title,
      'redirect' => redirect,
      'sha1' => sha1
    }
    encode record
  end
end
130
+
131
# Writes revision records using REVISION_SCHEMA.
class RevisionWriter < AvroWriter
  def schema
    REVISION_SCHEMA
  end

  # Encodes one revision.
  #
  # +contributor+ is a Hash with :deleted, :id, :username and :ip keys.
  # +comment+ is nil or a Hash with :deleted and :comment keys.
  # Numeric fields (+id+, +page_id+, +n+, +bytes+) are coerced with to_i.
  # The text is dropped when +text_deleted+ is non-nil.
  #
  # Raises RuntimeError when a contributor or comment is flagged as
  # deleted but still carries content.
  def write(id, page_id, n, timestamp, contributor, minor,
            comment, text_deleted, bytes, textid, text)
    # NOTE(review): 'minor' is true when +minor+ is nil, which looks
    # inverted; kept as-is because the value the parser supplies for a
    # present <minor/> tag cannot be confirmed from this file.
    encode 'id' => id.to_i,
           'page_id' => page_id.to_i,
           'n' => n.to_i,
           'timestamp' => timestamp,
           'contributor' => contributor_record(contributor),
           'minor' => minor.nil?,
           'comment' => comment_text(comment),
           'bytes' => bytes.to_i,
           'textid' => textid,
           'text' => text_deleted.nil? ? text : nil
  end

  private

  # Builds the nested Contributor record, or nil for a deleted contributor.
  # The caller's hash is left untouched.
  def contributor_record(contributor)
    unless contributor[:deleted].nil?
      leftover = [:id, :username, :ip].any? { |k| !contributor[k].nil? }
      raise 'deleted contributor has content' if leftover
      return nil
    end

    # Keep a missing id as null (the schema allows it) instead of letting
    # nil.to_i turn it into user id 0.
    user_id = contributor[:id]
    {
      'username' => contributor[:username],
      'id' => user_id && user_id.to_i,
      'ip' => contributor[:ip]
    }
  end

  # Extracts the comment string; nil when there is no comment at all or
  # when the comment was deleted.
  def comment_text(comment)
    return nil if comment.nil?
    return comment[:comment] if comment[:deleted].nil?

    raise 'deleted comment has content' if comment[:comment]
    nil
  end
end
176
+
177
# Writes LiquidThreads records using LQT_SCHEMA.
class LqtWriter < AvroWriter
  def schema
    LQT_SCHEMA
  end

  # Encodes one thread record. +parent+ and +ancestor+ are coerced with
  # to_i only when present; +id+ is always coerced.
  def write(subject, parent, ancestor, page, id, summary_page,
            author, edit_status, type, signature)
    record = {
      'subject' => subject,
      'parent' => parent && parent.to_i,
      'ancestor' => ancestor && ancestor.to_i,
      'page' => page,
      'id' => id.to_i,
      'summary_page' => summary_page,
      'author' => author,
      'edit_status' => edit_status,
      'type' => type,
      'signature' => signature
    }
    encode record
  end
end
199
+ end
@@ -0,0 +1,671 @@
1
+ require 'wikiavro/xml'
2
+
3
+ # RW declares schema 0.6 but has redirects as in 0.7
4
+ #
5
+ # RW has <sha1> tags right after <redirect> - they are indented to a
6
+ # different level too. The <sha1>s are missing within <revision>s
7
+ # where they should be.
8
+ #
9
+ # Schema claims discussionthreadinginfo, but actual tag is
10
+ # DiscussionThreading. Schema does not describe ThreadSummaryPage or
11
+ # ThreadSignature. Schema does not describe which LQT tags are
12
+ # omissible. Schema says thread info should always come after
13
+ # revisions, but it does not.
14
+
15
+ module WikiAvro::MediaWiki
16
# Debug writer: prints each namespace to standard output.
class NamespacePrinter
  # Prints the key, the quoted name and the case type on one line.
  def write(key, casetype, name)
    line = format('namespace %s: "%s" %s', key, name, casetype)
    puts line
  end
end
21
+
22
# Debug writer: prints each page to standard output.
class PagePrinter
  # Prints the quoted title followed by id, namespace, redirect and sha1.
  def write(ns, id, title, redirect, sha1)
    line = format('page "%s": %s %s %s %s', title, id, ns, redirect, sha1)
    puts line
  end
end
27
+
28
# Debug writer: prints a one-line summary of each revision.
class RevisionPrinter
  # Only page_id, n, timestamp, bytes and the contributor's :username are
  # printed; the other arguments are accepted and ignored.
  def write(id, page_id, n, timestamp, contributor, minor,
            comment, text_deleted, bytes, textid, text)
    username = contributor[:username]
    puts format('rev %s %s: %s %s %s',
                page_id, n, timestamp, bytes, username)
  end
end
35
+
36
# Debug writer: prints a one-line summary of each LiquidThreads record.
class LqtPrinter
  # Prints subject, parent, ancestor, author, edit status and type; the
  # remaining arguments are accepted and ignored.
  def write(threadSubject, threadParent, threadAncestor,
            threadPage, threadID, threadSummaryPage,
            threadAuthor, threadEditStatus, threadType,
            threadSignature)
    puts format('thread %s %s %s %s %s %s',
                threadSubject, threadParent, threadAncestor,
                threadAuthor, threadEditStatus, threadType)
  end
end
45
+
46
# Writer that silently accepts any call and returns nil. Used as the
# default sink when no real writer is supplied.
class NullWriter
  # Every unknown method is a no-op returning nil.
  def method_missing(target, *args, &block)
    nil
  end

  # Keep respond_to? consistent with method_missing: this object answers
  # every message.
  def respond_to_missing?(name, include_private = false)
    true
  end
end
54
+
55
# Fans parser events out to the per-record writers and the progress logger.
class WikiWriter
  # +writers+ is a Hash with optional :namespace, :logger, :page,
  # :revision and :lqt entries. A missing namespace writer defaults to
  # NamespacePrinter, a missing logger to NoProgress, and the remaining
  # writers to NullWriter.
  def initialize(writers)
    @namespace = writers[:namespace] || NamespacePrinter.new
    @logger = writers[:logger] || NoProgress.new
    @page = writers[:page] || NullWriter.new
    @revision = writers[:revision] || NullWriter.new
    @lqt = writers[:lqt] || NullWriter.new
  end

  # Forwards a namespace to the namespace writer.
  def namespace(key, casetype, name)
    @namespace.write(key, casetype, name)
  end

  # Counts one page with the logger, then forwards it to the page writer.
  def page(ns, id, title, redirect, sha1)
    @logger.report_pages(1)
    @page.write(ns, id, title, redirect, sha1)
  end

  # Counts one revision with the logger, then forwards it to the
  # revision writer.
  def revision(id, page_id, n, timestamp, contributor, minor,
               comment, text_deleted, bytes, textid, text)
    @logger.report_revisions(1)
    @revision.write(id, page_id, n, timestamp, contributor, minor,
                    comment, text_deleted, bytes, textid, text)
  end

  # Forwards a LiquidThreads record to the lqt writer.
  def lqt(threadSubject, threadParent, threadAncestor,
          threadPage, threadID, threadSummaryPage,
          threadAuthor, threadEditStatus, threadType,
          threadSignature)
    @lqt.write(threadSubject, threadParent, threadAncestor,
               threadPage, threadID, threadSummaryPage,
               threadAuthor, threadEditStatus, threadType,
               threadSignature)
  end

  # Tells the logger that parsing has finished.
  def done
    @logger.report_done
  end

  # Tells the logger that an element called +name+ was skipped.
  def skipped(name)
    @logger.report_skipped_element(name)
  end
end
101
+
102
+ class NoProgress
103
+ def report_pages(n)
104
+ end
105
+
106
+ def report_revisions(n)
107
+ end
108
+
109
+ def report_done
110
+ end
111
+
112
+ def report_skipped_element(name)
113
+ end
114
+ end
115
+
116
# Progress logger that stays quiet while parsing and prints a summary
# (duration, average revisions per second, skipped elements) at the end.
class FinalProgress
  def initialize
    @start_time = Time.now
    @pages = 0
    @revisions = 0
    # Skipped-element counters, keyed by element name, defaulting to 0.
    @skipped_counts = Hash.new 0
  end

  # Formats +n+ with comma thousands separators, e.g. 1234567 ->
  # "1,234,567". Always returns a String; values below 1 are returned via
  # to_s unchanged (previously an Integer was returned for them).
  def f(n)
    groups = []
    while n >= 1
      groups.unshift(n % 1000)
      n /= 1000
    end
    return n.to_s if groups.empty?

    head = groups.shift
    # Only the groups after the first are zero-padded to three digits.
    ([head.to_s] + groups.map { |g| sprintf('%03d', g) }).join(',')
  end

  # Total number of skipped elements across all names.
  def total_skipped
    @skipped_counts.values.reduce(0, :+)
  end

  def report_pages(n)
    @pages += n
  end

  def report_revisions(n)
    @revisions += n
  end

  def report_skipped_element(name)
    @skipped_counts[name] += 1
  end

  # Prints one "name: count" line per skipped element name.
  def show_skipped
    @skipped_counts.each do |name, count|
      puts "#{name}: #{count}"
    end
  end

  # Prints the final summary to standard output.
  def report_done
    duration = Time.now - @start_time
    # Guard against a zero duration: dividing by 0.0 gives NaN/Infinity
    # and rounding that raises FloatDomainError.
    avg_rate = duration > 0 ? (@revisions / duration).round(0) : 0
    h = (duration / 60 / 60).floor
    m = (duration % (60 * 60) / 60).floor
    s = (duration % 60).floor
    # FIXME: Print to STDERR or some log
    skipped = total_skipped
    if skipped > 0
      puts "Couldn't process #{skipped} elements! Detailed breakdown:"
      show_skipped
    end
    puts "Done! Took #{h}h#{m}m#{s}s. Averaged #{f avg_rate} rps."
  end
end
179
+
180
# Progress logger that additionally prints a status line every
# +interval+ revisions.
class RevisionProgress < FinalProgress
  # +interval+ is the number of revisions between progress announcements.
  def initialize(interval)
    super()
    @interval = interval
    @previous_time = @start_time
    @previous_revisions = 0
  end

  # Prints page/revision counts, the revision rate since the previous
  # announcement and the skipped-element breakdown, then starts a new
  # measurement window.
  def announce_progress
    now = Time.now
    elapsed = now - @previous_time
    recent = @revisions - @previous_revisions
    # Guard against a zero-length window: dividing by 0.0 gives
    # NaN/Infinity and rounding that raises FloatDomainError.
    rps = elapsed > 0 ? (recent / elapsed).round(0) : 0
    puts "Page #{f @pages}, rev #{f @revisions} (#{f rps} rps)"
    skipped = total_skipped
    puts "#{f skipped} unprocessable elements so far."
    show_skipped
    @previous_time = now
    @previous_revisions = @revisions
  end

  # Counts the revisions and announces progress once at least +interval+
  # revisions have accumulated since the last announcement.
  def report_revisions(n)
    super(n)

    if @revisions - @previous_revisions >= @interval
      announce_progress
    end
  end

  # Prints one last progress line before the final summary.
  def report_done
    announce_progress
    super
  end
end
212
+
213
+ class Namespace < WikiAvro::XML::Leaf
214
+ def name
215
+ 'namespace'
216
+ end
217
+
218
+ def reset
219
+ # everything is overwritten each cycle anyway
220
+ end
221
+
222
+ def parse_attributes(w, p, r)
223
+ @key = r['key']
224
+ @case = r['case']
225
+ end
226
+
227
+ def parse_content(w, p, r)
228
+ name = r.read_string
229
+ WikiAvro::XML.skip_tag(w, r, false)
230
+ w.namespace(@key, @case, name)
231
+ end
232
+ end
233
+
234
+ class NamespaceStream < WikiAvro::XML::Stream
235
+ def initialize
236
+ super([Namespace.new])
237
+ end
238
+ end
239
+
240
+ class Sitename < WikiAvro::XML::Inserter
241
+ def initialize
242
+ super('sitename')
243
+ end
244
+ end
245
+
246
+ class Base < WikiAvro::XML::Inserter
247
+ def initialize
248
+ super('base')
249
+ end
250
+ end
251
+
252
+ class Generator < WikiAvro::XML::Inserter
253
+ def initialize
254
+ super('generator')
255
+ end
256
+ end
257
+
258
+ class Case < WikiAvro::XML::Inserter
259
+ def initialize
260
+ super('case')
261
+ end
262
+ end
263
+
264
+ class Namespaces < WikiAvro::XML::Element
265
+ def name
266
+ 'namespaces'
267
+ end
268
+
269
+ def initialize
270
+ super([NamespaceStream.new])
271
+ end
272
+ end
273
+
274
+ class SiteInfo < WikiAvro::XML::Element
275
+ attr_accessor :sitename
276
+ attr_accessor :base
277
+ attr_accessor :generator
278
+ attr_accessor :case
279
+
280
+ def name
281
+ 'siteinfo'
282
+ end
283
+
284
+ def reset
285
+ @sitename = nil
286
+ @base = nil
287
+ @generator = nil
288
+ @case = nil
289
+ end
290
+
291
+ def initialize
292
+ super([Sitename.new, Base.new, Generator.new,
293
+ Case.new, Namespaces.new])
294
+ end
295
+ end
296
+
297
+ class Title < WikiAvro::XML::Inserter
298
+ def initialize
299
+ super('title')
300
+ end
301
+ end
302
+
303
+ class Ns < WikiAvro::XML::Inserter
304
+ def initialize
305
+ super('ns')
306
+ end
307
+ end
308
+
309
+ class Id < WikiAvro::XML::Inserter
310
+ def initialize
311
+ super('id')
312
+ end
313
+ end
314
+
315
+ class Redirect < WikiAvro::XML::Leaf
316
+ def name
317
+ 'redirect'
318
+ end
319
+
320
+ def parse_attributes(w, p, r)
321
+ # puts "redirect: #{r['title']}"
322
+ p.redirect = r['title']
323
+ end
324
+ end
325
+
326
+ class Sha1 < WikiAvro::XML::Inserter
327
+ def initialize
328
+ super('sha1')
329
+ end
330
+ end
331
+
332
+ class PageFlags < WikiAvro::XML::Group
333
+ def initialize
334
+ super [{:element => Redirect.new, :min => 0, :max => 1},
335
+ {:element => Sha1.new, :min => 0, :max => 1}]
336
+ end
337
+ end
338
+
339
+ class Timestamp < WikiAvro::XML::Inserter
340
+ def initialize
341
+ super('timestamp')
342
+ end
343
+ end
344
+
345
+ class Username < WikiAvro::XML::Inserter
346
+ def initialize
347
+ super('username')
348
+ end
349
+ end
350
+
351
+ class Ip < WikiAvro::XML::Inserter
352
+ def initialize
353
+ super('ip')
354
+ end
355
+ end
356
+
357
+ class ContributorGroup < WikiAvro::XML::Group
358
+ def optional?
359
+ true
360
+ end
361
+
362
+ def initialize
363
+ super [{:element => Username.new, :min => 0, :max => 1},
364
+ {:element => Id.new, :min => 0, :max => 1},
365
+ {:element => Ip.new, :min => 0, :max => 1}]
366
+ end
367
+ end
368
+
369
+ class Contributor < WikiAvro::XML::Element
370
+ def name
371
+ 'contributor'
372
+ end
373
+
374
+ attr_accessor :id
375
+ attr_accessor :username
376
+ attr_accessor :ip
377
+
378
+ def reset
379
+ @id = nil
380
+ @username = nil
381
+ @ip = nil
382
+ @deleted = nil
383
+ end
384
+
385
+ def parse_attributes(w, p, r)
386
+ @deleted = r['deleted']
387
+ end
388
+
389
+ def handle_content(w, p, r)
390
+ p.contributor = {:deleted => @deleted, :id => id,
391
+ :username => username, :ip => ip}
392
+ end
393
+
394
+ def initialize
395
+ super([ContributorGroup.new])
396
+ end
397
+ end
398
+
399
+ class Minor < WikiAvro::XML::Inserter
400
+ def initialize
401
+ super('minor')
402
+ end
403
+ end
404
+
405
+ class Comment < WikiAvro::XML::Leaf
406
+ def name
407
+ 'comment'
408
+ end
409
+
410
+ def parse_attributes(w, p, r)
411
+ deleted = r['deleted']
412
+ comment = r.read_string
413
+ p.comment = {:deleted => deleted,
414
+ :comment => comment}
415
+ end
416
+ end
417
+
418
+ class RevisionFlags < WikiAvro::XML::Group
419
+ def initialize
420
+ super [{:element => Minor.new, :min => 0, :max => 1},
421
+ {:element => Comment.new, :min => 0, :max => 1}]
422
+ end
423
+ end
424
+
425
+ class Text < WikiAvro::XML::Inserter
426
+ def parse_attributes(w, p, r)
427
+ p.text_deleted = r['deleted']
428
+ p.textid = r['id']
429
+ p.bytes = r['bytes']
430
+ end
431
+
432
+ def initialize
433
+ super('text')
434
+ end
435
+ end
436
+
437
# Parses one <revision> element and reports it to the writer together
# with its 1-based position within the enclosing page.
class Revision < WikiAvro::XML::Element
  attr_accessor :id
  attr_accessor :timestamp
  attr_accessor :contributor
  attr_accessor :minor
  attr_accessor :comment
  attr_accessor :text_deleted
  attr_accessor :bytes
  attr_accessor :textid
  attr_accessor :text

  def name
    'revision'
  end

  # Clears per-revision state. This instance is reused for every
  # revision, so the instance variables themselves must be reset: plain
  # `id = nil` only creates a local variable and lets values from the
  # previous revision (e.g. its comment or minor flag) leak into the
  # next one.
  def reset
    @id = nil
    @timestamp = nil
    @contributor = nil
    @minor = nil
    # Same shape as what the Comment element assigns, so writers that
    # index into the comment hash keep working when a revision has no
    # <comment> tag.
    @comment = {:deleted => nil, :comment => nil}
    @text_deleted = nil
    @bytes = nil
    @textid = nil
    @text = nil
  end

  # Bumps the parent page's revision counter and hands the collected
  # fields to the writer. +p+ is the enclosing page element.
  def handle_content(w, p, r)
    p.revision_count += 1
    n = p.revision_count
    w.revision(id, p.id, n, timestamp, contributor, minor,
               comment, text_deleted, bytes, textid, text)
  end

  def initialize
    super([Id.new, Timestamp.new, Contributor.new,
           RevisionFlags.new, Text.new])
  end
end
476
+
477
+ class RevStream < WikiAvro::XML::Stream
478
+ def initialize
479
+ super([Revision.new])
480
+ end
481
+ end
482
+
483
+ class ThreadSubject < WikiAvro::XML::Inserter
484
+ def initialize
485
+ super('ThreadSubject', 'threadSubject')
486
+ end
487
+ end
488
+
489
+ class ThreadParent < WikiAvro::XML::Inserter
490
+ def initialize
491
+ super('ThreadParent', 'threadParent')
492
+ end
493
+ end
494
+
495
+ class ThreadAncestor < WikiAvro::XML::Inserter
496
+ def initialize
497
+ super('ThreadAncestor', 'threadAncestor')
498
+ end
499
+ end
500
+
501
+ class ThreadParentGroup < WikiAvro::XML::Group
502
+ def optional?
503
+ true
504
+ end
505
+
506
+ def initialize
507
+ super [{:element => ThreadParent.new, :min => 0, :max => 1},
508
+ {:element => ThreadAncestor.new, :min => 0, :max => 1}]
509
+ end
510
+ end
511
+
512
+ class ThreadPage < WikiAvro::XML::Inserter
513
+ def initialize
514
+ super('ThreadPage', 'threadPage')
515
+ end
516
+ end
517
+
518
+ class ThreadID < WikiAvro::XML::Inserter
519
+ def initialize
520
+ super('ThreadID', 'threadID')
521
+ end
522
+ end
523
+
524
+ class ThreadSummaryPage < WikiAvro::XML::Inserter
525
+ def initialize
526
+ super('ThreadSummaryPage', 'threadSummaryPage')
527
+ end
528
+ end
529
+
530
+ class ThreadSummaryPageGroup < WikiAvro::XML::Group
531
+ def optional?
532
+ true
533
+ end
534
+
535
+ def initialize
536
+ super [{:element => ThreadSummaryPage.new, :min => 0, :max => 1}]
537
+ end
538
+ end
539
+
540
+ class ThreadAuthor < WikiAvro::XML::Inserter
541
+ def initialize
542
+ super('ThreadAuthor', 'threadAuthor')
543
+ end
544
+ end
545
+
546
+ class ThreadEditStatus < WikiAvro::XML::Inserter
547
+ def initialize
548
+ super('ThreadEditStatus', 'threadEditStatus')
549
+ end
550
+ end
551
+
552
+ class ThreadType < WikiAvro::XML::Inserter
553
+ def initialize
554
+ super('ThreadType', 'threadType')
555
+ end
556
+ end
557
+
558
+ class ThreadSignature < WikiAvro::XML::Inserter
559
+ def initialize
560
+ super('ThreadSignature', 'threadSignature')
561
+ end
562
+ end
563
+
564
# Parses one <DiscussionThreading> element (LiquidThreads metadata) and
# reports it to the writer.
class DiscussionThreading < WikiAvro::XML::Element
  attr_accessor :threadSubject, :threadParent, :threadAncestor,
                :threadPage, :threadID, :threadSummaryPage,
                :threadAuthor, :threadEditStatus, :threadType,
                :threadSignature

  def name
    'DiscussionThreading'
  end

  # Clears per-thread state. The instance is reused, so the instance
  # variables must be reset: `threadParent = nil` only creates a local
  # variable and lets optional fields (parent, ancestor, summary page)
  # leak from the previous thread.
  def reset
    @threadSubject = nil
    @threadParent = nil
    @threadAncestor = nil
    @threadPage = nil
    @threadID = nil
    @threadSummaryPage = nil
    @threadAuthor = nil
    @threadEditStatus = nil
    @threadType = nil
    @threadSignature = nil
  end

  # Hands the collected thread fields to the writer.
  def handle_content(w, p, r)
    w.lqt(threadSubject, threadParent, threadAncestor,
          threadPage, threadID, threadSummaryPage,
          threadAuthor, threadEditStatus, threadType,
          threadSignature)
  end

  def initialize
    super([ThreadSubject.new, ThreadParentGroup.new, ThreadPage.new,
           ThreadID.new, ThreadSummaryPageGroup.new, ThreadAuthor.new,
           ThreadEditStatus.new, ThreadType.new, ThreadSignature.new])
  end
end
599
+
600
+ class DiscussionThreadingGroup < WikiAvro::XML::Group
601
+ def optional?
602
+ true
603
+ end
604
+
605
+ def initialize
606
+ super [{:element => DiscussionThreading.new, :min => 0, :max => 1}]
607
+ end
608
+ end
609
+
610
# Parses one <page> element, including its revisions and optional
# LiquidThreads metadata, and reports the page to the writer.
class Page < WikiAvro::XML::Element
  attr_accessor :title
  attr_accessor :ns
  attr_accessor :id
  attr_accessor :redirect
  attr_accessor :sha1
  attr_accessor :revision_count

  def name
    'page'
  end

  # Clears per-page state. The instance is reused for every page, so the
  # instance variables must be reset: `redirect = nil` only creates a
  # local variable, which would mark every page after the first redirect
  # as a redirect too (and likewise carry over sha1).
  def reset
    @title = nil
    @ns = nil
    @id = nil
    @redirect = nil
    @sha1 = nil
    @revision_count = 0
  end

  # Hands the collected page fields to the writer. Runs after the page
  # content, and therefore its revisions, has been parsed.
  def handle_content(w, p, r)
    w.page(ns, id, title, redirect, sha1)
  end

  # Revisions may appear both before and after the thread metadata, hence
  # the two RevStream children.
  def initialize
    super([Title.new, Ns.new, Id.new, PageFlags.new,
           RevStream.new, DiscussionThreadingGroup.new,
           RevStream.new])
  end
end
642
+
643
+ class PageStream < WikiAvro::XML::Stream
644
+ def initialize
645
+ super([Page.new])
646
+ end
647
+ end
648
+
649
+ class WikiDump < WikiAvro::XML::Element
650
+ attr_reader :version
651
+
652
+ def name
653
+ 'mediawiki'
654
+ end
655
+
656
+ protected
657
+
658
+ def parse_attributes(w, p, r)
659
+ @version = r['version']
660
+ warn 'dump version != 0.6' if @version != '0.6'
661
+ end
662
+
663
+ def handle_content(w, p, r)
664
+ w.done
665
+ end
666
+
667
+ def initialize
668
+ super([SiteInfo.new, PageStream.new])
669
+ end
670
+ end
671
+ end
@@ -0,0 +1,282 @@
1
+ # Parser functions will assume to possibly start on their opening tag,
2
+ # and stop parsing right after their end tag. There're probably loads
3
+ # of bugs waiting for when you nest tags of the same name.
4
+
5
+ module WikiAvro::XML
6
# Advances +reader+ until it sits on an element boundary. Returns true
# when stopped on an opening tag and false when stopped on a closing
# tag; the current node is examined before anything is read.
# Raises EOFError when the input ends first.
def self.to_tag(reader)
  loop do
    kind = reader.node_type
    return true if kind == XML::Reader::TYPE_ELEMENT
    return false if kind == XML::Reader::TYPE_END_ELEMENT

    break unless reader.read
  end

  # XML::Reader will probably raise its own exception before we ever
  # could get here
  raise EOFError.new('no opening tag')
end
25
+
26
+ # Do not call this while you are on the opening tag
27
# Leaves the element called +name+ that the reader is currently inside,
# stopping on the node after its closing tag. Every element passed on the
# way is reported through writer.skipped. Nested elements with the same
# name are tracked with a depth counter.
# Raises EOFError if the input ends before the closing tag is seen.
def self.exit_tag(writer, reader, name)
  nest = 1

  loop do
    case reader.node_type
    when XML::Reader::TYPE_ELEMENT
      writer.skipped(reader.name)
      nest += 1 if reader.name == name
    when XML::Reader::TYPE_END_ELEMENT
      nest -= 1 if reader.name == name
    end

    more = reader.read
    # A failed read is fine once our closing tag has been consumed (for
    # example the document's last tag), so test the depth first.
    break if nest == 0
    # Without this the loop would spin forever on the last node.
    raise EOFError.new("no closing tag for #{name}") unless more
  end
end
49
+
50
+ # Call this to skip when reader is on the opening tag
51
# Skips the element whose opening tag the reader is on, leaving the
# reader on the node after its closing tag. When +skipping+ is truthy the
# element and every element nested inside it are reported through
# writer.skipped.
def self.skip_tag(writer, reader, skipping)
  name = reader.name

  # An empty element has no separate closing tag: one read moves past it.
  if reader.empty_element?
    writer.skipped(name) if skipping
    reader.read
    return
  end

  depth = 1
  while reader.read
    kind = reader.node_type
    if kind == XML::Reader::TYPE_ELEMENT
      writer.skipped(reader.name) if skipping
      depth += 1 if reader.name == name
    elsif kind == XML::Reader::TYPE_END_ELEMENT
      depth -= 1 if reader.name == name
    end

    next unless depth == 0

    # Step off the closing tag before returning.
    reader.read
    break
  end
end
80
+
81
# Raised when a required element is not found. Derives from StandardError
# (not Exception) so that a plain `rescue` can handle it.
class MissingElement < StandardError
end
83
+
84
# Advances +reader+ to the next opening tag called +name+ at the current
# nesting level, skipping (and reporting as skipped) any other elements
# on the way.
# Raises MissingElement when a closing tag is reached first.
def self.to_element(writer, reader, name)
  while WikiAvro::XML::to_tag(reader)
    return if reader.name == name

    # skip_tag already leaves the reader on the node after the skipped
    # element's closing tag. Reading once more here would drop that node,
    # which loses an element whenever no whitespace separates two tags.
    WikiAvro::XML.skip_tag(writer, reader, true)
  end

  raise MissingElement.new(name)
end
100
+
101
# Base class for parsers of a single named element with an ordered list
# of child parsers. Subclasses override #name and, as needed, #reset,
# #parse_attributes, #parse_content and #handle_content.
class Element
  # Whatever parse_attributes returned during the most recent parse.
  attr_reader :attr

  # Tag name handled by this parser; must be overridden.
  def name
    raise NotImplementedError.new('name')
  end

  # Whether a parent may legitimately lack this element.
  def optional?
    false
  end

  # Parses one occurrence of this element. With a nil +parent+ the reader
  # must already be on this element; otherwise the reader is advanced to
  # it first.
  def parse(output, parent, reader)
    wrong_root = parent.nil? && reader.name != name
    raise RuntimeError.new('reader.name != self.name') if wrong_root

    WikiAvro::XML::to_element(output, reader, name)

    reset
    @attr = parse_attributes(output, parent, reader)
    parse_content(output, parent, reader)
    handle_content(output, parent, reader)
  end

  protected

  # Instances will be reused. Subclasses that keep state which needs
  # to be discarded after each parse should implement this.
  def reset
  end

  def parse_attributes(w, p, r)
    # no attributes parsed
  end

  # parse_content should move the reader away from the children's
  # parent's opening tag. It should leave reader positioned after
  # the closing tag.
  def parse_content(w, p, r)
    if r.empty_element?
      # An empty element cannot contain any required child.
      required = @children.find { |c| !c.optional? }
      raise MissingElement.new(required.name) if required
      r.read
      return
    end

    # Move away from our opening tag
    r.read
    @children.each { |child| child.parse(w, self, r) }

    if r.empty_element? && r.name == name
      r.read
    else
      WikiAvro::XML.exit_tag(w, r, name)
    end
  end

  def handle_content(w, p, r)
    # nothing done
  end

  private

  # +children+ is the ordered list of child parsers.
  def initialize(children)
    @children = children
  end
end
176
+
177
+ class Leaf < Element
178
+ def initialize
179
+ super([])
180
+ end
181
+ end
182
+
183
+ class Inserter < Leaf
184
+ attr_reader :name
185
+
186
+ def parse_content(w, p, r)
187
+ p.send(@writer, r.read_string)
188
+ # puts "inserter: exiting #{@name}"
189
+ WikiAvro::XML.skip_tag(w, r, false)
190
+ # puts "exited"
191
+ end
192
+
193
+ def initialize(name, target=name)
194
+ super()
195
+ @name = name
196
+ @writer = (target + '=').to_sym
197
+ end
198
+ end
199
+
200
# Parses a run of zero or more elements, each handled by the parser
# registered under its tag name. The run ends at the first unknown
# opening tag or at the parent's closing tag.
class Stream
  def optional?
    true
  end

  def parse(output, parent, reader)
    while WikiAvro::XML::to_tag(reader)
      e = @elements[reader.name]
      return if e.nil?

      # Element parsers leave the reader on the node after their closing
      # tag. Reading again here would drop that node, which loses an
      # element whenever no whitespace separates two tags.
      e.parse(output, parent, reader)
    end
  end

  private

  # +elements+ is a list of parsers, indexed here by their #name.
  def initialize(elements)
    @elements = {}
    elements.each { |e| @elements[e.name] = e }
  end
end
231
+
232
# Raised when an element occurs more often than its group allows. Derives
# from StandardError (not Exception) so that a plain `rescue` can handle it.
class TooManyElements < StandardError
end
234
+
235
# Raised when an element occurs less often than its group requires.
# Derives from StandardError (not Exception) so that a plain `rescue` can
# handle it.
class TooFewElements < StandardError
end
237
+
238
# Parses a set of elements that may appear in any order, each between
# :min and :max times. The group ends at the first unknown opening tag or
# at the parent's closing tag.
class Group
  # +elements+ is a list of {:element => parser, :min => n, :max => n}
  # hashes, indexed here by the parser's #name.
  def initialize(elements)
    @elements = {}
    @n = Hash.new 0
    elements.each do |e|
      name = e[:element].name
      @elements[name] = e
      @n[name] = 0
    end
  end

  # remember to override this if untrue, especially if it might be
  # within an empty element
  def optional?
    false
  end

  # Raises TooManyElements as soon as an element exceeds its :max and
  # TooFewElements when the group ends with an element below its :min.
  def parse(output, parent, reader)
    @n.keys.each {|k| @n[k] = 0}

    while WikiAvro::XML::to_tag(reader)
      name = reader.name
      e = @elements[name]
      break if e.nil?

      @n[name] += 1
      raise TooManyElements.new(name) if @n[name] > e[:max]
      # Element parsers leave the reader on the node after their closing
      # tag. Reading again here would drop that node, which loses an
      # element whenever no whitespace separates two tags.
      e[:element].parse(output, parent, reader)
    end

    # Check the minimums on both exits: an unknown tag and the parent's
    # closing tag.
    @elements.each do |k, v|
      raise TooFewElements.new(k) if @n[k] < v[:min]
    end
    nil
  end
end
282
+ end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wikiavro
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Someon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: libxml-ruby
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: avro
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.7'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.7'
41
+ description:
42
+ email: someon@openmailbox.org
43
+ executables:
44
+ - wikiavro
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - bin/wikiavro
49
+ - lib/wikiavro.rb
50
+ - lib/wikiavro/avro.rb
51
+ - lib/wikiavro/mediawiki.rb
52
+ - lib/wikiavro/xml.rb
53
+ homepage:
54
+ licenses:
55
+ - GPL-3.0+
56
+ metadata: {}
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ requirements: []
72
+ rubyforge_project:
73
+ rubygems_version: 2.2.2
74
+ signing_key:
75
+ specification_version: 4
76
+ summary: Convert MediaWiki XML dumps to Avro
77
+ test_files: []