wikiavro 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c64ef5f767535d9432d8f589411f76dedb6a4b04
4
+ data.tar.gz: b846af138fe797fa09e9f248bcdb3298e988b5fb
5
+ SHA512:
6
+ metadata.gz: b5f7cba59679bc913ceaddf95e97f8ed7c63d9f54660d81680db419da09abf2c61ddadd2f2a75b0e199fbfe9f710363edcd889550f76f44acdd6a34e591f69a2
7
+ data.tar.gz: 8b63a6ae13ce31cc31eb2f5698a0bd586c9645dfbce1be6ae52e45f90315af3015ed1e1f17badaa4f06425483369a369af6adefd0dae2c8ef0555d8f05193f01
data/bin/wikiavro ADDED
@@ -0,0 +1,90 @@
1
#!/usr/bin/env ruby

# Command-line front end: parses a MediaWiki XML dump read through ARGF
# (INFILE argument or standard input) and writes namespaces, pages, revisions
# and LiquidThreads records to four separate Avro files.

require 'wikiavro'
require 'xml'
require 'optparse'

# NOTE(review): presumably exposed at top level so OptionParser's built-in
# --version switch can report it -- confirm.
Version = WikiAvro::Version

# Defaults; each may be overridden by a command-line switch below.
logger = WikiAvro::MediaWiki::RevisionProgress.new 10000
overwrite = false
deflate = false
ns = nil
page = nil
rev = nil
lqt = nil

opt_parser = OptionParser.new do |opts|
  opts.banner = 'Usage: wikiavro [options] [INFILE]'

  opts.on('-q', '--quiet', "Don't report progress") do
    logger = WikiAvro::MediaWiki::NoProgress.new
  end

  opts.on('-v LEVEL', '--verbose=LEVEL', Integer,
          'Report progress after every LEVEL revisions parsed') do |lvl|
    logger = WikiAvro::MediaWiki::RevisionProgress.new lvl
  end

  opts.on('-o', '--overwrite') do |o|
    overwrite = o
  end

  opts.on('-d', '--deflate', 'Enable Avro internal compression') do |d|
    deflate = d
  end

  opts.on('-n OUTFILE', '--namespaces=OUTFILE') do |path|
    ns = path
  end

  opts.on('-p OUTFILE', '--pages=OUTFILE') do |path|
    page = path
  end

  opts.on('-r OUTFILE', '--revisions=OUTFILE') do |path|
    rev = path
  end

  opts.on('-l OUTFILE', '--liquidthreads=OUTFILE') do |path|
    lqt = path
  end
end

opt_parser.parse!

# All four output files are mandatory.
abort('You must specify --namespaces') if ns.nil?
abort('You must specify --pages') if page.nil?
abort('You must specify --revisions') if rev.nil?
abort('You must specify --liquidthreads') if lqt.nil?

unless overwrite
  [ns, page, rev, lqt].each do |path|
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
    if File.exist? path
      abort("#{path} already exists! pass --overwrite to proceed anyway")
    end
  end
end

# Track every writer that was opened so that all of them are closed, in
# opening order, even when opening a later file or parsing the dump raises.
opened = []
begin
  ns = WikiAvro::Avro::NamespaceWriter.new ns, deflate
  opened << ns
  page = WikiAvro::Avro::PageWriter.new page, deflate
  opened << page
  rev = WikiAvro::Avro::RevisionWriter.new rev, deflate
  opened << rev
  lqt = WikiAvro::Avro::LqtWriter.new lqt, deflate
  opened << lqt

  writer = WikiAvro::MediaWiki::WikiWriter.new :logger => logger,
                                               :namespace => ns,
                                               :page => page,
                                               :revision => rev,
                                               :lqt => lqt

  mw = WikiAvro::MediaWiki::WikiDump.new

  xml = XML::Reader.io(ARGF)
  # Advance to the first node so the parser starts on the root element.
  xml.read

  mw.parse(writer, nil, xml)
ensure
  opened.each { |w| w.close }
end
data/lib/wikiavro.rb ADDED
@@ -0,0 +1,6 @@
1
# Root namespace of the gem. Version holds the release number as a string.
module WikiAvro
  Version = '0.0.1'
end
4
+
5
+ require 'wikiavro/mediawiki'
6
+ require 'wikiavro/avro'
@@ -0,0 +1,199 @@
1
+ require 'avro'
2
+
3
+ module WikiAvro::Avro
4
# Avro record schema (as JSON text) for one wiki namespace; returned by
# NamespaceWriter#schema.
NAMESPACE_SCHEMA = <<-EOS
  {
    "namespace": "org.rationalwiki",
    "name": "Namespace",
    "type": "record",
    "fields": [
      {"name": "key", "type": "int"},
      {"name": "case", "type": "string"},
      {"name": "name", "type": "string"}
    ]
  }
EOS
16
+
17
# Avro record schema (as JSON text) for one wiki page; returned by
# PageWriter#schema. redirect and sha1 are nullable.
PAGE_SCHEMA = <<-EOS
  {
    "namespace": "org.rationalwiki",
    "name": "Page",
    "type": "record",
    "fields": [
      {"name": "id", "type": "long"},
      {"name": "ns", "type": "long"},
      {"name": "title", "type": "string"},
      {"name": "redirect", "type": ["null", "string"]},
      {"name": "sha1", "type": ["null", "string"]}
    ]
  }
EOS
31
+
32
# Avro record schema (as JSON text) for one page revision; returned by
# RevisionWriter#schema. The contributor is a nullable nested record whose
# own fields are all nullable.
REVISION_SCHEMA = <<-EOS
  {
    "namespace": "org.rationalwiki",
    "name": "Revision",
    "type": "record",
    "fields": [
      {"name": "id", "type": "long"},
      {"name": "page_id", "type": "long"},
      {"name": "n", "type": "long"},
      {"name": "timestamp", "type": "string"},
      {"name": "contributor", "type": ["null", {
        "namespace": "org.rationalwiki",
        "name": "Contributor",
        "type": "record",
        "fields": [
          {"name": "id", "type": ["null", "long"]},
          {"name": "username", "type": ["null", "string"]},
          {"name": "ip", "type": ["null", "string"]}
        ]
      }]},
      {"name": "minor", "type": "boolean"},
      {"name": "comment", "type": ["null", "string"]},
      {"name": "bytes", "type": "long"},
      {"name": "textid", "type": ["null", "string"]},
      {"name": "text", "type": ["null", "string"]}
    ]
  }
EOS
60
+
61
# Avro record schema (as JSON text) for one LiquidThreads thread record;
# returned by LqtWriter#schema.
LQT_SCHEMA = <<-EOS
  {
    "namespace": "org.rationalwiki",
    "name": "Threading",
    "type": "record",
    "fields": [
      {"name": "subject", "type": "string"},
      {"name": "parent", "type": ["null", "long"]},
      {"name": "ancestor", "type": ["null", "long"]},
      {"name": "page", "type": "string"},
      {"name": "id", "type": "long"},
      {"name": "summary_page", "type": ["null", "string"]},
      {"name": "author", "type": "string"},
      {"name": "edit_status", "type": "string"},
      {"name": "type", "type": "string"},
      {"name": "signature", "type": ["null", "string"]}
    ]
  }
EOS
80
+
81
# Base class for the Avro output files. A subclass supplies the JSON schema
# through #schema and feeds records to the underlying data file via #encode.
class AvroWriter
  # Opens +path+ for writing with the subclass's schema; when +deflate+ is
  # truthy the 'deflate' codec is passed along as a fourth argument.
  def initialize(path, deflate=false)
    open_args = [path, 'w', schema]
    open_args << 'deflate' if deflate
    @writer = ::Avro::DataFile.open(*open_args)
  end

  # Schema text for the file; must be overridden.
  def schema
    raise NotImplementedError
  end

  # Closes the underlying data file.
  def close
    @writer.close
  end

  protected

  # Appends one record (a Hash keyed by schema field name) to the file.
  def encode(data)
    @writer << data
  end
end
104
+
105
# Writes namespace records using NAMESPACE_SCHEMA.
class NamespaceWriter < AvroWriter
  def schema
    NAMESPACE_SCHEMA
  end

  # key is coerced with to_i; a nil name is stored as the empty string.
  def write(key, casetype, name)
    record = {
      'key' => key.to_i,
      'case' => casetype,
      'name' => name || ''
    }
    encode record
  end
end
116
+
117
# Writes page records using PAGE_SCHEMA.
class PageWriter < AvroWriter
  def schema
    PAGE_SCHEMA
  end

  # Note the argument order (ns before id); id and ns are coerced with to_i,
  # the remaining values are stored as given.
  def write(ns, id, title, redirect, sha1)
    record = {
      'id' => id.to_i,
      'ns' => ns.to_i,
      'title' => title,
      'redirect' => redirect,
      'sha1' => sha1
    }
    encode record
  end
end
130
+
131
# Writes revision records using REVISION_SCHEMA.
class RevisionWriter < AvroWriter
  def schema
    REVISION_SCHEMA
  end

  # Encodes one revision.
  #
  # contributor is a Hash with :deleted, :id, :username and :ip keys (or
  # nil); comment is a Hash with :deleted and :comment keys (or nil).
  # Deleted contributors, comments and texts are stored as null.
  #
  # Raises RuntimeError when a contributor or comment is flagged as deleted
  # but still carries content.
  def write(id, page_id, n, timestamp, contributor, minor,
            comment, text_deleted, bytes, textid, text)
    contributor_value = contributor_record(contributor)
    comment_value = comment_text(comment)
    text_value = text_deleted.nil? ? text : nil

    # NOTE(review): 'minor' is true when no minor value was captured, which
    # looks inverted. Left unchanged because the right fix depends on what the
    # parser stores for a <minor/> element -- confirm before changing.
    encode 'id' => id.to_i,
           'page_id' => page_id.to_i,
           'n' => n.to_i,
           'timestamp' => timestamp,
           'contributor' => contributor_value,
           'minor' => minor.nil?,
           'comment' => comment_value,
           'bytes' => bytes.to_i,
           'textid' => textid,
           'text' => text_value
  end

  private

  # Builds the nested contributor record, or nil for a missing or deleted
  # contributor. The caller's hash is left untouched.
  def contributor_record(contributor)
    return nil if contributor.nil?

    unless contributor[:deleted].nil?
      has_content = !(contributor[:id].nil? &&
                      contributor[:username].nil? &&
                      contributor[:ip].nil?)
      raise 'deleted contributor has content' if has_content
      return nil
    end

    # The schema allows a null id (e.g. no <id> element was seen), so do not
    # let nil.to_i turn a missing id into 0.
    raw_id = contributor[:id]
    {
      'username' => contributor[:username],
      'id' => raw_id.nil? ? nil : raw_id.to_i,
      'ip' => contributor[:ip]
    }
  end

  # Returns the comment text, or nil for a missing or deleted comment.
  def comment_text(comment)
    return nil if comment.nil?
    return comment[:comment] if comment[:deleted].nil?

    raise 'deleted comment has content' if comment[:comment]
    nil
  end
end
176
+
177
# Writes LiquidThreads records using LQT_SCHEMA.
class LqtWriter < AvroWriter
  def schema
    LQT_SCHEMA
  end

  # parent and ancestor are converted with to_i only when present, so a
  # missing value stays nil; id is always converted.
  def write(subject, parent, ancestor, page, id, summary_page,
            author, edit_status, type, signature)
    record = {
      'subject' => subject,
      'parent' => parent && parent.to_i,
      'ancestor' => ancestor && ancestor.to_i,
      'page' => page,
      'id' => id.to_i,
      'summary_page' => summary_page,
      'author' => author,
      'edit_status' => edit_status,
      'type' => type,
      'signature' => signature
    }
    encode record
  end
end
199
+ end
@@ -0,0 +1,671 @@
1
+ require 'wikiavro/xml'
2
+
3
+ # RW declares schema 0.6 but has redirects as in 0.7
4
+ #
5
+ # RW has <sha1> tags right after <redirect> - they are indented to a
6
+ # different level too. The <sha1>s are missing within <revision>s
7
+ # where they should be.
8
+ #
9
+ # Schema claims discussionthreadinginfo, but actual tag is
10
+ # DiscussionThreading. Schema does not describe ThreadSummaryPage or
11
+ # ThreadSignature. Schema does not describe which LQT tags are
12
+ # omissible. Schema says thread info should always come after
13
+ # revisions, but it does not.
14
+
15
+ module WikiAvro::MediaWiki
16
# Debug writer: prints each namespace to standard output instead of storing it.
class NamespacePrinter
  def write(key, casetype, name)
    puts format('namespace %s: "%s" %s', key, name, casetype)
  end
end
21
+
22
# Debug writer: prints each page to standard output instead of storing it.
class PagePrinter
  def write(ns, id, title, redirect, sha1)
    puts format('page "%s": %s %s %s %s', title, id, ns, redirect, sha1)
  end
end
27
+
28
# Debug writer: prints a one-line summary of each revision to standard output.
class RevisionPrinter
  # Only page_id, n, timestamp, bytes and the contributor's :username are
  # shown; the remaining arguments are accepted and ignored.
  def write(id, page_id, n, timestamp, contributor, minor,
            comment, text_deleted, bytes, textid, text)
    user = contributor[:username]
    puts "rev #{page_id} #{n}: #{timestamp} #{bytes} #{user}"
  end
end
35
+
36
# Debug writer: prints a one-line summary of each LiquidThreads record.
class LqtPrinter
  # Subject, parent, ancestor, author, edit status and type are shown; the
  # page, id, summary page and signature are accepted and ignored.
  def write(threadSubject, threadParent, threadAncestor,
            threadPage, threadID, threadSummaryPage,
            threadAuthor, threadEditStatus, threadType,
            threadSignature)
    shown = format('%s %s %s %s %s %s', threadSubject, threadParent,
                   threadAncestor, threadAuthor, threadEditStatus, threadType)
    puts "thread #{shown}"
  end
end
45
+
46
# Writer that accepts any message and discards it; used by WikiWriter as the
# default for outputs the caller did not supply.
class NullWriter
  # Every unknown method is accepted and returns nil.
  def method_missing(target, *args, &block)
    nil
  end

  # Keep respond_to? consistent with method_missing: every message is handled.
  def respond_to_missing?(target, include_private = false)
    true
  end
end
54
+
55
# Fans parser events out to the individual writers and the progress logger.
class WikiWriter
  # writers is a Hash with optional :namespace, :logger, :page, :revision and
  # :lqt entries. Missing entries fall back to NamespacePrinter (namespaces
  # are printed), NoProgress (no reporting) and NullWriter (output discarded).
  def initialize(writers)
    @namespace = writers[:namespace] || NamespacePrinter.new
    @logger = writers[:logger] || NoProgress.new
    @page = writers[:page] || NullWriter.new
    @revision = writers[:revision] || NullWriter.new
    @lqt = writers[:lqt] || NullWriter.new
  end

  def namespace(key, casetype, name)
    @namespace.write(key, casetype, name)
  end

  # Counts the page with the logger, then forwards it to the page writer.
  def page(ns, id, title, redirect, sha1)
    @logger.report_pages(1)
    @page.write(ns, id, title, redirect, sha1)
  end

  # Counts the revision with the logger, then forwards it to the revision
  # writer.
  def revision(id, page_id, n, timestamp, contributor, minor,
               comment, text_deleted, bytes, textid, text)
    @logger.report_revisions(1)
    @revision.write(id, page_id, n, timestamp, contributor, minor,
                    comment, text_deleted, bytes, textid, text)
  end

  def lqt(threadSubject, threadParent, threadAncestor,
          threadPage, threadID, threadSummaryPage,
          threadAuthor, threadEditStatus, threadType,
          threadSignature)
    @lqt.write(threadSubject, threadParent, threadAncestor,
               threadPage, threadID, threadSummaryPage,
               threadAuthor, threadEditStatus, threadType,
               threadSignature)
  end

  # Tells the logger that parsing has finished.
  def done
    @logger.report_done
  end

  # Records an element the parser skipped over, by tag name.
  def skipped(name)
    @logger.report_skipped_element(name)
  end
end
101
+
102
# Logger that ignores every progress event (selected by --quiet).
class NoProgress
  def report_pages(n); end

  def report_revisions(n); end

  def report_done; end

  def report_skipped_element(name); end
end
115
+
116
# Logger that counts pages, revisions and skipped elements while parsing and
# prints a summary once parsing is done.
class FinalProgress
  def initialize
    @start_time = Time.now
    @pages = 0
    @revisions = 0
    @skipped_counts = Hash.new 0
  end

  # Formats a number with ',' between groups of three digits, e.g.
  # 1234567 -> "1,234,567". A value below 1 is returned as-is (not as a
  # String).
  def f(n)
    groups = []
    while n >= 1
      groups.unshift(n % 1000)
      n /= 1000
    end

    leading = groups.shift
    if groups.empty?
      leading ? leading.to_s : n
    else
      padded = groups.map { |g| sprintf('%03d', g) }
      [leading, padded.join(',')].join(',')
    end
  end

  # Number of skipped elements across all tag names.
  def total_skipped
    @skipped_counts.values.inject(0) { |sum, count| sum + count }
  end

  def report_pages(n)
    @pages += n
  end

  def report_revisions(n)
    @revisions += n
  end

  def report_skipped_element(name)
    @skipped_counts[name] += 1
  end

  # Prints one "name: count" line per skipped tag name.
  def show_skipped
    @skipped_counts.each do |tag, count|
      puts "#{tag}: #{count}"
    end
  end

  # Prints the skipped-element breakdown (if any) followed by the elapsed
  # time and the average number of revisions per second.
  def report_done
    elapsed = Time.now - @start_time
    avg_rate = @revisions / elapsed
    hours = (elapsed / 60 / 60).floor
    minutes = (elapsed % (60 * 60) / 60).floor
    seconds = (elapsed % 60).floor
    # FIXME: Print to STDERR or some log
    skipped = total_skipped
    if skipped > 0
      puts "Couldn't process #{skipped} elements! Detailed breakdown:"
      show_skipped
    end
    puts "Done! Took #{hours}h#{minutes}m#{seconds}s. Averaged #{f avg_rate.round(0)} rps."
  end
end
179
+
180
# FinalProgress that additionally prints an interim report every time
# +interval+ further revisions have been counted.
class RevisionProgress < FinalProgress
  def initialize(interval)
    super()
    @interval = interval
    @previous_time = @start_time
    @previous_revisions = 0
  end

  # Prints page/revision totals, the rate since the previous announcement and
  # the skipped-element breakdown, then restarts the measuring window.
  def announce_progress
    now = Time.now
    since_last = @revisions - @previous_revisions
    rps = since_last / (now - @previous_time)
    puts "Page #{f @pages}, rev #{f @revisions} (#{f rps.round(0)} rps)"
    skipped = total_skipped
    puts "#{f skipped} unprocessable elements so far."
    show_skipped
    @previous_time = now
    @previous_revisions = @revisions
  end

  def report_revisions(n)
    super(n)

    announce_progress if @revisions - @previous_revisions >= @interval
  end

  # Emits a last interim report before the final summary.
  def report_done
    announce_progress
    super
  end
end
212
+
213
# Parses one <namespace> element and reports it through the writer's
# #namespace callback.
class Namespace < WikiAvro::XML::Leaf
  def name
    'namespace'
  end

  def reset
    # nothing to clear: @key and @case are overwritten on every parse
  end

  # Remembers the key and case attributes of the opening tag.
  def parse_attributes(w, p, r)
    @key = r['key']
    @case = r['case']
  end

  # Reads the element text, moves past the element without counting it as
  # skipped, and hands key, case and text to the writer.
  def parse_content(w, p, r)
    label = r.read_string
    WikiAvro::XML.skip_tag(w, r, false)
    w.namespace(@key, @case, label)
  end
end
233
+
234
# Stream that accepts consecutive <namespace> elements.
class NamespaceStream < WikiAvro::XML::Stream
  def initialize
    super [Namespace.new]
  end
end
239
+
240
# Inserter for <sitename>: the element text goes to the parent's sitename=.
class Sitename < WikiAvro::XML::Inserter
  def initialize
    super 'sitename'
  end
end
245
+
246
# Inserter for <base>: the element text goes to the parent's base=.
class Base < WikiAvro::XML::Inserter
  def initialize
    super 'base'
  end
end
251
+
252
# Inserter for <generator>: the element text goes to the parent's generator=.
class Generator < WikiAvro::XML::Inserter
  def initialize
    super 'generator'
  end
end
257
+
258
# Inserter for <case>: the element text goes to the parent's case=.
class Case < WikiAvro::XML::Inserter
  def initialize
    super 'case'
  end
end
263
+
264
# The <namespaces> container; its only child is a NamespaceStream.
class Namespaces < WikiAvro::XML::Element
  def initialize
    super [NamespaceStream.new]
  end

  def name
    'namespaces'
  end
end
273
+
274
# The <siteinfo> header. Its Inserter children store sitename, base,
# generator and case on this object; <namespaces> is parsed after them.
class SiteInfo < WikiAvro::XML::Element
  attr_accessor :sitename, :base, :generator, :case

  def initialize
    children = [Sitename.new, Base.new, Generator.new,
                Case.new, Namespaces.new]
    super(children)
  end

  def name
    'siteinfo'
  end

  # Clears the values captured by the previous parse.
  def reset
    @sitename = @base = @generator = @case = nil
  end
end
296
+
297
# Inserter for <title>: the element text goes to the parent's title=.
class Title < WikiAvro::XML::Inserter
  def initialize
    super 'title'
  end
end
302
+
303
# Inserter for <ns>: the element text goes to the parent's ns=.
class Ns < WikiAvro::XML::Inserter
  def initialize
    super 'ns'
  end
end
308
+
309
# Inserter for <id>: the element text goes to the parent's id=. Used as a
# child of Page and Revision and inside ContributorGroup.
class Id < WikiAvro::XML::Inserter
  def initialize
    super 'id'
  end
end
314
+
315
# Parses a <redirect> element: only its title attribute is used.
class Redirect < WikiAvro::XML::Leaf
  def name
    'redirect'
  end

  # Stores the redirect target (the title attribute) on the parent.
  def parse_attributes(w, p, r)
    p.redirect = r['title']
  end
end
325
+
326
# Inserter for <sha1>: the element text goes to the parent's sha1=.
class Sha1 < WikiAvro::XML::Inserter
  def initialize
    super 'sha1'
  end
end
331
+
332
# Group of page children that may each appear zero times or once:
# <redirect> and <sha1>.
class PageFlags < WikiAvro::XML::Group
  def initialize
    specs = [Redirect.new, Sha1.new].map do |element|
      {:element => element, :min => 0, :max => 1}
    end
    super specs
  end
end
338
+
339
# Inserter for <timestamp>: the element text goes to the parent's timestamp=.
class Timestamp < WikiAvro::XML::Inserter
  def initialize
    super 'timestamp'
  end
end
344
+
345
# Inserter for <username>: the element text goes to the parent's username=.
class Username < WikiAvro::XML::Inserter
  def initialize
    super 'username'
  end
end
350
+
351
# Inserter for <ip>: the element text goes to the parent's ip=.
class Ip < WikiAvro::XML::Inserter
  def initialize
    super 'ip'
  end
end
356
+
357
# Group of contributor children that may each appear zero times or once:
# <username>, <id> and <ip>. The group as a whole is optional, so an empty
# <contributor/> is accepted.
class ContributorGroup < WikiAvro::XML::Group
  def initialize
    specs = [Username.new, Id.new, Ip.new].map do |element|
      {:element => element, :min => 0, :max => 1}
    end
    super specs
  end

  def optional?
    true
  end
end
368
+
369
# Parses a <contributor> element and stores the result on the parent as a
# Hash with :deleted, :id, :username and :ip keys.
class Contributor < WikiAvro::XML::Element
  attr_accessor :id, :username, :ip

  def initialize
    super [ContributorGroup.new]
  end

  def name
    'contributor'
  end

  # Clears everything captured for the previous contributor.
  def reset
    @id = nil
    @username = nil
    @ip = nil
    @deleted = nil
  end

  # Remembers the deleted attribute of the opening tag (nil when absent).
  def parse_attributes(w, p, r)
    @deleted = r['deleted']
  end

  # Publishes the collected values to the parent.
  def handle_content(w, p, r)
    p.contributor = {
      :deleted => @deleted,
      :id => id,
      :username => username,
      :ip => ip
    }
  end
end
398
+
399
# Inserter for <minor>: the element's string content goes to the parent's
# minor=.
class Minor < WikiAvro::XML::Inserter
  def initialize
    super 'minor'
  end
end
404
+
405
# Parses a <comment> element and stores it on the parent as a Hash with
# :deleted (the deleted attribute) and :comment (the element text).
class Comment < WikiAvro::XML::Leaf
  def name
    'comment'
  end

  # Both the attribute and the text are read while the reader is still on the
  # opening tag.
  def parse_attributes(w, p, r)
    flag = r['deleted']
    body = r.read_string
    p.comment = {:deleted => flag, :comment => body}
  end
end
417
+
418
# Group of revision children that may each appear zero times or once:
# <minor> and <comment>.
class RevisionFlags < WikiAvro::XML::Group
  def initialize
    specs = [Minor.new, Comment.new].map do |element|
      {:element => element, :min => 0, :max => 1}
    end
    super specs
  end
end
424
+
425
# Inserter for <text>: the element text goes to the parent's text=, and the
# deleted, id and bytes attributes are copied to the parent as well.
class Text < WikiAvro::XML::Inserter
  def initialize
    super 'text'
  end

  def parse_attributes(w, p, r)
    p.text_deleted = r['deleted']
    p.textid = r['id']
    p.bytes = r['bytes']
  end
end
436
+
437
# Parses one <revision> element and reports it through the writer's
# #revision callback. The child parsers fill in the accessors below.
class Revision < WikiAvro::XML::Element
  attr_accessor :id
  attr_accessor :timestamp
  attr_accessor :contributor
  attr_accessor :minor
  attr_accessor :comment
  attr_accessor :text_deleted
  attr_accessor :bytes
  attr_accessor :textid
  attr_accessor :text

  def name
    'revision'
  end

  # Clears everything captured for the previous revision. The instance is
  # reused, so the instance variables must be reset directly: a bare
  # "id = nil" only creates a local variable and would let optional values
  # (minor, comment, ...) leak from one revision into the next.
  def reset
    @id = nil
    @timestamp = nil
    @contributor = nil
    @minor = nil
    @comment = nil
    @text_deleted = nil
    @bytes = nil
    @textid = nil
    @text = nil
  end

  # Numbers the revision within its page (1-based, via the parent's
  # revision_count) and hands all collected values to the writer.
  def handle_content(w, p, r)
    p.revision_count += 1
    n = p.revision_count
    # <comment> is optional; pass an empty comment hash rather than nil so
    # writers can index it unconditionally.
    comment_info = comment || {:deleted => nil, :comment => nil}
    w.revision(id, p.id, n, timestamp, contributor, minor,
               comment_info, text_deleted, bytes, textid, text)
  end

  def initialize
    super([Id.new, Timestamp.new, Contributor.new,
           RevisionFlags.new, Text.new])
  end
end
476
+
477
# Stream that accepts consecutive <revision> elements.
class RevStream < WikiAvro::XML::Stream
  def initialize
    super [Revision.new]
  end
end
482
+
483
# Inserter for <ThreadSubject>: the text goes to the parent's threadSubject=.
class ThreadSubject < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadSubject', 'threadSubject'
  end
end
488
+
489
# Inserter for <ThreadParent>: the text goes to the parent's threadParent=.
class ThreadParent < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadParent', 'threadParent'
  end
end
494
+
495
# Inserter for <ThreadAncestor>: the text goes to the parent's
# threadAncestor=.
class ThreadAncestor < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadAncestor', 'threadAncestor'
  end
end
500
+
501
# Optional group: <ThreadParent> and <ThreadAncestor>, each allowed zero
# times or once.
class ThreadParentGroup < WikiAvro::XML::Group
  def initialize
    specs = [ThreadParent.new, ThreadAncestor.new].map do |element|
      {:element => element, :min => 0, :max => 1}
    end
    super specs
  end

  def optional?
    true
  end
end
511
+
512
# Inserter for <ThreadPage>: the text goes to the parent's threadPage=.
class ThreadPage < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadPage', 'threadPage'
  end
end
517
+
518
# Inserter for <ThreadID>: the text goes to the parent's threadID=.
class ThreadID < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadID', 'threadID'
  end
end
523
+
524
# Inserter for <ThreadSummaryPage>: the text goes to the parent's
# threadSummaryPage=.
class ThreadSummaryPage < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadSummaryPage', 'threadSummaryPage'
  end
end
529
+
530
# Optional group holding at most one <ThreadSummaryPage>.
class ThreadSummaryPageGroup < WikiAvro::XML::Group
  def initialize
    summary = {:element => ThreadSummaryPage.new, :min => 0, :max => 1}
    super [summary]
  end

  def optional?
    true
  end
end
539
+
540
# Inserter for <ThreadAuthor>: the text goes to the parent's threadAuthor=.
class ThreadAuthor < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadAuthor', 'threadAuthor'
  end
end
545
+
546
# Inserter for <ThreadEditStatus>: the text goes to the parent's
# threadEditStatus=.
class ThreadEditStatus < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadEditStatus', 'threadEditStatus'
  end
end
551
+
552
# Inserter for <ThreadType>: the text goes to the parent's threadType=.
class ThreadType < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadType', 'threadType'
  end
end
557
+
558
# Inserter for <ThreadSignature>: the text goes to the parent's
# threadSignature=.
class ThreadSignature < WikiAvro::XML::Inserter
  def initialize
    super 'ThreadSignature', 'threadSignature'
  end
end
563
+
564
# Parses one <DiscussionThreading> element (LiquidThreads metadata) and
# reports it through the writer's #lqt callback. The child Inserters fill in
# the accessors below.
class DiscussionThreading < WikiAvro::XML::Element
  attr_accessor :threadSubject, :threadParent, :threadAncestor,
                :threadPage, :threadID, :threadSummaryPage,
                :threadAuthor, :threadEditStatus, :threadType,
                :threadSignature

  def name
    'DiscussionThreading'
  end

  # Clears everything captured for the previous thread. The instance is
  # reused, so the instance variables must be reset directly: a bare
  # "threadParent = nil" only creates a local variable and would let the
  # optional parent, ancestor and summary page leak into the next thread.
  def reset
    @threadSubject = nil
    @threadParent = nil
    @threadAncestor = nil
    @threadPage = nil
    @threadID = nil
    @threadSummaryPage = nil
    @threadAuthor = nil
    @threadEditStatus = nil
    @threadType = nil
    @threadSignature = nil
  end

  # Hands all collected values to the writer.
  def handle_content(w, p, r)
    w.lqt(threadSubject, threadParent, threadAncestor,
          threadPage, threadID, threadSummaryPage,
          threadAuthor, threadEditStatus, threadType,
          threadSignature)
  end

  def initialize
    super([ThreadSubject.new, ThreadParentGroup.new, ThreadPage.new,
           ThreadID.new, ThreadSummaryPageGroup.new, ThreadAuthor.new,
           ThreadEditStatus.new, ThreadType.new, ThreadSignature.new])
  end
end
599
+
600
# Optional group holding at most one <DiscussionThreading> element.
class DiscussionThreadingGroup < WikiAvro::XML::Group
  def initialize
    threading = {:element => DiscussionThreading.new, :min => 0, :max => 1}
    super [threading]
  end

  def optional?
    true
  end
end
609
+
610
# Parses one <page> element and reports it through the writer's #page
# callback once all of its children (including revisions) have been parsed.
class Page < WikiAvro::XML::Element
  attr_accessor :title
  attr_accessor :ns
  attr_accessor :id
  attr_accessor :redirect
  attr_accessor :sha1
  attr_accessor :revision_count

  def name
    'page'
  end

  # Clears everything captured for the previous page and restarts the
  # revision counter. The instance is reused, so the instance variables must
  # be reset directly: a bare "redirect = nil" only creates a local variable
  # and would let an optional redirect or sha1 leak onto later pages.
  def reset
    @title = nil
    @ns = nil
    @id = nil
    @redirect = nil
    @sha1 = nil
    @revision_count = 0
  end

  def handle_content(w, p, r)
    w.page(ns, id, title, redirect, sha1)
  end

  # Revisions may come both before and after the optional
  # <DiscussionThreading> element, hence the two RevStream children.
  def initialize
    super([Title.new, Ns.new, Id.new, PageFlags.new,
           RevStream.new, DiscussionThreadingGroup.new,
           RevStream.new])
  end
end
642
+
643
# Stream that accepts consecutive <page> elements.
class PageStream < WikiAvro::XML::Stream
  def initialize
    super [Page.new]
  end
end
648
+
649
# Root <mediawiki> element of a dump: a <siteinfo> header followed by a
# stream of pages. Calls the writer's #done once everything is parsed.
class WikiDump < WikiAvro::XML::Element
  attr_reader :version

  def name
    'mediawiki'
  end

  protected

  def initialize
    super [SiteInfo.new, PageStream.new]
  end

  # Records the dump's version attribute and warns when it is not 0.6.
  def parse_attributes(w, p, r)
    @version = r['version']
    warn 'dump version != 0.6' unless @version == '0.6'
  end

  def handle_content(w, p, r)
    w.done
  end
end
671
+ end
@@ -0,0 +1,282 @@
1
+ # Parser functions will assume to possibly start on their opening tag,
2
+ # and stop parsing right after their end tag. There're probably loads
3
+ # of bugs waiting for when you nest tags of the same name.
4
+
5
+ module WikiAvro::XML
6
# Advances +reader+ until it sits on an element tag.
#
# Returns true when positioned on an opening tag and false when positioned
# on a closing tag; the current node is examined before anything is read.
# Raises EOFError when the input runs out first.
def self.to_tag(reader)
  loop do
    kind = reader.node_type
    return true if kind == XML::Reader::TYPE_ELEMENT
    return false if kind == XML::Reader::TYPE_END_ELEMENT

    break unless reader.read
  end

  # XML::Reader will probably raise its own exception before we ever
  # could get here
  raise EOFError.new('no opening tag')
end
25
+
26
+ # Do not call this while you are on the opening tag
27
# Consumes the remainder of the element called +name+, leaving +reader+ on
# the node after its closing tag. Every opening tag met on the way is
# reported to +writer+ as skipped. Nested elements with the same name are
# tracked so the matching closing tag is the one that ends the scan.
#
# Raises EOFError when the input runs out before the closing tag is found
# (previously this looped forever).
def self.exit_tag(writer, reader, name)
  nest = 1

  loop do
    case reader.node_type
    when XML::Reader::TYPE_ELEMENT
      writer.skipped(reader.name)
      nest += 1 if reader.name == name
    when XML::Reader::TYPE_END_ELEMENT
      nest -= 1 if reader.name == name
    end

    # Always step past the node just examined, including the closing tag.
    more = reader.read
    break if nest == 0
    raise EOFError.new("no closing tag for #{name}") unless more
  end
end
49
+
50
+ # Call this to skip when reader is on the opening tag
51
# Skips the element +reader+ is currently on (it must be on the opening
# tag), leaving the reader on the node after the element.
#
# When +skipping+ is truthy the element itself and every element nested in
# it are reported to +writer+ as skipped. Previously the element itself was
# only reported when it was empty, so non-empty skipped elements went
# uncounted. Nested elements with the same name are tracked so the matching
# closing tag ends the scan.
def self.skip_tag(writer, reader, skipping)
  name = reader.name
  writer.skipped(name) if skipping

  if reader.empty_element?
    # Nothing inside: just step past the tag.
    reader.read
    return
  end

  nest = 1
  while reader.read
    case reader.node_type
    when XML::Reader::TYPE_ELEMENT
      writer.skipped(reader.name) if skipping
      nest += 1 if reader.name == name
    when XML::Reader::TYPE_END_ELEMENT
      nest -= 1 if reader.name == name
    end

    if nest == 0
      # Step past the closing tag.
      reader.read
      break
    end
  end
end
80
+
81
# Raised when a required element is not found. Derives from StandardError
# (not Exception) so that an ordinary rescue clause can handle it.
class MissingElement < StandardError
end
83
+
84
# Advances +reader+ to the opening tag of the element called +name+,
# skipping (and reporting to +writer+) every other element met on the way.
#
# Raises MissingElement when a closing tag or the end of input is reached
# first.
def self.to_element(writer, reader, name)
  while WikiAvro::XML::to_tag(reader)
    return if reader.name == name

    WikiAvro::XML.skip_tag(writer, reader, true)

    # NOTE(review): skip_tag already leaves the reader after the skipped
    # element; this extra read presumably relies on a text node (whitespace)
    # following it -- confirm for input without whitespace between tags.
    break unless reader.read
  end

  raise MissingElement.new(name)
end
100
+
101
# Base class for parsers of a single XML element with an ordered list of
# child parsers. Instances are reused from one occurrence of the element to
# the next.
class Element
  # Whatever parse_attributes returned during the most recent parse.
  attr_reader :attr

  # Tag name handled by this parser; subclasses must provide it.
  def name
    raise NotImplementedError.new('name')
  end

  # Whether a parent may be an empty element without this child.
  def optional?
    false
  end

  # Parses one occurrence of the element: moves to its opening tag, then
  # runs reset, parse_attributes, parse_content and handle_content in turn.
  #
  # With a nil +parent+ the reader must already be on this element, otherwise
  # RuntimeError is raised.
  def parse(output, parent, reader)
    if parent.nil? && reader.name != self.name
      raise RuntimeError.new('reader.name != self.name')
    end

    WikiAvro::XML::to_element(output, reader, self.name)

    reset
    @attr = parse_attributes(output, parent, reader)
    parse_content(output, parent, reader)
    handle_content(output, parent, reader)
  end

  protected

  # Instances will be reused. Subclasses that keep state which needs
  # to be discarded after each parse should implement this.
  def reset
  end

  def parse_attributes(w, p, r)
    # no attributes parsed
  end

  # parse_content should move the reader away from the children's
  # parent's opening tag. It should leave reader positioned after
  # the closing tag.
  def parse_content(w, p, r)
    if r.empty_element?
      # An empty element is only acceptable when no child is required.
      required = @children.find { |child| !child.optional? }
      raise MissingElement.new(required.name) if required
      r.read
      return
    end

    # Move away from our opening tag
    r.read
    @children.each { |child| child.parse(w, self, r) }

    if r.empty_element? && r.name == self.name
      r.read
    else
      # Consume whatever is left up to and including our closing tag.
      WikiAvro::XML.exit_tag(w, r, self.name)
    end
  end

  def handle_content(w, p, r)
    # nothing done
  end

  private

  def initialize(children)
    @children = children
  end
end
176
+
177
# An Element without child parsers.
class Leaf < Element
  def initialize
    super []
  end
end
182
+
183
# Leaf that copies an element's string content onto the parent parser by
# calling the parent's "<target>=" writer.
class Inserter < Leaf
  attr_reader :name

  # name is the tag to match; target (defaulting to the tag name) is the
  # parent attribute that receives the value.
  def initialize(name, target=name)
    super()
    @name = name
    @writer = (target + '=').to_sym
  end

  # Reads the content, assigns it on the parent, then moves past the element
  # without counting it as skipped.
  def parse_content(w, p, r)
    value = r.read_string
    p.send(@writer, value)
    WikiAvro::XML.skip_tag(w, r, false)
  end
end
199
+
200
# Parses a run of consecutive elements, dispatching each one by tag name to
# the matching element parser.
class Stream
  def optional?
    true
  end

  # Consumes elements until a tag without a registered parser (or a closing
  # tag) is reached; the reader is left on that tag.
  def parse(output, parent, reader)
    while WikiAvro::XML::to_tag(reader)
      handler = @elements[reader.name]
      return if handler.nil?

      handler.parse(output, parent, reader)
      reader.read
    end
  end

  private

  # elements: parsers responding to #name and #parse, indexed by tag name.
  def initialize(elements)
    @elements = elements.each_with_object({}) do |element, by_name|
      by_name[element.name] = element
    end
  end
end
231
+
232
# Raised by Group when an element occurs more often than its :max allows.
# Derives from StandardError (not Exception) so that an ordinary rescue
# clause can handle it.
class TooManyElements < StandardError
end
234
+
235
# Raised by Group when an element occurs less often than its :min requires.
# Derives from StandardError (not Exception) so that an ordinary rescue
# clause can handle it.
class TooFewElements < StandardError
end
237
+
238
# Parses a set of elements that may appear in any order, each with its own
# :min and :max occurrence count.
class Group
  # remember to override this if untrue, especially if it might be
  # within an empty element
  def optional?
    false
  end

  # Consumes elements belonging to the group until a tag outside the group
  # (or a closing tag) is reached.
  #
  # Raises TooManyElements as soon as an element exceeds its :max, and
  # TooFewElements when a foreign tag is reached before an element has met
  # its :min.
  def parse(output, parent, reader)
    @n.keys.each { |key| @n[key] = 0 }

    while WikiAvro::XML::to_tag(reader)
      tag = reader.name
      spec = @elements[tag]

      if spec.nil?
        verify_counts
        return
      end

      @n[tag] += 1
      raise TooManyElements.new(tag) if @n[tag] > spec[:max]
      spec[:element].parse(output, parent, reader)
      reader.read
    end
  end

  # elements: array of hashes with :element (the parser), :min and :max.
  def initialize(elements)
    @elements = {}
    @n = Hash.new 0
    elements.each do |spec|
      tag = spec[:element].name
      @elements[tag] = spec
      @n[tag] = 0
    end
  end

  private

  # Checks every element's occurrence count against its limits.
  def verify_counts
    @elements.each do |tag, spec|
      raise TooFewElements.new(tag) if @n[tag] < spec[:min]
      # this ought to be a redundant check
      raise TooManyElements.new(tag) if @n[tag] > spec[:max]
    end
  end
end
282
+ end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wikiavro
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Someon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: libxml-ruby
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: avro
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.7'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.7'
41
+ description:
42
+ email: someon@openmailbox.org
43
+ executables:
44
+ - wikiavro
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - bin/wikiavro
49
+ - lib/wikiavro.rb
50
+ - lib/wikiavro/avro.rb
51
+ - lib/wikiavro/mediawiki.rb
52
+ - lib/wikiavro/xml.rb
53
+ homepage:
54
+ licenses:
55
+ - GPL-3.0+
56
+ metadata: {}
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ requirements: []
72
+ rubyforge_project:
73
+ rubygems_version: 2.2.2
74
+ signing_key:
75
+ specification_version: 4
76
+ summary: Convert MediaWiki XML dumps to Avro
77
+ test_files: []