infoboxer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.dokaz +1 -0
- data/.yardopts +1 -0
- data/LICENSE.txt +22 -0
- data/Parsing.md +33 -0
- data/README.md +115 -0
- data/examples/output/.gitkeep +0 -0
- data/examples/pages/argentina.wiki +808 -0
- data/examples/to_text.rb +8 -0
- data/examples/tree.rb +8 -0
- data/infoboxer.gemspec +43 -0
- data/lib/infoboxer.rb +196 -0
- data/lib/infoboxer/core_ext.rb +10 -0
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
- data/lib/infoboxer/media_wiki.rb +162 -0
- data/lib/infoboxer/media_wiki/page.rb +38 -0
- data/lib/infoboxer/media_wiki/traits.rb +60 -0
- data/lib/infoboxer/navigation.rb +84 -0
- data/lib/infoboxer/navigation/lookup.rb +216 -0
- data/lib/infoboxer/navigation/sections.rb +179 -0
- data/lib/infoboxer/navigation/selector.rb +59 -0
- data/lib/infoboxer/navigation/shortcuts.rb +165 -0
- data/lib/infoboxer/parser.rb +71 -0
- data/lib/infoboxer/parser/context.rb +165 -0
- data/lib/infoboxer/parser/html.rb +58 -0
- data/lib/infoboxer/parser/image.rb +59 -0
- data/lib/infoboxer/parser/inline.rb +142 -0
- data/lib/infoboxer/parser/paragraphs.rb +66 -0
- data/lib/infoboxer/parser/table.rb +132 -0
- data/lib/infoboxer/parser/template.rb +47 -0
- data/lib/infoboxer/parser/util.rb +73 -0
- data/lib/infoboxer/templates.rb +10 -0
- data/lib/infoboxer/templates/base.rb +82 -0
- data/lib/infoboxer/templates/set.rb +72 -0
- data/lib/infoboxer/tree.rb +70 -0
- data/lib/infoboxer/tree/compound.rb +81 -0
- data/lib/infoboxer/tree/document.rb +11 -0
- data/lib/infoboxer/tree/html.rb +76 -0
- data/lib/infoboxer/tree/image.rb +53 -0
- data/lib/infoboxer/tree/inline.rb +39 -0
- data/lib/infoboxer/tree/list.rb +160 -0
- data/lib/infoboxer/tree/node.rb +181 -0
- data/lib/infoboxer/tree/nodes.rb +185 -0
- data/lib/infoboxer/tree/paragraphs.rb +122 -0
- data/lib/infoboxer/tree/ref.rb +34 -0
- data/lib/infoboxer/tree/table.rb +89 -0
- data/lib/infoboxer/tree/template.rb +82 -0
- data/lib/infoboxer/tree/text.rb +60 -0
- data/lib/infoboxer/tree/wikilink.rb +83 -0
- data/lib/infoboxer/version.rb +4 -0
- data/profile/out/.gitkeep +0 -0
- data/profile/pages/argentina.txt +808 -0
- data/profile/pages/canada.wiki +544 -0
- data/profile/pages/ukraine.wiki +1006 -0
- data/profile/pages/usa.wiki +843 -0
- data/regression/pages/canada.wiki +544 -0
- data/regression/pages/chiang_mai.wiki +2615 -0
- data/regression/pages/south_america.wiki +640 -0
- data/regression/pages/ukraine.wiki +1006 -0
- data/regression/pages/usa.wiki +843 -0
- metadata +272 -0
@@ -0,0 +1,165 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
# Line-oriented cursor over wiki markup.
#
# The text is split into lines; a StringScanner walks the current line,
# while #next! / #prev! move between lines. The last match and the
# unscanned rest of the line are memoized, because the parser queries
# them very often.
class Context
  # Zero-based index of the current line.
  attr_reader :lineno
  # Traits object passed to the constructor (or the default one).
  attr_reader :traits
  # Lines after the current one; nil once the cursor is past the last line.
  attr_reader :next_lines

  # text::   source markup
  # traits:: optional traits; MediaWiki::Traits.default is used when nil
  def initialize(text, traits = nil)
    # `.*?` (not `.+?`) so that the empty comment `<!---->` is removed too,
    # instead of swallowing everything up to the next `-->`.
    # Line breaks are split as units, so CRLF input does not produce a
    # spurious empty line after every real line.
    @lines = text.
      gsub(/<!--.*?-->/m, ''). # FIXME: will also kill comments inside <nowiki> tag
      split(/\r\n|\r|\n/)
    @lineno = -1
    @traits = traits || MediaWiki::Traits.default
    @scanner = StringScanner.new('')
    next!
  end

  # Scanner position inside the current line; 0 when there is no current line.
  def colno
    @scanner && @scanner.pos || 0
  end

  # Text of the last successful match (memoized until the next scan).
  def matched
    @matched ||= @scanner && @scanner.matched
  end

  # check which works only once
  def eat_matched?(str)
    return false unless matched == str
    @matched = 'DUMMY'
    true
  end

  # Unscanned remainder of the current line (nil when there is no line).
  def rest
    @rest ||= @scanner && @scanner.rest
  end

  alias_method :current, :rest

  # lines navigation
  def next!
    shift(+1)
  end

  def prev!
    shift(-1)
  end

  def eof?
    !next_lines || # we are after the file end
      next_lines.empty? && eol?
  end

  def inspect
    "#<Context(line #{lineno} of #{@lines.count}: #{current})>"
  end

  # scanning
  def scan(re)
    res = @scanner.scan(re)
    forget_memoized!
    res
  end

  def check(re)
    res = @scanner.check(re)
    forget_memoized!
    res
  end

  def skip(re)
    res = @scanner.skip(re)
    forget_memoized!
    res
  end

  # Scans the current line up to (and including) the first match of +re+.
  #
  # Returns the scanned text without the terminator, or with it when
  # +leave_pattern+ is true; nil when +re+ is not found on this line.
  # Raises ParsingError when called after the end of input.
  def scan_until(re, leave_pattern = false)
    guard_eof!

    res = _scan_until(re)
    return res if res.nil? || leave_pattern

    # The terminator is always the tail of the chunk. Cutting by length
    # avoids removing an earlier, identical piece of text.
    res[0, res.length - matched.length]
  end

  def inline_eol?(exclude = nil)
    # not using StringScanner#check, as it will change #matched value
    eol? ||
      (current =~ %r[^(</ref>|}})] &&
        (!exclude || $1 !~ exclude)) # FIXME: ugly, but no idea of prettier solution
  end

  # Like #scan_until, but continues onto following lines (joined with
  # "\n") until +re+ is found.
  #
  # Raises ParsingError when the input ends before +re+ is found.
  def scan_continued_until(re, leave_pattern = false)
    guard_eof!
    res = ''.dup

    loop do
      chunk = _scan_until(re)
      # Success is decided by the scan result itself, not by re-matching
      # #matched against +re+.
      if chunk
        res << chunk
        break
      end

      res << rest << "\n"
      next!
      fail!("Unfinished scan: #{re} not found") if eof?
    end

    res[/#{re}\Z/] = '' unless leave_pattern
    res
  end

  # state inspection
  def matched_inline?(re)
    re.nil? ? (matched.empty? && eol?) : matched =~ re
  end

  def matched?(re)
    re && matched =~ re
  end

  def eol?
    !current || current.empty?
  end

  # basic services
  def fail!(text)
    fail(ParsingError, "#{text} at line #{@lineno}:\n\t#{current}")
  end

  private

  # we do hard use of #matched and #rest, its wiser to memoize them
  def _scan_until(re)
    res = @scanner.scan_until(re)
    forget_memoized!
    res
  end

  # Drops memoized #matched / #rest after the scanner state has changed.
  def forget_memoized!
    @matched = nil
    @rest = nil
  end

  def guard_eof!
    fail!("End of input reached") unless @scanner
  end

  # Moves the cursor by +amount+ lines and resets the scanner state.
  def shift(amount)
    @lineno += amount
    # A negative index must not wrap around to the end of the array.
    line = lineno < 0 ? nil : @lines[lineno]
    @next_lines = @lines[[lineno + 1, 0].max..-1]

    if line
      # The scanner is dropped when we run off the input; recreate it so
      # that stepping back with #prev! works.
      @scanner ||= StringScanner.new('')
      @scanner.string = line
    else
      @scanner = nil
    end

    @rest = line
    @matched = nil
  end
end
|
164
|
+
end
|
165
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
# Parsing of HTML-like tags met inside wiki markup.
#
# None of the patterns below include the leading "<", so presumably the
# caller has already consumed it -- confirm against the call site.
module HTML
  include Tree

  # Recognizes the kind of tag at the scanner position and parses it.
  # Returns nil when the text ahead does not look like a tag at all.
  def html
    if @context.check(/\/[a-z]+>/)
      html_closing_tag
    elsif @context.check(/br\s*>/)
      html_br
    elsif @context.check(%r{[a-z]+[^/>]*/>})
      html_auto_closing_tag
    elsif @context.check(/[a-z]+[^>\/]*>/)
      html_opening_tag
    end
  end

  # "/name>" => HTMLClosingTag
  def html_closing_tag
    @context.skip(/\//)
    name = @context.scan(/[a-z]+/)
    @context.skip(/>/)
    HTMLClosingTag.new(name)
  end

  # "br>" (optionally with spaces before ">") => attribute-less br tag
  def html_br
    @context.skip(/br\s*>/)
    HTMLTag.new('br', {})
  end

  # "name attrs/>" => HTMLTag without children
  def html_auto_closing_tag
    name = @context.scan(/[a-z]+/)
    raw_attrs = @context.scan(%r{[^/>]*})
    @context.skip(%r{/>})
    HTMLTag.new(name, parse_params(raw_attrs))
  end

  # "name attrs>" followed by inline content.
  #
  # When the matching closing tag was reached, returns a single HTMLTag
  # wrapping the content; otherwise returns an array made of a bare
  # HTMLOpeningTag followed by the nodes that were read.
  def html_opening_tag
    name = @context.scan(/[a-z]+/)
    raw_attrs = @context.scan(/[^>]+/)
    @context.skip(/>/)
    children = short_inline(/<\/#{name}>/)

    unless @context.matched =~ /<\/#{name}>/
      return [
        HTMLOpeningTag.new(name, parse_params(raw_attrs)),
        *children
      ]
    end

    HTMLTag.new(name, parse_params(raw_attrs), children)
  end
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
# Parsing of image links: file prefix, path, then "|"-separated attributes,
# closed by "]]".
module Image
  include Tree

  # Parses an image whose file prefix starts at the scanner position.
  # Fails with a parsing error when the prefix is not there.
  def image
    unless @context.skip(re.file_prefix)
      @context.fail!("Something went wrong: it's not image?")
    end

    path = @context.scan_until(/\||\]\]/)
    attrs = @context.matched == '|' ? image_attrs : {}
    Tree::Image.new(path, attrs)
  end

  # Reads "|"-separated attribute chunks up to the closing "]]" and merges
  # them into one hash; nil and empty values are dropped.
  def image_attrs
    chunks = [long_inline(/\||\]\]/)]
    chunks << long_inline(/\||\]\]/) until @context.matched == ']]'

    merged = chunks.
      map { |chunk| image_attr(chunk) }.
      inject { |acc, attr| acc.merge(attr) }

    merged.reject { |_name, value| value.nil? || value.empty? }
  end

  # Converts one attribute chunk into a single-purpose hash.
  # A lone plain-text node is matched against the known keywords; anything
  # else is treated as the caption.
  def image_attr(nodes)
    # it's caption, and can have inline markup!
    return {caption: nodes} unless nodes.count == 1 && nodes.first.is_a?(Text)

    str = nodes.first.text
    case str
    when /^(thumb)(?:nail)?$/, /^(frame)(?:d)?$/
      {type: Regexp.last_match(1)}
    when 'frameless'
      {type: str}
    when 'border'
      {border: str}
    when /^(baseline|middle|sub|super|text-top|text-bottom|top|bottom)$/
      {alignment: str}
    when /^(\d*)(?:x(\d+))?px$/
      {width: Regexp.last_match(1), height: Regexp.last_match(2)}
    when /^link=(.*)$/i
      {link: Regexp.last_match(1)}
    when /^alt=(.*)$/i
      {alt: Regexp.last_match(1)}
    else
      # text-only caption
      {caption: nodes}
    end
  end
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
# Parsing of inline markup (formatting, links, templates, refs, HTML).
module Inline
  include Tree

  # Reads inline nodes, continuing over line breaks, until +until_pattern+
  # is matched (or, without a pattern, until an empty match at end of line).
  # With a pattern given, running out of input is a parsing error.
  def inline(until_pattern = nil)
    first_line = @context.lineno
    result = Nodes[]

    guarded_loop do
      result << @context.scan_until(re.inline_until_cache[until_pattern])

      break if @context.matched_inline?(until_pattern)

      unless @context.matched.empty?
        result << inline_formatting(@context.matched)
      end

      if @context.eof?
        # fail! always raises, so break is reached only without a pattern
        @context.fail!("#{until_pattern} not found, starting from #{first_line}") if until_pattern
        break
      end

      next unless @context.eol?

      result << "\n"
      @context.next!
    end

    result
  end

  # Reads inline nodes without crossing the end of the current line
  # (as reported by Context#inline_eol?).
  def short_inline(until_pattern = nil)
    result = Nodes[]

    guarded_loop do
      result << @context.scan_until(re.short_inline_until_cache[until_pattern])

      break if @context.matched_inline?(until_pattern)

      result << inline_formatting(@context.matched)

      break if @context.inline_eol?(until_pattern)
    end

    result
  end

  # Reads inline nodes; when the line ends before +until_pattern+ is met,
  # the remainder is parsed as paragraphs and appended.
  def long_inline(until_pattern = nil)
    result = Nodes[]

    guarded_loop do
      result << @context.scan_until(re.inline_until_cache[until_pattern])

      break if @context.matched?(until_pattern)

      unless @context.matched.empty?
        result << inline_formatting(@context.matched)
      end

      if @context.eof?
        # fail! always raises, so break is reached only without a pattern
        @context.fail!("#{until_pattern} not found") if until_pattern
        break
      end

      next unless @context.eol?

      @context.next!
      paragraphs(until_pattern).each { |para| result << para }
      break
    end

    result
  end

  private

  # Builds the node that corresponds to the markup token just matched.
  def inline_formatting(match)
    case match
    when "'''''"
      BoldItalic.new(short_inline(/'''''/))
    when "'''"
      Bold.new(short_inline(/'''/))
    when "''"
      Italic.new(short_inline(/''/))
    when '[['
      @context.check(re.file_prefix) ? image : wikilink
    when /\[(.+)/
      external_link(Regexp.last_match(1))
    when '{{'
      template
    when /<nowiki([^>]*)>/
      nowiki
    when /<ref([^>]*)\/>/
      reference(Regexp.last_match(1), true)
    when /<ref([^>]*)>/
      reference(Regexp.last_match(1))
    when '<'
      # it was not HTML, just accidental <
      html || Text.new(match)
    else
      match # FIXME: TEMP
    end
  end

  # http://en.wikipedia.org/wiki/Help:Link#Wikilinks
  # [[abc]]
  # [[a|b]]
  def wikilink
    target = @context.scan_continued_until(/\||\]\]/)
    caption = @context.matched == '|' ? inline(/\]\]/) : nil
    Wikilink.new(target, caption)
  end

  # http://en.wikipedia.org/wiki/Help:Link#External_links
  # [http://www.example.org]
  # [http://www.example.org link name]
  def external_link(protocol)
    address = @context.scan_continued_until(/\s+|\]/)
    caption = @context.matched =~ /\s+/ ? inline(/\]/) : nil
    ExternalLink.new(protocol + address, caption)
  end

  # <ref ...>content</ref>, or self-closed <ref ... /> when +closed+ is true.
  def reference(param_str, closed = false)
    children =
      if closed
        Nodes[]
      else
        long_inline(/<\/ref>/)
      end
    Ref.new(children, parse_params(param_str))
  end

  # Everything up to </nowiki> is taken as plain text.
  def nowiki
    raw = @context.scan_continued_until(/<\/nowiki>/)
    Text.new(raw)
  end
end
|
134
|
+
|
135
|
+
require_relative 'image'
|
136
|
+
require_relative 'html'
|
137
|
+
require_relative 'template'
|
138
|
+
include Infoboxer::Parser::Image
|
139
|
+
include Infoboxer::Parser::HTML
|
140
|
+
include Infoboxer::Parser::Template
|
141
|
+
end
|
142
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
# Parsing of block-level markup: headings, tables, lists, rules,
# preformatted text and plain paragraphs.
module Paragraphs
  include Tree

  # Reads block-level nodes line by line until the input ends or, when
  # +until_pattern+ is given, until the last match satisfies it.
  def paragraphs(until_pattern = nil)
    result = Nodes[]

    loop do
      break if @context.eof?

      result << paragraph(until_pattern)

      break if until_pattern && @context.matched?(until_pattern)

      @context.next!
    end

    result.flow_templates
  end

  private

  # Chooses the block type from the look of the current line.
  def paragraph(until_pattern)
    line = @context.current

    case line
    when /^(?<level>={2,})\s*(?<text>.+?)\s*\k<level>$/
      found = Regexp.last_match
      heading(found[:text], found[:level])
    when /^\s*{\|/
      table
    when /^[\*\#:;]./
      list(until_pattern)
    when /^-{4,}/
      HR.new
    when /^\s*$/
      # will, when merged, close previous paragraph or add spaces to <pre>
      EmptyParagraph.new(line)
    when /^ (?!\s*{{)/ # Lookahead, because spaces before template are ignored
      pre(until_pattern)
    else
      Paragraph.new(short_inline(until_pattern))
    end
  end

  # The number of "=" signs gives the heading level.
  def heading(text, level)
    children = Parser.inline(text)
    Heading.new(children, level.length)
  end

  # http://en.wikipedia.org/wiki/Help:List
  def list(until_pattern)
    marker = @context.scan(/^([*\#:;]+)\s*/).strip
    levels = marker.chars.to_a
    List.construct(levels, short_inline(until_pattern))
  end

  # FIXME: in fact, there's some formatting, that should work inside pre
  def pre(until_pattern)
    @context.skip(/^ /)

    str =
      if until_pattern
        @context.scan_until(/(#{until_pattern}|$)/)
      else
        @context.current
      end

    Pre.new(Nodes[Text.new(str)])
  end

  require_relative 'table'
  include Parser::Table
end
|
65
|
+
end
|
66
|
+
end
|