infoboxer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.dokaz +1 -0
- data/.yardopts +1 -0
- data/LICENSE.txt +22 -0
- data/Parsing.md +33 -0
- data/README.md +115 -0
- data/examples/output/.gitkeep +0 -0
- data/examples/pages/argentina.wiki +808 -0
- data/examples/to_text.rb +8 -0
- data/examples/tree.rb +8 -0
- data/infoboxer.gemspec +43 -0
- data/lib/infoboxer.rb +196 -0
- data/lib/infoboxer/core_ext.rb +10 -0
- data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
- data/lib/infoboxer/media_wiki.rb +162 -0
- data/lib/infoboxer/media_wiki/page.rb +38 -0
- data/lib/infoboxer/media_wiki/traits.rb +60 -0
- data/lib/infoboxer/navigation.rb +84 -0
- data/lib/infoboxer/navigation/lookup.rb +216 -0
- data/lib/infoboxer/navigation/sections.rb +179 -0
- data/lib/infoboxer/navigation/selector.rb +59 -0
- data/lib/infoboxer/navigation/shortcuts.rb +165 -0
- data/lib/infoboxer/parser.rb +71 -0
- data/lib/infoboxer/parser/context.rb +165 -0
- data/lib/infoboxer/parser/html.rb +58 -0
- data/lib/infoboxer/parser/image.rb +59 -0
- data/lib/infoboxer/parser/inline.rb +142 -0
- data/lib/infoboxer/parser/paragraphs.rb +66 -0
- data/lib/infoboxer/parser/table.rb +132 -0
- data/lib/infoboxer/parser/template.rb +47 -0
- data/lib/infoboxer/parser/util.rb +73 -0
- data/lib/infoboxer/templates.rb +10 -0
- data/lib/infoboxer/templates/base.rb +82 -0
- data/lib/infoboxer/templates/set.rb +72 -0
- data/lib/infoboxer/tree.rb +70 -0
- data/lib/infoboxer/tree/compound.rb +81 -0
- data/lib/infoboxer/tree/document.rb +11 -0
- data/lib/infoboxer/tree/html.rb +76 -0
- data/lib/infoboxer/tree/image.rb +53 -0
- data/lib/infoboxer/tree/inline.rb +39 -0
- data/lib/infoboxer/tree/list.rb +160 -0
- data/lib/infoboxer/tree/node.rb +181 -0
- data/lib/infoboxer/tree/nodes.rb +185 -0
- data/lib/infoboxer/tree/paragraphs.rb +122 -0
- data/lib/infoboxer/tree/ref.rb +34 -0
- data/lib/infoboxer/tree/table.rb +89 -0
- data/lib/infoboxer/tree/template.rb +82 -0
- data/lib/infoboxer/tree/text.rb +60 -0
- data/lib/infoboxer/tree/wikilink.rb +83 -0
- data/lib/infoboxer/version.rb +4 -0
- data/profile/out/.gitkeep +0 -0
- data/profile/pages/argentina.txt +808 -0
- data/profile/pages/canada.wiki +544 -0
- data/profile/pages/ukraine.wiki +1006 -0
- data/profile/pages/usa.wiki +843 -0
- data/regression/pages/canada.wiki +544 -0
- data/regression/pages/chiang_mai.wiki +2615 -0
- data/regression/pages/south_america.wiki +640 -0
- data/regression/pages/ukraine.wiki +1006 -0
- data/regression/pages/usa.wiki +843 -0
- metadata +272 -0
@@ -0,0 +1,165 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
class Context
  # Line-oriented cursor over wiki markup.
  #
  # The text is split into lines; exactly one line at a time is loaded into
  # a StringScanner. The parser moves inside the line with the scan/check/
  # skip helpers and between lines with next!/prev!.

  # Zero-based index of the line currently loaded (may run past the last line).
  attr_reader :lineno
  # Traits object passed to the constructor, or MediaWiki::Traits.default.
  attr_reader :traits

  # text   - String with the whole source to parse.
  # traits - optional traits object; falls back to MediaWiki::Traits.default.
  def initialize(text, traits = nil)
    # `.*?` (not `.+?`) so that an empty comment `<!---->` is removed on its
    # own instead of swallowing everything up to the next `-->`.
    # FIXME: will also kill comments inside <nowiki> tag
    without_comments = text.gsub(/<!--.*?-->/m, '')

    # Treat "\r\n" as ONE line break; splitting on each character separately
    # would produce a phantom empty line for every CRLF.
    @lines = without_comments.split(/\r\n|\r|\n/)
    @lineno = -1
    @traits = traits || MediaWiki::Traits.default
    @scanner = StringScanner.new('')
    next!
  end

  # Lines after the current one; nil once the cursor is beyond the last line.
  attr_reader :next_lines

  # Position inside the current line, 0 when no line is loaded.
  def colno
    @scanner && @scanner.pos || 0
  end

  # Text matched by the last scanning call (memoized until the next one).
  def matched
    @matched ||= @scanner && @scanner.matched
  end

  # check which works only once: returns true if the last match equals +str+
  # and then replaces the remembered match, so a second call returns false.
  def eat_matched?(str)
    return false unless matched == str
    @matched = 'DUMMY'
    true
  end

  # Unscanned remainder of the current line (memoized); nil at end of input.
  def rest
    @rest ||= @scanner && @scanner.rest
  end

  alias_method :current, :rest

  # lines navigation

  # Loads the next line.
  def next!
    shift(+1)
  end

  # Loads the previous line.
  def prev!
    shift(-1)
  end

  # True when the cursor is past the last line, or sits at the end of the
  # last line.
  def eof?
    return true unless next_lines # we are after the file end
    next_lines.empty? && eol?
  end

  def inspect
    "#<Context(line #{lineno} of #{@lines.count}: #{current})>"
  end

  # scanning

  # The three methods below delegate to the StringScanner method of the same
  # name and invalidate the memoized #matched / #rest.
  def scan(re)
    drop_memo(@scanner.scan(re))
  end

  def check(re)
    drop_memo(@scanner.check(re))
  end

  def skip(re)
    drop_memo(@scanner.skip(re))
  end

  # Scans the current line up to and including the first match of +re+.
  #
  # re            - Regexp to look for.
  # leave_pattern - when true, the matched text is kept at the end of the
  #                 result; otherwise it is cut off.
  #
  # Returns the scanned String, or nil when +re+ is not found on this line.
  # Raises ParsingError when no line is loaded (end of input).
  def scan_until(re, leave_pattern = false)
    guard_eof!

    res = _scan_until(re)
    return res if res.nil? || leave_pattern

    # The match is always the tail of the scanned text. Cut it by length:
    # deleting the first *occurrence* of the matched string removes the wrong
    # characters when the same text appears earlier (e.g. /b$/ on "abab").
    res[0, res.length - matched.length]
  end

  # True at the end of the line, or when the rest of the line starts with
  # "</ref>" or "}}" and that terminator does not match +exclude+.
  def inline_eol?(exclude = nil)
    # not using StringScanner#check, as it will change #matched value
    return true if eol?

    current =~ %r[^(</ref>|}})] &&
      (!exclude || $1 !~ exclude) # FIXME: ugly, but no idea of prettier solution
  end

  # Like #scan_until, but continues over following lines (joined with "\n")
  # until +re+ is found.
  #
  # Raises ParsingError when the input ends before +re+ is found.
  def scan_continued_until(re, leave_pattern = false)
    guard_eof! # same guard as #scan_until, instead of NoMethodError on nil

    res = ''

    loop do
      chunk = _scan_until(re)
      case matched
      when re
        res << chunk
        break
      when nil
        # nothing on this line: keep its remainder and try the next one
        res << rest << "\n"
        next!
        eof? && fail!("Unfinished scan: #{re} not found")
      end
    end

    res[/#{re}\Z/] = '' unless leave_pattern
    res
  end

  # state inspection

  # Without a pattern: true when the last match was empty and the line is
  # finished. With a pattern: whether the last match matches it.
  def matched_inline?(re)
    re.nil? ? (matched.empty? && eol?) : matched =~ re
  end

  # Whether the last match matches +re+ (falsy when +re+ is nil).
  def matched?(re)
    re && matched =~ re
  end

  # True when nothing is left on the current line (or no line is loaded).
  def eol?
    !current || current.empty?
  end

  # basic services

  # Raises ParsingError with the line number and the rest of the line appended.
  def fail!(text)
    fail(ParsingError, "#{text} at line #{@lineno}:\n\t#{current}")
  end

  private

  # Forgets the memoized #matched / #rest and passes +result+ through.
  # we do hard use of #matched and #rest, its wiser to memoize them
  def drop_memo(result)
    @matched = nil
    @rest = nil
    result
  end

  def _scan_until(re)
    drop_memo(@scanner.scan_until(re))
  end

  def guard_eof!
    fail!("End of input reached") unless @scanner
  end

  # Moves the cursor by +amount+ lines and reloads the scanner.
  def shift(amount)
    @lineno += amount
    line = @lines[lineno]
    @next_lines = @lines[(lineno + 1)..-1]
    @matched = nil
    @rest = line

    if line
      # The scanner is dropped once we run off the end (see below); bring it
      # back so that stepping back onto a real line (prev!) works.
      @scanner ||= StringScanner.new('')
      @scanner.string = line
    else
      # nil scanner is the "no line loaded" marker used by #colno, #matched,
      # #rest and #guard_eof!
      @scanner = nil
    end
    nil
  end
end
|
164
|
+
end
|
165
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
module HTML
  include Tree

  # Parses an HTML-like tag at the current scanner position.
  # (inline_formatting calls this after a "<" has been matched.)
  #
  # Returns the parsed node(s), or nil when the text is not an HTML tag at all.
  def html
    if @context.check(/\/[a-z]+>/)
      html_closing_tag
    elsif @context.check(/br\s*>/)
      html_br
    elsif @context.check(%r{[a-z]+[^/>]*/>})
      html_auto_closing_tag
    elsif @context.check(/[a-z]+[^>\/]*>/)
      html_opening_tag
    end
  end

  # "/name>" -> HTMLClosingTag
  def html_closing_tag
    @context.skip(/\//)
    name = @context.scan(/[a-z]+/)
    @context.skip(/>/)
    HTMLClosingTag.new(name)
  end

  # "br>" (optionally with spaces before ">") -> attribute-less "br" tag
  def html_br
    @context.skip(/br\s*>/)
    HTMLTag.new('br', {})
  end

  # "name attrs/>" -> HTMLTag without children
  def html_auto_closing_tag
    name = @context.scan(/[a-z]+/)
    raw_attrs = @context.scan(%r{[^/>]*})
    @context.skip(%r{/>})
    HTMLTag.new(name, parse_params(raw_attrs))
  end

  # "name attrs>" followed by inline content.
  #
  # Returns an HTMLTag wrapping the content when the matching closing tag
  # was found by short_inline; otherwise an Array of a standalone
  # HTMLOpeningTag followed by the content nodes.
  def html_opening_tag
    name = @context.scan(/[a-z]+/)
    raw_attrs = @context.scan(/[^>]+/)
    @context.skip(/>/)

    body = short_inline(/<\/#{name}>/)
    closed = @context.matched =~ /<\/#{name}>/
    return HTMLTag.new(name, parse_params(raw_attrs), body) if closed

    [HTMLOpeningTag.new(name, parse_params(raw_attrs)), *body]
  end
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
module Image
  include Tree

  # Parses an image link: file prefix, path, then optional "|"-separated
  # attributes up to the closing "]]".
  #
  # Raises (via @context.fail!) when the file prefix is not at the cursor.
  def image
    unless @context.skip(re.file_prefix)
      @context.fail!("Something went wrong: it's not image?")
    end

    path = @context.scan_until(/\||\]\]/)
    attrs = @context.matched == '|' ? image_attrs : {}
    Tree::Image.new(path, attrs)
  end

  # Reads "|"-separated attribute chunks until "]]" and merges them into
  # one Hash, dropping nil and empty values.
  def image_attrs
    chunks = []

    loop do
      chunks << long_inline(/\||\]\]/)
      break if @context.matched == ']]'
    end

    merged = chunks.map { |chunk| image_attr(chunk) }.inject(&:merge)
    merged.reject { |_name, value| value.nil? || value.empty? }
  end

  # Converts one attribute chunk into a single-purpose Hash.
  # A chunk that is exactly one Text node is matched against the known
  # keywords; anything else is treated as the caption.
  def image_attr(nodes)
    # it's caption, and can have inline markup!
    return {caption: nodes} unless nodes.count == 1 && nodes.first.is_a?(Text)

    str = nodes.first.text
    case str
    when /^(thumb)(?:nail)?$/, /^(frame)(?:d)?$/
      {type: Regexp.last_match(1)}
    when 'frameless'
      {type: str}
    when 'border'
      {border: str}
    when /^(baseline|middle|sub|super|text-top|text-bottom|top|bottom)$/
      {alignment: str}
    when /^(\d*)(?:x(\d+))?px$/
      {width: Regexp.last_match(1), height: Regexp.last_match(2)}
    when /^link=(.*)$/i
      {link: Regexp.last_match(1)}
    when /^alt=(.*)$/i
      {alt: Regexp.last_match(1)}
    else
      # text-only caption
      {caption: nodes}
    end
  end
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
module Inline
  include Tree

  # Collects inline nodes, continuing over line ends (a "\n" node is added
  # for every line break crossed).
  #
  # until_pattern - Regexp that ends the run; without it the run ends on an
  #                 empty match at end of line, or at end of input.
  #
  # Fails (via @context.fail!) when the input ends before until_pattern.
  def inline(until_pattern = nil)
    first_line = @context.lineno
    collected = Nodes[]
    guarded_loop do
      collected << @context.scan_until(re.inline_until_cache[until_pattern])

      break if @context.matched_inline?(until_pattern)

      token = @context.matched
      collected << inline_formatting(token) unless token.empty?

      if @context.eof?
        break unless until_pattern
        @context.fail!("#{until_pattern} not found, starting from #{first_line}")
      end

      if @context.eol?
        collected << "\n"
        @context.next!
      end
    end

    collected
  end

  # Collects inline nodes within the current line only: stops on
  # until_pattern or as soon as @context.inline_eol? reports the end.
  def short_inline(until_pattern = nil)
    collected = Nodes[]
    guarded_loop do
      collected << @context.scan_until(re.short_inline_until_cache[until_pattern])

      break if @context.matched_inline?(until_pattern)

      collected << inline_formatting(@context.matched)

      break if @context.inline_eol?(until_pattern)
    end

    collected
  end

  # Collects inline nodes from the current line; when the line ends before
  # until_pattern, the following lines are parsed as paragraphs and appended.
  #
  # Fails (via @context.fail!) when the input ends before until_pattern.
  def long_inline(until_pattern = nil)
    collected = Nodes[]
    guarded_loop do
      collected << @context.scan_until(re.inline_until_cache[until_pattern])

      break if @context.matched?(until_pattern)

      token = @context.matched
      collected << inline_formatting(token) unless token.empty?

      if @context.eof?
        break unless until_pattern
        @context.fail!("#{until_pattern} not found")
      end

      if @context.eol?
        @context.next!
        paragraphs(until_pattern).each { |para| collected << para }
        break
      end
    end

    collected
  end

  private

  # Turns the markup token that was just matched into a node by parsing
  # whatever follows it. Unknown tokens are returned unchanged.
  def inline_formatting(match)
    case match
    when "'''''" then BoldItalic.new(short_inline(/'''''/))
    when "'''"   then Bold.new(short_inline(/'''/))
    when "''"    then Italic.new(short_inline(/''/))
    when '[['
      @context.check(re.file_prefix) ? image : wikilink
    when /\[(.+)/
      external_link(Regexp.last_match(1))
    when '{{'
      template
    when /<nowiki([^>]*)>/
      nowiki
    when /<ref([^>]*)\/>/
      reference(Regexp.last_match(1), true)
    when /<ref([^>]*)>/
      reference(Regexp.last_match(1))
    when '<'
      # it was not HTML, just accidental <
      html || Text.new(match)
    else
      match # FIXME: TEMP
    end
  end

  # http://en.wikipedia.org/wiki/Help:Link#Wikilinks
  # [[abc]]
  # [[a|b]]
  def wikilink
    target = @context.scan_continued_until(/\||\]\]/)
    label = @context.matched == '|' ? inline(/\]\]/) : nil
    Wikilink.new(target, label)
  end

  # http://en.wikipedia.org/wiki/Help:Link#External_links
  # [http://www.example.org]
  # [http://www.example.org link name]
  #
  # protocol - the part of the link captured together with the opening "[".
  def external_link(protocol)
    address = @context.scan_continued_until(/\s+|\]/)
    label = nil
    label = inline(/\]/) if @context.matched =~ /\s+/
    ExternalLink.new(protocol + address, label)
  end

  # param_str - raw attribute text of the <ref> tag.
  # closed    - true for a self-closed <ref .../>, which has no children.
  def reference(param_str, closed = false)
    children =
      if closed
        Nodes[]
      else
        long_inline(/<\/ref>/)
      end
    Ref.new(children, parse_params(param_str))
  end

  # Everything up to </nowiki> (possibly over several lines) as plain text.
  def nowiki
    raw = @context.scan_continued_until(/<\/nowiki>/)
    Text.new(raw)
  end
end
|
134
|
+
|
135
|
+
require_relative 'image'
|
136
|
+
require_relative 'html'
|
137
|
+
require_relative 'template'
|
138
|
+
include Infoboxer::Parser::Image
|
139
|
+
include Infoboxer::Parser::HTML
|
140
|
+
include Infoboxer::Parser::Template
|
141
|
+
end
|
142
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Infoboxer
|
3
|
+
class Parser
|
4
|
+
module Paragraphs
  include Tree

  # Parses block-level nodes line by line until the end of input, or until
  # the last match equals until_pattern (when one is given).
  #
  # Returns the collected nodes after Nodes#flow_templates.
  def paragraphs(until_pattern = nil)
    collected = Nodes[]
    until @context.eof?
      collected << paragraph(until_pattern)

      finished = until_pattern && @context.matched?(until_pattern)
      break if finished

      @context.next!
    end
    collected.flow_templates
  end

  private

  # Chooses the block type from the shape of the current line.
  def paragraph(until_pattern)
    line = @context.current
    case line
    when /^(?<level>={2,})\s*(?<text>.+?)\s*\k<level>$/
      found = Regexp.last_match
      heading(found[:text], found[:level])
    when /^\s*{\|/
      table
    when /^[\*\#:;]./
      list(until_pattern)
    when /^-{4,}/
      HR.new
    when /^\s*$/
      # will, when merged, close previous paragraph or add spaces to <pre>
      EmptyParagraph.new(line)
    when /^ (?!\s*{{)/ # Lookahead, because spaces before template are ignored
      pre(until_pattern)
    else
      Paragraph.new(short_inline(until_pattern))
    end
  end

  # text  - heading text between the "=" runs.
  # level - the "=" run itself; its length is passed on as the heading level.
  def heading(text, level)
    depth = level.length
    Heading.new(Parser.inline(text), depth)
  end

  # http://en.wikipedia.org/wiki/Help:List
  def list(until_pattern)
    bullets = @context.scan(/^([*\#:;]+)\s*/).strip.chars.to_a
    List.construct(bullets, short_inline(until_pattern))
  end

  # FIXME: in fact, there's some formatting, that should work inside pre
  def pre(until_pattern)
    @context.skip(/^ /)
    raw =
      if until_pattern
        @context.scan_until(/(#{until_pattern}|$)/)
      else
        @context.current
      end
    Pre.new(Nodes[Text.new(raw)])
  end

  require_relative 'table'
  include Parser::Table
end
|
65
|
+
end
|
66
|
+
end
|