infoboxer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,165 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Line-by-line cursor over wikitext source, shared by the parser mixins.
    #
    # The text is split into lines once. A single StringScanner is pointed at
    # the current line and re-targeted by {#next!} / {#prev!}. The values of
    # {#matched} and {#rest} are memoized and dropped again by every method
    # that moves the scanner.
    class Context
      # @return [Integer] zero-based index of the current line
      attr_reader :lineno
      # @return [Object] traits passed to the constructor, or
      #   MediaWiki::Traits.default when none were given
      attr_reader :traits
      # @return [Array<String>, nil] lines after the current one; nil once
      #   the cursor has moved more than one line past the last line
      attr_reader :next_lines

      # @param text [String] raw wikitext
      # @param traits [Object, nil] falls back to MediaWiki::Traits.default
      def initialize(text, traits = nil)
        # `.*?` (not `.+?`) so that an empty comment `<!---->` is removed on
        # its own instead of swallowing everything up to the next `-->`.
        # CRLF is treated as ONE line break; splitting on /[\r\n]/ produced a
        # phantom empty line after every line of a Windows-style document.
        @lines = text.
          gsub(/<!--.*?-->/m, ''). # FIXME: will also kill comments inside <nowiki> tag
          split(/\r\n|\r|\n/)
        @lineno = -1
        @traits = traits || MediaWiki::Traits.default
        @scanner = StringScanner.new('')
        next!
      end

      # @return [Integer] scanner position inside the current line, 0 when
      #   there is no current line
      def colno
        @scanner && @scanner.pos || 0
      end

      # @return [String, nil] text matched by the last scanning call (memoized)
      def matched
        @matched ||= @scanner && @scanner.matched
      end

      # check which works only once: after a successful call the remembered
      # match is replaced with a placeholder, so the same match cannot be
      # "eaten" twice.
      #
      # @param str [String]
      # @return [Boolean]
      def eat_matched?(str)
        return false unless matched == str
        @matched = 'DUMMY'
        true
      end

      # @return [String, nil] unscanned remainder of the current line (memoized)
      def rest
        @rest ||= @scanner && @scanner.rest
      end

      alias_method :current, :rest

      # lines navigation

      # Moves the cursor one line forward.
      def next!
        shift(+1)
      end

      # Moves the cursor one line back.
      def prev!
        shift(-1)
      end

      # @return [Boolean] true when the cursor is past the last line, or on
      #   the last line with nothing left to scan
      def eof?
        !next_lines || # we are after the file end
          next_lines.empty? && eol?
      end

      def inspect
        "#<Context(line #{lineno} of #{@lines.count}: #{current})>"
      end

      # scanning

      # StringScanner#scan on the current line.
      def scan(re)
        with_reset { @scanner.scan(re) }
      end

      # StringScanner#check on the current line (does not advance).
      def check(re)
        with_reset { @scanner.check(re) }
      end

      # StringScanner#skip on the current line.
      def skip(re)
        with_reset { @scanner.skip(re) }
      end

      # Scans the current line up to and including the first match of +re+.
      #
      # @param re [Regexp]
      # @param leave_pattern [Boolean] keep the matched text at the end of
      #   the result instead of cutting it off
      # @return [String, nil] scanned text, nil when +re+ does not match
      # @raise [ParsingError] when there is no current line
      def scan_until(re, leave_pattern = false)
        guard_eof!

        res = _scan_until(re)
        return res if res.nil? || leave_pattern

        # The match is always the tail of the scanned text, so cut it by
        # length. (`res[matched] = ''` removed the FIRST occurrence of the
        # matched text, which is wrong whenever the same text appears earlier
        # in a place where +re+ itself could not match, e.g. with lookbehind.)
        res[0, res.length - matched.length]
      end

      # @param exclude [Regexp, nil] closing marker that should NOT count as
      #   end of line
      # @return [Object] truthy when the line is exhausted or the rest of it
      #   starts with `</ref>` or `}}`
      def inline_eol?(exclude = nil)
        # not using StringScanner#check, as it will change #matched value
        eol? ||
          (current =~ %r[^(</ref>|}})] &&
            (!exclude || $1 !~ exclude)) # FIXME: ugly, but no idea of prettier solution
      end

      # Like {#scan_until}, but keeps going onto following lines (joined with
      # "\n") until +re+ is found.
      #
      # @param re [Regexp]
      # @param leave_pattern [Boolean] keep the matched text in the result
      # @return [String]
      # @raise [ParsingError] when the input ends before +re+ is found, or
      #   when there is no current line to start from
      def scan_continued_until(re, leave_pattern = false)
        guard_eof!
        res = ''

        loop do
          chunk = _scan_until(re)
          case matched
          when re
            res << chunk
            break
          when nil
            # nothing on this line: take all of it and go on with the next one
            res << rest << "\n"
            next!
            eof? && fail!("Unfinished scan: #{re} not found")
          end
        end

        # cut exactly what the scanner matched, see #scan_until
        res = res[0, res.length - matched.length] unless leave_pattern
        res
      end

      # state inspection

      # With a pattern: does the last match satisfy it? Without: did the last
      # scan stop on an empty match at the end of the line?
      def matched_inline?(re)
        re.nil? ? (matched.empty? && eol?) : matched =~ re
      end

      # @return [Object] truthy when +re+ is given and the last match
      #   satisfies it
      def matched?(re)
        re && matched =~ re
      end

      # @return [Boolean] true when nothing is left on the current line
      def eol?
        !current || current.empty?
      end

      # basic services

      # @raise [ParsingError] always; the message is extended with the
      #   current line number and the rest of the current line
      def fail!(text)
        fail(ParsingError, "#{text} at line #{@lineno}:\n\t#{current}")
      end

      private

      # we do hard use of #matched and #rest, its wiser to memoize them
      def _scan_until(re)
        with_reset { @scanner.scan_until(re) }
      end

      # Runs a scanner operation and drops the memoized #matched / #rest,
      # which it has just invalidated.
      def with_reset
        res = yield
        @matched = nil
        @rest = nil
        res
      end

      def guard_eof!
        #eof? and fail!("End of input reached")
        @scanner or fail!("End of input reached")
      end

      # Moves the cursor by +amount+ lines and re-targets the scanner.
      def shift(amount)
        @lineno += amount
        # A negative index must not wrap around to the end of the document.
        line = lineno >= 0 ? @lines[lineno] : nil
        @next_lines = @lines[[lineno + 1, 0].max..-1]
        if line
          # The scanner is dropped once the cursor leaves the text; bring it
          # back so that #prev! keeps working after running off the end.
          @scanner ||= StringScanner.new('')
          @scanner.string = line
        else
          @scanner = nil
        end
        @rest = line
        @matched = nil
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Parsing of raw HTML tags met inside wikitext.
    #
    # Every method here works on @context, which must be positioned right
    # after the opening `<` (that character has already been consumed by the
    # caller).
    #
    # Tag names are matched as a lowercase letter followed by lowercase
    # letters or digits. Matching letters only turned `<h2>` into a tag named
    # "h" with the attribute string "2", and `</h2>` was not recognised as a
    # closing tag at all.
    module HTML
      include Tree

      # Dispatches on the kind of tag found at the current position.
      #
      # @return [Object, nil] the parsed node(s), or nil when the text after
      #   `<` is not a tag this parser understands
      def html
        case
        when @context.check(%r{/[a-z][a-z0-9]*>})
          html_closing_tag
        when @context.check(/br\s*>/)
          html_br
        when @context.check(%r{[a-z][a-z0-9]*[^/>]*/>})
          html_auto_closing_tag
        when @context.check(%r{[a-z][a-z0-9]*[^>/]*>})
          html_opening_tag
        else
          # not an HTML tag at all!
          nil
        end
      end

      # `</tag>`
      def html_closing_tag
        @context.skip(/\//)
        tag = @context.scan(/[a-z][a-z0-9]*/)
        @context.skip(/>/)
        HTMLClosingTag.new(tag)
      end

      # `<br>` written without the closing slash
      def html_br
        @context.skip(/br\s*>/)
        HTMLTag.new('br', {})
      end

      # `<tag attrs/>`
      def html_auto_closing_tag
        tag = @context.scan(/[a-z][a-z0-9]*/)
        attrs = @context.scan(%r{[^/>]*})
        @context.skip(%r{/>})
        HTMLTag.new(tag, parse_params(attrs))
      end

      # `<tag attrs>`, followed by inline content up to the matching
      # `</tag>`.
      #
      # @return [HTMLTag, Array] a complete tag when the closing tag was
      #   found by the inline scan; otherwise an array of the lone opening
      #   tag followed by whatever content was read
      def html_opening_tag
        tag = @context.scan(/[a-z][a-z0-9]*/)
        attrs = @context.scan(/[^>]+/) # nil when the tag has no attributes
        @context.skip(/>/)
        contents = short_inline(/<\/#{tag}>/)
        if @context.matched =~ /<\/#{tag}>/
          HTMLTag.new(tag, parse_params(attrs), contents)
        else
          [
            HTMLOpeningTag.new(tag, parse_params(attrs)),
            *contents
          ]
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Parsing of image links: the file prefix, then a path, then optional
    # `|`-separated attributes, closed by `]]`.
    module Image
      include Tree

      # Parses an image link. The scanner must stand on the file prefix.
      #
      # @return [Tree::Image]
      def image
        unless @context.skip(re.file_prefix)
          @context.fail!("Something went wrong: it's not image?")
        end

        path = @context.scan_until(/\||\]\]/)
        # a `|` right after the path means attributes follow
        attrs = @context.matched == '|' ? image_attrs : {}
        Tree::Image.new(path, attrs)
      end

      # Reads the `|`-separated attributes up to the closing `]]`.
      #
      # @return [Hash] merged attributes, nil and empty values dropped; on
      #   a repeated key the later value wins
      def image_attrs
        pieces = []

        loop do
          pieces << long_inline(/\||\]\]/)
          break if @context.matched == ']]'
        end

        merged = pieces.map { |piece| image_attr(piece) }.inject(&:merge)
        merged.reject { |_key, value| value.nil? || value.empty? }
      end

      # Interprets one attribute.
      #
      # @param nodes [Nodes] parsed content of the attribute
      # @return [Hash] a single recognised option, or `{caption: nodes}`
      def image_attr(nodes)
        # anything other than one bare text node is a caption, and can have
        # inline markup!
        return {caption: nodes} unless nodes.count == 1 && nodes.first.is_a?(Text)

        str = nodes.first.text
        case str
        when /^(thumb)(?:nail)?$/, /^(frame)(?:d)?$/
          {type: Regexp.last_match(1)}
        when 'frameless'
          {type: str}
        when 'border'
          {border: str}
        when /^(baseline|middle|sub|super|text-top|text-bottom|top|bottom)$/
          {alignment: str}
        when /^(\d*)(?:x(\d+))?px$/
          {width: Regexp.last_match(1), height: Regexp.last_match(2)}
        when /^link=(.*)$/i
          {link: Regexp.last_match(1)}
        when /^alt=(.*)$/i
          {alt: Regexp.last_match(1)}
        else # text-only caption
          {caption: nodes}
        end
      end
    end
  end
end
@@ -0,0 +1,142 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Inline-level parsing: text, bold/italic, links, templates, references
    # and HTML.
    #
    # The three public entry points share one loop shape (scan up to the
    # next interesting token, stop on the terminator, otherwise parse the
    # token) and differ in what they do at the end of a line.
    module Inline
      include Tree

      # Inline content that may span lines; line breaks are kept as "\n".
      #
      # @param until_pattern [Regexp, nil] terminator; without it parsing
      #   stops at the end of the current line
      # @return [Nodes]
      def inline(until_pattern = nil)
        first_line = @context.lineno
        result = Nodes[]
        guarded_loop do
          result << @context.scan_until(re.inline_until_cache[until_pattern])

          break if @context.matched_inline?(until_pattern)

          token = @context.matched
          result << inline_formatting(token) unless token.empty?

          if @context.eof?
            break unless until_pattern
            @context.fail!("#{until_pattern} not found, starting from #{first_line}")
          end

          if @context.eol?
            result << "\n"
            @context.next!
          end
        end

        result
      end

      # Inline content limited to the current line.
      #
      # @param until_pattern [Regexp, nil] terminator
      # @return [Nodes]
      def short_inline(until_pattern = nil)
        result = Nodes[]
        guarded_loop do
          result << @context.scan_until(re.short_inline_until_cache[until_pattern])

          break if @context.matched_inline?(until_pattern)

          result << inline_formatting(@context.matched)

          break if @context.inline_eol?(until_pattern)
        end

        result
      end

      # Inline content which, once its first line is exhausted, continues as
      # whole paragraphs.
      #
      # @param until_pattern [Regexp, nil] terminator
      # @return [Nodes]
      def long_inline(until_pattern = nil)
        result = Nodes[]
        guarded_loop do
          result << @context.scan_until(re.inline_until_cache[until_pattern])

          break if @context.matched?(until_pattern)

          token = @context.matched
          result << inline_formatting(token) unless token.empty?

          if @context.eof?
            break unless until_pattern
            @context.fail!("#{until_pattern} not found")
          end

          if @context.eol?
            @context.next!
            paragraphs(until_pattern).each { |para| result << para }
            break
          end
        end

        result
      end

      private

      # Turns the token that stopped the scan into a node. The order of the
      # branches matters: longer quote runs and the self-closed `<ref/>` have
      # to be tried before their shorter or more general counterparts.
      def inline_formatting(match)
        case match
        when "'''''" then BoldItalic.new(short_inline(/'''''/))
        when "'''"   then Bold.new(short_inline(/'''/))
        when "''"    then Italic.new(short_inline(/''/))
        when '[['
          if @context.check(re.file_prefix)
            image
          else
            wikilink
          end
        when /\[(.+)/ then external_link(Regexp.last_match(1))
        when '{{' then template
        when /<nowiki([^>]*)>/ then nowiki
        when /<ref([^>]*)\/>/ then reference(Regexp.last_match(1), true)
        when /<ref([^>]*)>/ then reference(Regexp.last_match(1))
        when '<' then html || Text.new(match) # it was not HTML, just accidental <
        else match # FIXME: TEMP
        end
      end

      # http://en.wikipedia.org/wiki/Help:Link#Wikilinks
      # [[abc]]
      # [[a|b]]
      def wikilink
        target = @context.scan_continued_until(/\||\]\]/)
        # a caption is present only when the target ended on `|`
        caption = @context.matched == '|' ? inline(/\]\]/) : nil
        Wikilink.new(target, caption)
      end

      # http://en.wikipedia.org/wiki/Help:Link#External_links
      # [http://www.example.org]
      # [http://www.example.org link name]
      def external_link(protocol)
        address = @context.scan_continued_until(/\s+|\]/)
        # whitespace after the address starts the caption
        caption = @context.matched =~ /\s+/ ? inline(/\]/) : nil
        ExternalLink.new(protocol + address, caption)
      end

      # @param param_str [String] raw attributes of the <ref> tag
      # @param closed [Boolean] true for the self-closed form, which has no
      #   content
      def reference(param_str, closed = false)
        children = if closed
                     Nodes[]
                   else
                     long_inline(/<\/ref>/)
                   end
        Ref.new(children, parse_params(param_str))
      end

      # Everything up to </nowiki> is taken as plain text.
      def nowiki
        Text.new(@context.scan_continued_until(/<\/nowiki>/))
      end
    end

    require_relative 'image'
    require_relative 'html'
    require_relative 'template'
    include Infoboxer::Parser::Image
    include Infoboxer::Parser::HTML
    include Infoboxer::Parser::Template
  end
end
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Block-level parsing: each source line is classified by its beginning
    # and handed to the matching paragraph-level parser.
    module Paragraphs
      include Tree

      # Parses paragraphs until the input ends or +until_pattern+ has been
      # matched.
      #
      # @param until_pattern [Regexp, nil] optional terminator
      # @return [Nodes] result of Nodes#flow_templates on the parsed nodes
      def paragraphs(until_pattern = nil)
        collected = Nodes[]
        until @context.eof?
          collected << paragraph(until_pattern)

          # stop without advancing: the terminator was consumed on this line
          break if until_pattern && @context.matched?(until_pattern)

          @context.next!
        end
        collected.flow_templates
      end

      private

      # Parses one paragraph-level node starting at the current line.
      def paragraph(until_pattern)
        line = @context.current
        case line
        when /^(?<level>={2,})\s*(?<text>.+?)\s*\k<level>$/
          found = Regexp.last_match
          heading(found[:text], found[:level])
        when /^\s*{\|/ then table
        when /^[\*\#:;]./ then list(until_pattern)
        when /^-{4,}/ then HR.new
        when /^\s*$/
          # will, when merged, close previous paragraph or add spaces to <pre>
          EmptyParagraph.new(line)
        when /^ (?!\s*{{)/ # Lookahead, because spaces before template are ignored
          pre(until_pattern)
        else
          Paragraph.new(short_inline(until_pattern))
        end
      end

      # @param text [String] heading text between the `=` runs
      # @param level [String] the run of `=` characters; its length becomes
      #   the heading level
      def heading(text, level)
        title = Parser.inline(text)
        Heading.new(title, level.length)
      end

      # http://en.wikipedia.org/wiki/Help:List
      def list(until_pattern)
        prefix = @context.scan(/^([*\#:;]+)\s*/)
        # one marker character per nesting level
        List.construct(prefix.strip.chars.to_a, short_inline(until_pattern))
      end

      # FIXME: in fact, there's some formatting, that should work inside pre
      def pre(until_pattern)
        @context.skip(/^ /)
        content = until_pattern ? @context.scan_until(/(#{until_pattern}|$)/) : @context.current
        Pre.new(Nodes[Text.new(content)])
      end

      require_relative 'table'
      include Parser::Table
    end
  end
end