infoboxer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/.dokaz +1 -0
  3. data/.yardopts +1 -0
  4. data/LICENSE.txt +22 -0
  5. data/Parsing.md +33 -0
  6. data/README.md +115 -0
  7. data/examples/output/.gitkeep +0 -0
  8. data/examples/pages/argentina.wiki +808 -0
  9. data/examples/to_text.rb +8 -0
  10. data/examples/tree.rb +8 -0
  11. data/infoboxer.gemspec +43 -0
  12. data/lib/infoboxer.rb +196 -0
  13. data/lib/infoboxer/core_ext.rb +10 -0
  14. data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
  15. data/lib/infoboxer/media_wiki.rb +162 -0
  16. data/lib/infoboxer/media_wiki/page.rb +38 -0
  17. data/lib/infoboxer/media_wiki/traits.rb +60 -0
  18. data/lib/infoboxer/navigation.rb +84 -0
  19. data/lib/infoboxer/navigation/lookup.rb +216 -0
  20. data/lib/infoboxer/navigation/sections.rb +179 -0
  21. data/lib/infoboxer/navigation/selector.rb +59 -0
  22. data/lib/infoboxer/navigation/shortcuts.rb +165 -0
  23. data/lib/infoboxer/parser.rb +71 -0
  24. data/lib/infoboxer/parser/context.rb +165 -0
  25. data/lib/infoboxer/parser/html.rb +58 -0
  26. data/lib/infoboxer/parser/image.rb +59 -0
  27. data/lib/infoboxer/parser/inline.rb +142 -0
  28. data/lib/infoboxer/parser/paragraphs.rb +66 -0
  29. data/lib/infoboxer/parser/table.rb +132 -0
  30. data/lib/infoboxer/parser/template.rb +47 -0
  31. data/lib/infoboxer/parser/util.rb +73 -0
  32. data/lib/infoboxer/templates.rb +10 -0
  33. data/lib/infoboxer/templates/base.rb +82 -0
  34. data/lib/infoboxer/templates/set.rb +72 -0
  35. data/lib/infoboxer/tree.rb +70 -0
  36. data/lib/infoboxer/tree/compound.rb +81 -0
  37. data/lib/infoboxer/tree/document.rb +11 -0
  38. data/lib/infoboxer/tree/html.rb +76 -0
  39. data/lib/infoboxer/tree/image.rb +53 -0
  40. data/lib/infoboxer/tree/inline.rb +39 -0
  41. data/lib/infoboxer/tree/list.rb +160 -0
  42. data/lib/infoboxer/tree/node.rb +181 -0
  43. data/lib/infoboxer/tree/nodes.rb +185 -0
  44. data/lib/infoboxer/tree/paragraphs.rb +122 -0
  45. data/lib/infoboxer/tree/ref.rb +34 -0
  46. data/lib/infoboxer/tree/table.rb +89 -0
  47. data/lib/infoboxer/tree/template.rb +82 -0
  48. data/lib/infoboxer/tree/text.rb +60 -0
  49. data/lib/infoboxer/tree/wikilink.rb +83 -0
  50. data/lib/infoboxer/version.rb +4 -0
  51. data/profile/out/.gitkeep +0 -0
  52. data/profile/pages/argentina.txt +808 -0
  53. data/profile/pages/canada.wiki +544 -0
  54. data/profile/pages/ukraine.wiki +1006 -0
  55. data/profile/pages/usa.wiki +843 -0
  56. data/regression/pages/canada.wiki +544 -0
  57. data/regression/pages/chiang_mai.wiki +2615 -0
  58. data/regression/pages/south_america.wiki +640 -0
  59. data/regression/pages/ukraine.wiki +1006 -0
  60. data/regression/pages/usa.wiki +843 -0
  61. metadata +272 -0
@@ -0,0 +1,165 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Line-oriented cursor over a wiki source text.
    #
    # The text is stripped of HTML comments, split into lines, and exposed
    # one line at a time through a StringScanner. `#matched` and `#rest`
    # are memoized because the parser reads them very often; every scanning
    # method resets that memo.
    class Context
      attr_reader :lineno
      attr_reader :traits
      attr_reader :next_lines

      # @param text [String] raw source text
      # @param traits [Object, nil] site traits; falls back to
      #   MediaWiki::Traits.default when nil
      def initialize(text, traits = nil)
        # `.*?` (not `.+?`) so an empty comment `<!---->` is removed on its
        # own instead of swallowing everything up to the next `-->`.
        # Splitting on `\r\n` first keeps CRLF input from producing a
        # spurious empty line after every real line.
        @lines = text.
          gsub(/<!--.*?-->/m, ''). # FIXME: will also kill comments inside <nowiki> tag
          split(/\r\n|\r|\n/)
        @lineno = -1
        @traits = traits || MediaWiki::Traits.default
        @scanner = StringScanner.new('')
        next!
      end

      # Scanner position inside the current line (0 when there is no line).
      def colno
        @scanner && @scanner.pos || 0
      end

      # Text matched by the last scanning operation (memoized).
      def matched
        @matched ||= @scanner && @scanner.matched
      end

      # check which works only once
      def eat_matched?(str)
        return false unless matched == str
        @matched = 'DUMMY'
        true
      end

      # Unscanned remainder of the current line (memoized); nil past the end.
      def rest
        @rest ||= @scanner && @scanner.rest
      end

      alias_method :current, :rest

      # lines navigation
      def next!
        shift(+1)
      end

      def prev!
        shift(-1)
      end

      def eof?
        !next_lines || # we are after the file end
          next_lines.empty? && eol?
      end

      def inspect
        "#<Context(line #{lineno} of #{@lines.count}: #{current})>"
      end

      # scanning
      #
      # scan/check/skip return nil (instead of raising NoMethodError) when
      # the cursor is outside the text and there is no scanner.
      def scan(re)
        res = @scanner && @scanner.scan(re)
        reset_memo
        res
      end

      def check(re)
        res = @scanner && @scanner.check(re)
        reset_memo
        res
      end

      def skip(re)
        res = @scanner && @scanner.skip(re)
        reset_memo
        res
      end

      # Scans the current line up to and including +re+.
      #
      # @param re [Regexp]
      # @param leave_pattern [Boolean] keep the matched text at the end of
      #   the result when true
      # @return [String, nil] scanned text, or nil when +re+ is not found
      # @raise [ParsingError] when there is no current line
      def scan_until(re, leave_pattern = false)
        guard_eof!

        res = _scan_until(re)
        return res if res.nil? || leave_pattern

        # Cut exactly the trailing match. Removing the first occurrence of
        # the matched string is wrong for anchored/look-around patterns
        # (e.g. /x$/ on "axbx").
        res[0, res.length - matched.length]
      end

      def inline_eol?(exclude = nil)
        # not using StringScanner#check, as it will change #matched value
        eol? ||
          (current =~ %r[^(</ref>|}})] &&
            (!exclude || $1 !~ exclude)) # FIXME: ugly, but no idea of prettier solution
      end

      # Like #scan_until, but keeps going over following lines (joined
      # with "\n") until +re+ is found.
      #
      # @raise [ParsingError] when the input ends before +re+ is found
      def scan_continued_until(re, leave_pattern = false)
        guard_eof!
        res = ''.dup

        loop do
          chunk = _scan_until(re)
          if chunk
            # Decide on the scanner's own result rather than re-matching
            # `matched` against +re+: a pattern with look-behind matches in
            # context but not against the bare matched string, which used
            # to spin forever.
            chunk = chunk[0, chunk.length - matched.length] unless leave_pattern
            res << chunk
            break
          end

          res << rest << "\n"
          next!
          eof? && fail!("Unfinished scan: #{re} not found")
        end

        res
      end

      # state inspection
      def matched_inline?(re)
        re.nil? ? (matched.empty? && eol?) : matched =~ re
      end

      def matched?(re)
        re && matched =~ re
      end

      def eol?
        !current || current.empty?
      end

      # basic services
      def fail!(text)
        fail(ParsingError, "#{text} at line #{@lineno}:\n\t#{current}")
      end

      private

      # we do hard use of #matched and #rest, its wiser to memoize them
      def _scan_until(re)
        res = @scanner.scan_until(re)
        reset_memo
        res
      end

      # Drops memoized #matched/#rest after the scanner state has changed.
      def reset_memo
        @matched = nil
        @rest = nil
      end

      def guard_eof!
        #eof? and fail!("End of input reached")
        @scanner or fail!("End of input reached")
      end

      # Moves the cursor by +amount+ lines and re-targets the scanner.
      def shift(amount)
        @lineno += amount
        # A negative index must not wrap around to the last line.
        current = lineno >= 0 ? @lines[lineno] : nil
        @next_lines = @lines[[lineno + 1, 0].max..-1]
        @matched = nil
        if current
          # The scanner is dropped when we run off the text; recreate it
          # so that stepping back (prev!) works again.
          @scanner ||= StringScanner.new('')
          @scanner.string = current
          @rest = current
        else
          @scanner = nil
          @rest = nil
        end
        nil
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Parsing of HTML tags met inside wiki markup.
    #
    # All methods expect @context to be positioned right after the opening
    # `<` of a would-be tag. Node classes (HTMLTag, HTMLOpeningTag,
    # HTMLClosingTag) are resolved through the included Tree module.
    #
    # Tag names are matched as `[a-z][a-z0-9]*`, so that tags with digits
    # (`<h2>`, `</h2>`) are recognised as a whole rather than as tag `h`
    # followed by the attribute string `2`.
    module HTML
      include Tree

      # Dispatches on the kind of tag at the cursor.
      #
      # @return the parsed node(s), or nil when the text after `<` does not
      #   look like an HTML tag at all
      def html
        case
        when @context.check(/\/[a-z][a-z0-9]*>/)
          html_closing_tag
        when @context.check(/br\s*>/)
          html_br
        when @context.check(%r{[a-z][a-z0-9]*[^/>]*/>})
          html_auto_closing_tag
        when @context.check(/[a-z][a-z0-9]*[^>\/]*>/)
          html_opening_tag
        else
          # not an HTML tag at all!
          nil
        end
      end

      # Parses `/name>` into a closing-tag node.
      def html_closing_tag
        @context.skip(/\//)
        tag = @context.scan(/[a-z][a-z0-9]*/)
        @context.skip(/>/)
        HTMLClosingTag.new(tag)
      end

      # Parses `br>` (with optional whitespace) into an attribute-less tag.
      def html_br
        @context.skip(/br\s*>/)
        HTMLTag.new('br', {})
      end

      # Parses `name attrs/>`.
      def html_auto_closing_tag
        tag = @context.scan(/[a-z][a-z0-9]*/)
        attrs = @context.scan(%r{[^/>]*})
        @context.skip(%r{/>})
        HTMLTag.new(tag, parse_params(attrs))
      end

      # Parses `name attrs>` and the inline content that follows it.
      #
      # @return an HTMLTag with children when the matching closing tag was
      #   found by short_inline; otherwise an array of an HTMLOpeningTag
      #   followed by the parsed content nodes
      def html_opening_tag
        tag = @context.scan(/[a-z][a-z0-9]*/)
        # `*` rather than `+`: an attribute-less tag yields '' (as in
        # html_auto_closing_tag) instead of nil.
        attrs = @context.scan(/[^>]*/)
        @context.skip(/>/)

        closing = /<\/#{tag}>/
        contents = short_inline(closing)
        closed = @context.matched =~ closing
        params = parse_params(attrs)

        if closed
          HTMLTag.new(tag, params, contents)
        else
          [HTMLOpeningTag.new(tag, params), *contents]
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Parsing of image markup: a file prefix, a path, and optional
    # `|`-separated attributes, terminated by `]]`.
    module Image
      include Tree

      # Parses an image starting at the file prefix.
      #
      # Fails through @context.fail! when the cursor is not at a file
      # prefix. Attributes are parsed only when the path was terminated
      # by `|`.
      def image
        unless @context.skip(re.file_prefix)
          @context.fail!("Something went wrong: it's not image?")
        end

        path = @context.scan_until(/\||\]\]/)
        attrs = @context.matched == '|' ? image_attrs : {}
        Tree::Image.new(path, attrs)
      end

      # Reads `|`-separated attribute chunks up to the closing `]]` and
      # merges them into one hash, dropping nil/empty values.
      def image_attrs
        chunks = [long_inline(/\||\]\]/)]
        chunks << long_inline(/\||\]\]/) until @context.matched == ']]'

        merged = chunks.map { |chunk| image_attr(chunk) }.inject(&:merge)
        merged.reject { |_name, value| value.nil? || value.empty? }
      end

      # Classifies a single attribute chunk.
      #
      # Anything that is not exactly one Text node is treated as a caption
      # (it can have inline markup); a lone Text node is matched against
      # the known option spellings and falls back to caption as well.
      def image_attr(nodes)
        single_text = nodes.count == 1 && nodes.first.is_a?(Text)
        return {caption: nodes} unless single_text

        str = nodes.first.text
        case str
        when /^(thumb)(?:nail)?$/, /^(frame)(?:d)?$/
          {type: Regexp.last_match(1)}
        when 'frameless'
          {type: str}
        when 'border'
          {border: str}
        when /^(baseline|middle|sub|super|text-top|text-bottom|top|bottom)$/
          {alignment: str}
        when /^(\d*)(?:x(\d+))?px$/
          {width: Regexp.last_match(1), height: Regexp.last_match(2)}
        when /^link=(.*)$/i
          {link: Regexp.last_match(1)}
        when /^alt=(.*)$/i
          {alt: Regexp.last_match(1)}
        else # text-only caption
          {caption: nodes}
        end
      end
    end
  end
end
@@ -0,0 +1,142 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Parsing of inline markup (formatting, links, templates, refs, HTML).
    #
    # The three public entry points differ in how they treat line ends:
    # #inline continues over lines, #short_inline stops at the end of the
    # line, #long_inline switches to paragraph parsing on a line end.
    module Inline
      include Tree

      # Parses inline content, continuing onto following lines (a "\n" is
      # added to the result for each line break) until +until_pattern+ is
      # matched. Without a pattern, stops according to
      # Context#matched_inline?(nil) or at end of input.
      def inline(until_pattern = nil)
        start = @context.lineno
        result = Nodes[]

        guarded_loop do
          result << @context.scan_until(re.inline_until_cache[until_pattern])

          break if @context.matched_inline?(until_pattern)

          marker = @context.matched
          result << inline_formatting(marker) unless marker.empty?

          if @context.eof?
            break unless until_pattern
            @context.fail!("#{until_pattern} not found, starting from #{start}")
          end

          next unless @context.eol?
          result << "\n"
          @context.next!
        end

        result
      end

      # Parses inline content that must not leave the current line.
      def short_inline(until_pattern = nil)
        result = Nodes[]

        guarded_loop do
          result << @context.scan_until(re.short_inline_until_cache[until_pattern])

          break if @context.matched_inline?(until_pattern)

          result << inline_formatting(@context.matched)

          break if @context.inline_eol?(until_pattern)
        end

        result
      end

      # Parses inline content which may turn into several paragraphs: when
      # the line ends before +until_pattern+ is met, the rest is read via
      # #paragraphs and appended.
      def long_inline(until_pattern = nil)
        result = Nodes[]

        guarded_loop do
          result << @context.scan_until(re.inline_until_cache[until_pattern])

          break if @context.matched?(until_pattern)

          marker = @context.matched
          result << inline_formatting(marker) unless marker.empty?

          if @context.eof?
            break unless until_pattern
            @context.fail!("#{until_pattern} not found")
          end

          next unless @context.eol?
          @context.next!
          paragraphs(until_pattern).each { |para| result << para }
          break
        end

        result
      end

      private

      # Turns the markup token that has just been matched into a node,
      # parsing whatever follows it as required.
      def inline_formatting(match)
        case match
        when "'''''" then BoldItalic.new(short_inline(/'''''/))
        when "'''" then Bold.new(short_inline(/'''/))
        when "''" then Italic.new(short_inline(/''/))
        when '[['
          @context.check(re.file_prefix) ? image : wikilink
        when /\[(.+)/
          external_link(Regexp.last_match(1))
        when '{{'
          template
        when /<nowiki([^>]*)>/
          nowiki
        when /<ref([^>]*)\/>/
          reference(Regexp.last_match(1), true)
        when /<ref([^>]*)>/
          reference(Regexp.last_match(1))
        when '<'
          html || Text.new(match) # it was not HTML, just accidental <
        else
          match # FIXME: TEMP
        end
      end

      # http://en.wikipedia.org/wiki/Help:Link#Wikilinks
      # [[abc]]
      # [[a|b]]
      def wikilink
        target = @context.scan_continued_until(/\||\]\]/)
        caption = @context.matched == '|' ? inline(/\]\]/) : nil
        Wikilink.new(target, caption)
      end

      # http://en.wikipedia.org/wiki/Help:Link#External_links
      # [http://www.example.org]
      # [http://www.example.org link name]
      def external_link(protocol)
        address = @context.scan_continued_until(/\s+|\]/)
        caption = (inline(/\]/) if @context.matched =~ /\s+/)
        ExternalLink.new(protocol + address, caption)
      end

      # Builds a Ref node; a self-closed <ref .../> has no children.
      def reference(param_str, closed = false)
        children = if closed
                     Nodes[]
                   else
                     long_inline(/<\/ref>/)
                   end
        Ref.new(children, parse_params(param_str))
      end

      # Everything up to </nowiki> is taken as plain text.
      def nowiki
        raw = @context.scan_continued_until(/<\/nowiki>/)
        Text.new(raw)
      end
    end

    require_relative 'image'
    require_relative 'html'
    require_relative 'template'
    include Infoboxer::Parser::Image
    include Infoboxer::Parser::HTML
    include Infoboxer::Parser::Template
  end
end
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
module Infoboxer
  class Parser
    # Parsing of block-level ("paragraph") constructs: headings, tables,
    # lists, horizontal rules, preformatted lines and plain paragraphs.
    module Paragraphs
      include Tree

      # Parses block after block until the end of input, or until
      # +until_pattern+ (when given) has been matched by the last scan.
      #
      # @return result of Nodes#flow_templates on the collected nodes
      def paragraphs(until_pattern = nil)
        collected = Nodes[]

        until @context.eof?
          collected << paragraph(until_pattern)

          finished = until_pattern && @context.matched?(until_pattern)
          break if finished

          @context.next!
        end

        collected.flow_templates
      end

      private

      # Chooses the block parser by the look of the current line.
      def paragraph(until_pattern)
        case @context.current
        when /^(?<level>={2,})\s*(?<text>.+?)\s*\k<level>$/
          found = Regexp.last_match
          heading(found[:text], found[:level])
        when /^\s*{\|/ then table
        when /^[\*\#:;]./ then list(until_pattern)
        when /^-{4,}/ then HR.new
        when /^\s*$/
          # will, when merged, close previous paragraph or add spaces to <pre>
          EmptyParagraph.new(@context.current)
        when /^ (?!\s*{{)/ # Lookahead, because spaces before template are ignored
          pre(until_pattern)
        else
          Paragraph.new(short_inline(until_pattern))
        end
      end

      # Heading level is the number of `=` characters in the marker.
      def heading(text, level)
        Heading.new(Parser.inline(text), level.length)
      end

      # http://en.wikipedia.org/wiki/Help:List
      def list(until_pattern)
        levels = @context.scan(/^([*\#:;]+)\s*/).strip.chars.to_a
        List.construct(levels, short_inline(until_pattern))
      end

      # FIXME: in fact, there's some formatting, that should work inside pre
      def pre(until_pattern)
        @context.skip(/^ /)
        str = until_pattern ? @context.scan_until(/(#{until_pattern}|$)/) : @context.current
        Pre.new(Nodes[Text.new(str)])
      end

      require_relative 'table'
      include Parser::Table
    end
  end
end