RubyGems - infoboxer - Versions diffs - 0.1.0 - Mend

infoboxer 0.1.0

Files changed (61) hide show

checksums.yaml +7 -0
data/.dokaz +1 -0
data/.yardopts +1 -0
data/LICENSE.txt +22 -0
data/Parsing.md +33 -0
data/README.md +115 -0
data/examples/output/.gitkeep +0 -0
data/examples/pages/argentina.wiki +808 -0
data/examples/to_text.rb +8 -0
data/examples/tree.rb +8 -0
data/infoboxer.gemspec +43 -0
data/lib/infoboxer.rb +196 -0
data/lib/infoboxer/core_ext.rb +10 -0
data/lib/infoboxer/definitions/en.wikipedia.org.rb +355 -0
data/lib/infoboxer/media_wiki.rb +162 -0
data/lib/infoboxer/media_wiki/page.rb +38 -0
data/lib/infoboxer/media_wiki/traits.rb +60 -0
data/lib/infoboxer/navigation.rb +84 -0
data/lib/infoboxer/navigation/lookup.rb +216 -0
data/lib/infoboxer/navigation/sections.rb +179 -0
data/lib/infoboxer/navigation/selector.rb +59 -0
data/lib/infoboxer/navigation/shortcuts.rb +165 -0
data/lib/infoboxer/parser.rb +71 -0
data/lib/infoboxer/parser/context.rb +165 -0
data/lib/infoboxer/parser/html.rb +58 -0
data/lib/infoboxer/parser/image.rb +59 -0
data/lib/infoboxer/parser/inline.rb +142 -0
data/lib/infoboxer/parser/paragraphs.rb +66 -0
data/lib/infoboxer/parser/table.rb +132 -0
data/lib/infoboxer/parser/template.rb +47 -0
data/lib/infoboxer/parser/util.rb +73 -0
data/lib/infoboxer/templates.rb +10 -0
data/lib/infoboxer/templates/base.rb +82 -0
data/lib/infoboxer/templates/set.rb +72 -0
data/lib/infoboxer/tree.rb +70 -0
data/lib/infoboxer/tree/compound.rb +81 -0
data/lib/infoboxer/tree/document.rb +11 -0
data/lib/infoboxer/tree/html.rb +76 -0
data/lib/infoboxer/tree/image.rb +53 -0
data/lib/infoboxer/tree/inline.rb +39 -0
data/lib/infoboxer/tree/list.rb +160 -0
data/lib/infoboxer/tree/node.rb +181 -0
data/lib/infoboxer/tree/nodes.rb +185 -0
data/lib/infoboxer/tree/paragraphs.rb +122 -0
data/lib/infoboxer/tree/ref.rb +34 -0
data/lib/infoboxer/tree/table.rb +89 -0
data/lib/infoboxer/tree/template.rb +82 -0
data/lib/infoboxer/tree/text.rb +60 -0
data/lib/infoboxer/tree/wikilink.rb +83 -0
data/lib/infoboxer/version.rb +4 -0
data/profile/out/.gitkeep +0 -0
data/profile/pages/argentina.txt +808 -0
data/profile/pages/canada.wiki +544 -0
data/profile/pages/ukraine.wiki +1006 -0
data/profile/pages/usa.wiki +843 -0
data/regression/pages/canada.wiki +544 -0
data/regression/pages/chiang_mai.wiki +2615 -0
data/regression/pages/south_america.wiki +640 -0
data/regression/pages/ukraine.wiki +1006 -0
data/regression/pages/usa.wiki +843 -0
metadata +272 -0

data/lib/infoboxer/parser/context.rb ADDED

@@ -0,0 +1,165 @@
+# encoding: utf-8
+module Infoboxer
+  class Parser
+    class Context
+      attr_reader :lineno
+      attr_reader :traits
+      def initialize(text, traits = nil)
+        @lines = text.
+          gsub(/<!--.+?-->/m, ''). # FIXME: will also kill comments inside <nowiki> tag
+          split(/[\r\n]/)
+        @lineno = -1
+        @traits = traits || MediaWiki::Traits.default
+        @scanner = StringScanner.new('')
+        next!
+      end
+      attr_reader :next_lines
+      def colno
+        @scanner && @scanner.pos || 0
+      end
+      def matched
+        @matched ||= @scanner && @scanner.matched
+      end
+      # check which works only once
+      def eat_matched?(str)
+        return false unless matched == str
+        @matched = 'DUMMY'
+        true
+      end
+      def rest
+        @rest ||= @scanner && @scanner.rest
+      end
+      alias_method :current, :rest
+      # lines navigation
+      def next!
+        shift(+1)
+      end
+      def prev!
+        shift(-1)
+      end
+      def eof?
+        !next_lines || # we are after the file end
+          next_lines.empty? && eol?
+      end
+      def inspect
+        "#<Context(line #{lineno} of #{@lines.count}: #{current})>"
+      end
+      # scanning
+      def scan(re)
+        res = @scanner.scan(re)
+        @matched = nil
+        @rest = nil
+        res
+      end
+      def check(re)
+        res = @scanner.check(re)
+        @matched = nil
+        @rest = nil
+        res
+      end
+      def skip(re)
+        res = @scanner.skip(re)
+        @matched = nil
+        @rest = nil
+        res
+      end
+      def scan_until(re, leave_pattern = false)
+        guard_eof!
+        res = _scan_until(re)
+        res[matched] = '' if res && !leave_pattern
+        res
+      end
+      def inline_eol?(exclude = nil)
+        # not using StringScanner#check, as it will change #matched value
+        eol? ||
+          (current =~ %r[^(</ref>|}})] &&
+            (!exclude || $1 !~ exclude)) # FIXME: ugly, but no idea of prettier solution
+      end
+      def scan_continued_until(re, leave_pattern = false)
+        res = ''
+        loop do
+          chunk = _scan_until(re)
+          case matched
+          when re
+            res << chunk
+            break
+          when nil
+            res << rest << "\n"
+            next!
+            eof? && fail!("Unfinished scan: #{re} not found")
+          end
+        end
+        res[/#{re}\Z/] = '' unless leave_pattern
+        res
+      end
+      # state inspection
+      def matched_inline?(re)
+        re.nil? ? (matched.empty? && eol?) : matched =~ re
+      end
+      def matched?(re)
+        re && matched =~ re
+      end
+      def eol?
+        !current || current.empty?
+      end
+      # basic services
+      def fail!(text)
+        fail(ParsingError, "#{text} at line #{@lineno}:\n\t#{current}")
+      end
+      private
+      # we do hard use of #matched and #rest, its wiser to memoize them
+      def _scan_until(re)
+        res = @scanner.scan_until(re)
+        @matched = nil
+        @rest = nil
+        res
+      end
+      def guard_eof!
+        #eof? and fail!("End of input reached")
+        @scanner or fail!("End of input reached")
+      end
+      def shift(amount)
+        @lineno += amount
+        current = @lines[lineno]
+        @next_lines = @lines[(lineno+1)..-1]
+        if current
+          @scanner.string = current
+          @rest = current
+          @matched = nil
+        else
+          @scanner = nil
+          @rest = nil
+          @matched = nil
+        end
+      end
+    end
+  end
+end

data/lib/infoboxer/parser/html.rb ADDED

@@ -0,0 +1,58 @@
+# encoding: utf-8
+module Infoboxer
+  class Parser
+    module HTML
+      include Tree
+      def html
+        case
+        when @context.check(/\/[a-z]+>/)
+          html_closing_tag
+        when @context.check(/br\s*>/)
+          html_br
+        when @context.check(%r{[a-z]+[^/>]*/>})
+          html_auto_closing_tag
+        when @context.check(/[a-z]+[^>\/]*>/)
+          html_opening_tag
+        else
+          # not an HTML tag at all!
+          nil
+        end
+      end
+      def html_closing_tag
+        @context.skip(/\//)
+        tag = @context.scan(/[a-z]+/)
+        @context.skip(/>/)
+        HTMLClosingTag.new(tag)
+      end
+      def html_br
+        @context.skip(/br\s*>/)
+        HTMLTag.new('br', {})
+      end
+      def html_auto_closing_tag
+        tag = @context.scan(/[a-z]+/)
+        attrs = @context.scan(%r{[^/>]*})
+        @context.skip(%r{/>})
+        HTMLTag.new(tag, parse_params(attrs))
+      end
+      def html_opening_tag
+        tag = @context.scan(/[a-z]+/)
+        attrs = @context.scan(/[^>]+/)
+        @context.skip(/>/)
+        contents = short_inline(/<\/#{tag}>/)
+        if @context.matched =~ /<\/#{tag}>/
+          HTMLTag.new(tag, parse_params(attrs), contents)
+        else
+          [
+            HTMLOpeningTag.new(tag, parse_params(attrs)),
+            *contents
+          ]
+        end
+      end
+    end
+  end
+end

data/lib/infoboxer/parser/image.rb ADDED

@@ -0,0 +1,59 @@
+# encoding: utf-8
+module Infoboxer
+  class Parser
+    module Image
+      include Tree
+      def image
+        @context.skip(re.file_prefix) or
+          @context.fail!("Something went wrong: it's not image?")
+        path = @context.scan_until(/\||\]\]/)
+        attrs = if @context.matched == '|'
+          image_attrs
+        else
+          {}
+        end
+        Tree::Image.new(path, attrs)
+      end
+      def image_attrs
+        nodes = []
+        loop do
+          nodes << long_inline(/\||\]\]/)
+          break if @context.matched == ']]'
+        end
+        nodes.map(&method(:image_attr)).
+          inject(&:merge).
+          reject{|k, v| v.nil? || v.empty?}
+      end
+      def image_attr(nodes)
+        if nodes.count == 1 && nodes.first.is_a?(Text)
+          case (str = nodes.first.text)
+          when /^(thumb)(?:nail)?$/, /^(frame)(?:d)?$/
+            {type: $1}
+          when 'frameless'
+            {type: str}
+          when 'border'
+            {border: str}
+          when /^(baseline|middle|sub|super|text-top|text-bottom|top|bottom)$/
+            {alignment: str}
+          when /^(\d*)(?:x(\d+))?px$/
+            {width: $1, height: $2}
+          when /^link=(.*)$/i
+            {link: $1}
+          when /^alt=(.*)$/i
+            {alt: $1}
+          else # text-only caption
+            {caption: nodes}
+          end
+        else # it's caption, and can have inline markup!
+          {caption: nodes}
+        end
+      end
+    end
+  end
+end

data/lib/infoboxer/parser/inline.rb ADDED

@@ -0,0 +1,142 @@
+# encoding: utf-8
+module Infoboxer
+  class Parser
+    module Inline
+      include Tree
+      def inline(until_pattern = nil)
+        start = @context.lineno
+        nodes = Nodes[]
+        guarded_loop do
+          chunk = @context.scan_until(re.inline_until_cache[until_pattern])
+          nodes << chunk
+          break if @context.matched_inline?(until_pattern)
+          nodes << inline_formatting(@context.matched) unless @context.matched.empty?
+          if @context.eof?
+            break unless until_pattern
+            @context.fail!("#{until_pattern} not found, starting from #{start}")
+          end
+          if @context.eol?
+            nodes << "\n"
+            @context.next!
+          end
+        end
+        nodes
+      end
+      def short_inline(until_pattern = nil)
+        nodes = Nodes[]
+        guarded_loop do
+          chunk = @context.scan_until(re.short_inline_until_cache[until_pattern])
+          nodes << chunk
+          break if @context.matched_inline?(until_pattern)
+          nodes << inline_formatting(@context.matched)
+          break if @context.inline_eol?(until_pattern)
+        end
+        nodes
+      end
+      def long_inline(until_pattern = nil)
+        nodes = Nodes[]
+        guarded_loop do
+          chunk = @context.scan_until(re.inline_until_cache[until_pattern])
+          nodes << chunk
+          break if @context.matched?(until_pattern)
+          nodes << inline_formatting(@context.matched) unless @context.matched.empty?
+          if @context.eof?
+            break unless until_pattern
+            @context.fail!("#{until_pattern} not found")
+          end
+          if @context.eol?
+            @context.next!
+            paragraphs(until_pattern).each do |p|
+              nodes << p
+            end
+            break
+          end
+        end
+        nodes
+      end
+      private
+        def inline_formatting(match)
+          case match
+          when "'''''"
+            BoldItalic.new(short_inline(/'''''/))
+          when "'''"
+            Bold.new(short_inline(/'''/))
+          when "''"
+            Italic.new(short_inline(/''/))
+          when '[['
+            if @context.check(re.file_prefix)
+              image
+            else
+              wikilink
+            end
+          when /\[(.+)/
+            external_link($1)
+          when '{{'
+            template
+          when /<nowiki([^>]*)>/
+            nowiki
+          when /<ref([^>]*)\/>/
+            reference($1, true)
+          when /<ref([^>]*)>/
+            reference($1)
+          when '<'
+            html || Text.new(match) # it was not HTML, just accidental <
+          else
+            match # FIXME: TEMP
+          end
+        end
+        # http://en.wikipedia.org/wiki/Help:Link#Wikilinks
+        # [[abc]]
+        # [[a|b]]
+        def wikilink
+          link = @context.scan_continued_until(/\||\]\]/)
+          caption = inline(/\]\]/) if @context.matched == '|'
+          Wikilink.new(link, caption)
+        end
+        # http://en.wikipedia.org/wiki/Help:Link#External_links
+        # [http://www.example.org]
+        # [http://www.example.org link name]
+        def external_link(protocol)
+          link = @context.scan_continued_until(/\s+|\]/)
+          caption = inline(/\]/) if @context.matched =~ /\s+/
+          ExternalLink.new(protocol + link, caption)
+        end
+        def reference(param_str, closed = false)
+          children = closed ? Nodes[] : long_inline(/<\/ref>/)
+          Ref.new(children, parse_params(param_str))
+        end
+        def nowiki
+          Text.new(@context.scan_continued_until(/<\/nowiki>/))
+        end
+      end
+      require_relative 'image'
+      require_relative 'html'
+      require_relative 'template'
+      include Infoboxer::Parser::Image
+      include Infoboxer::Parser::HTML
+      include Infoboxer::Parser::Template
+  end
+end

data/lib/infoboxer/parser/paragraphs.rb ADDED

@@ -0,0 +1,66 @@
+# encoding: utf-8
+module Infoboxer
+  class Parser
+    module Paragraphs
+      include Tree
+      def paragraphs(until_pattern = nil)
+        nodes = Nodes[]
+        until @context.eof?
+          nodes << paragraph(until_pattern)
+          break if until_pattern && @context.matched?(until_pattern)
+          @context.next!
+        end
+        nodes.flow_templates
+      end
+      private
+        def paragraph(until_pattern)
+          case @context.current
+          when /^(?<level>={2,})\s*(?<text>.+?)\s*\k<level>$/
+            heading(Regexp.last_match[:text], Regexp.last_match[:level])
+          when /^\s*{\|/
+            table
+          when /^[\*\#:;]./
+            list(until_pattern)
+          when /^-{4,}/
+            HR.new
+          when /^\s*$/
+            # will, when merged, close previous paragraph or add spaces to <pre>
+            EmptyParagraph.new(@context.current)
+          when /^ (?!\s*{{)/ # Lookahead, because spaces before template are ignored
+            pre(until_pattern)
+          else
+            Paragraph.new(short_inline(until_pattern))
+          end
+        end
+        def heading(text, level)
+          Heading.new(Parser.inline(text), level.length)
+        end
+        # http://en.wikipedia.org/wiki/Help:List
+        def list(until_pattern)
+          marker = @context.scan(/^([*\#:;]+)\s*/).strip
+          List.construct(marker.chars.to_a, short_inline(until_pattern))
+        end
+        # FIXME: in fact, there's some formatting, that should work inside pre
+        def pre(until_pattern)
+          @context.skip(/^ /)
+          str = if until_pattern
+            @context.scan_until(/(#{until_pattern}|$)/)
+          else
+            @context.current
+          end
+          Pre.new(Nodes[Text.new(str)])
+        end
+      require_relative 'table'
+      include Parser::Table
+    end
+  end
+end