RubyGems - rubyjedi-oga - Versions diffs - 1.0.3 - Mend

rubyjedi-oga 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

checksums.yaml +7 -0
data/.yardopts +13 -0
data/LICENSE +362 -0
data/README.md +317 -0
data/doc/css/common.css +77 -0
data/doc/css_selectors.md +935 -0
data/doc/manually_creating_documents.md +67 -0
data/doc/migrating_from_nokogiri.md +169 -0
data/doc/xml_namespaces.md +63 -0
data/ext/c/extconf.rb +11 -0
data/ext/c/lexer.c +2595 -0
data/ext/c/lexer.h +16 -0
data/ext/c/lexer.rl +198 -0
data/ext/c/liboga.c +6 -0
data/ext/c/liboga.h +11 -0
data/ext/java/Liboga.java +14 -0
data/ext/java/org/liboga/xml/Lexer.java +1363 -0
data/ext/java/org/liboga/xml/Lexer.rl +223 -0
data/ext/ragel/base_lexer.rl +633 -0
data/lib/oga.rb +57 -0
data/lib/oga/blacklist.rb +40 -0
data/lib/oga/css/lexer.rb +743 -0
data/lib/oga/css/parser.rb +976 -0
data/lib/oga/entity_decoder.rb +21 -0
data/lib/oga/html/entities.rb +2150 -0
data/lib/oga/html/parser.rb +25 -0
data/lib/oga/html/sax_parser.rb +18 -0
data/lib/oga/lru.rb +160 -0
data/lib/oga/oga.rb +57 -0
data/lib/oga/version.rb +3 -0
data/lib/oga/whitelist.rb +20 -0
data/lib/oga/xml/attribute.rb +136 -0
data/lib/oga/xml/cdata.rb +17 -0
data/lib/oga/xml/character_node.rb +37 -0
data/lib/oga/xml/comment.rb +17 -0
data/lib/oga/xml/default_namespace.rb +13 -0
data/lib/oga/xml/doctype.rb +82 -0
data/lib/oga/xml/document.rb +108 -0
data/lib/oga/xml/element.rb +428 -0
data/lib/oga/xml/entities.rb +122 -0
data/lib/oga/xml/html_void_elements.rb +15 -0
data/lib/oga/xml/lexer.rb +550 -0
data/lib/oga/xml/namespace.rb +48 -0
data/lib/oga/xml/node.rb +219 -0
data/lib/oga/xml/node_set.rb +333 -0
data/lib/oga/xml/parser.rb +631 -0
data/lib/oga/xml/processing_instruction.rb +37 -0
data/lib/oga/xml/pull_parser.rb +175 -0
data/lib/oga/xml/querying.rb +56 -0
data/lib/oga/xml/sax_parser.rb +192 -0
data/lib/oga/xml/text.rb +66 -0
data/lib/oga/xml/traversal.rb +50 -0
data/lib/oga/xml/xml_declaration.rb +65 -0
data/lib/oga/xpath/evaluator.rb +1798 -0
data/lib/oga/xpath/lexer.rb +1958 -0
data/lib/oga/xpath/parser.rb +622 -0
data/oga.gemspec +45 -0
metadata +227 -0

data/lib/oga/xml/entities.rb ADDED

@@ -0,0 +1,122 @@
+module Oga
+  module XML
+    ##
+    # Module for encoding/decoding XML and HTML entities. The mapping of HTML
+    # entities can be found in {Oga::HTML::Entities::DECODE_MAPPING}.
+    #
+    module Entities
+      ##
+      # Hash containing XML entities and the corresponding characters.
+      #
+      # The `&amp;` mapping must come last to ensure proper conversion of non
+      # encoded to encoded forms (see {Oga::XML::Text#to_xml}).
+      #
+      # @return [Hash]
+      #
+      DECODE_MAPPING = {
+        '&lt;'   => '<',
+        '&gt;'   => '>',
+        '&apos;' => "'",
+        '&quot;' => '"',
+        '&amp;'  => '&',
+      }
+      ##
+      # Hash containing characters and the corresponding XML entities.
+      #
+      # @return [Hash]
+      #
+      ENCODE_MAPPING = {
+        '&' => '&amp;',
+        '>' => '&gt;',
+        '<' => '&lt;',
+      }
+      ##
+      # Hash containing characters and the corresponding XML entities to use
+      # when encoding XML/HTML attribute values.
+      #
+      # @return [Hash]
+      #
+      ENCODE_ATTRIBUTE_MAPPING = {
+        '&' => '&amp;',
+        '>' => '&gt;',
+        '<' => '&lt;',
+        "'" => '&apos;',
+        '"' => '&quot;'
+      }
+      ##
+      # @return [String]
+      #
+      AMPERSAND = '&'.freeze
+      ##
+      # Regexp for matching XML/HTML entities such as "&nbsp;".
+      #
+      # @return [Regexp]
+      #
+      REGULAR_ENTITY = /&[a-zA-Z0-9]+;/
+      ##
+      # Regexp for matching XML/HTML entities such as "&#38;".
+      #
+      # This pattern is deliberately loose (it accepts any alphanumerics after
+      # the "#"); {.decode} validates the digits before converting them.
+      #
+      # @return [Regexp]
+      #
+      CODEPOINT_ENTITY = /&#(x)?([a-zA-Z0-9]+);/
+      ##
+      # Matches a String consisting solely of hexadecimal digits.
+      #
+      # @return [Regexp]
+      #
+      HEX_DIGITS = /\A[0-9a-fA-F]+\z/
+      ##
+      # Matches a String consisting solely of decimal digits.
+      #
+      # @return [Regexp]
+      #
+      DECIMAL_DIGITS = /\A[0-9]+\z/
+      ##
+      # Highest code point that can be represented in Unicode.
+      #
+      # @return [Integer]
+      #
+      MAX_CODEPOINT = 0x10FFFF
+      ##
+      # UTF-16 surrogate code points, these are not valid on their own.
+      #
+      # @return [Range]
+      #
+      SURROGATE_CODEPOINTS = 0xD800..0xDFFF
+      ##
+      # @return [Regexp]
+      #
+      ENCODE_REGEXP = Regexp.union(ENCODE_MAPPING.keys)
+      ##
+      # @return [Regexp]
+      #
+      ENCODE_ATTRIBUTE_REGEXP = Regexp.union(ENCODE_ATTRIBUTE_MAPPING.keys)
+      ##
+      # Decodes XML entities.
+      #
+      # Named entities are looked up in `mapping`. Entities that are not
+      # present in the mapping, as well as numeric references that are
+      # malformed or that do not denote a valid code point, are left in the
+      # output exactly as they appeared in the input.
+      #
+      # @param [String] input
+      # @param [Hash] mapping
+      # @return [String]
+      #
+      def self.decode(input, mapping = DECODE_MAPPING)
+        return input unless input.include?(AMPERSAND)
+        # A block is used instead of passing the Hash to gsub directly: with a
+        # Hash argument gsub replaces every unknown entity with an empty
+        # String, silently dropping it from the output.
+        input = input.gsub(REGULAR_ENTITY) { |entity| mapping[entity] || entity }
+        if input.include?(AMPERSAND)
+          input = input.gsub(CODEPOINT_ENTITY) do |entity|
+            codepoint = parse_codepoint(
+              Regexp.last_match(1),
+              Regexp.last_match(2)
+            )
+            codepoint ? [codepoint].pack('U') : entity
+          end
+        end
+        input
+      end
+      ##
+      # Converts the digits of a numeric character reference to a code point.
+      #
+      # @param [String|NilClass] hex_flag Set when the reference used the
+      #  hexadecimal ("&#x...;") notation.
+      # @param [String] digits The characters following "&#" or "&#x".
+      # @return [Integer|NilClass] The code point, or nil when the digits are
+      #  malformed or the value is not a valid code point.
+      #
+      def self.parse_codepoint(hex_flag, digits)
+        if hex_flag
+          return unless digits =~ HEX_DIGITS
+          value = digits.to_i(16)
+        else
+          return unless digits =~ DECIMAL_DIGITS
+          value = digits.to_i(10)
+        end
+        return if value > MAX_CODEPOINT
+        return if SURROGATE_CODEPOINTS.cover?(value)
+        value
+      end
+      private_class_method :parse_codepoint
+      ##
+      # Encodes special characters as XML entities.
+      #
+      # Only the characters matched by {ENCODE_REGEXP} are replaced, a custom
+      # mapping should therefore provide a replacement for each of them.
+      #
+      # @param [String] input
+      # @param [Hash] mapping
+      # @return [String]
+      #
+      def self.encode(input, mapping = ENCODE_MAPPING)
+        input.gsub(ENCODE_REGEXP, mapping)
+      end
+      ##
+      # Encodes special characters in an XML attribute value.
+      #
+      # @param [String] input
+      # @return [String]
+      #
+      def self.encode_attribute(input)
+        input.gsub(ENCODE_ATTRIBUTE_REGEXP, ENCODE_ATTRIBUTE_MAPPING)
+      end
+    end # Entities
+  end # XML
+end # Oga

data/lib/oga/xml/html_void_elements.rb ADDED

@@ -0,0 +1,15 @@
+module Oga
+  module XML
+    ##
+    # Whitelist holding the names of the HTML void elements. When HTML lexing
+    # is enabled the lexer closes these elements as soon as their opening tag
+    # ends.
+    #
+    # @api private
+    # @return [Oga::Whitelist]
+    #
+    HTML_VOID_ELEMENTS = Whitelist.new(
+      %w[
+        area base br col command embed
+        hr img input keygen link meta
+        param source track wbr
+      ]
+    )
+  end # XML
+end # Oga

data/lib/oga/xml/lexer.rb ADDED

@@ -0,0 +1,550 @@
+module Oga
+  module XML
+    ##
+    # Low level lexer that supports both XML and HTML (using an extra option).
+    # To lex HTML input set the `:html` option to `true` when creating an
+    # instance of the lexer:
+    #
+    #     lexer = Oga::XML::Lexer.new('...', :html => true)
+    #
+    # This lexer can process both String and IO instances. IO instances are
+    # processed on a line by line basis. This can greatly reduce memory usage
+    # in exchange for a slightly slower runtime.
+    #
+    # ## Thread Safety
+    #
+    # Since this class keeps track of an internal state you can not use the
+    # same instance between multiple threads at the same time. For example, the
+    # following will not work reliably:
+    #
+    #     # Don't do this!
+    #     lexer   = Oga::XML::Lexer.new('....')
+    #     threads = []
+    #
+    #     2.times do
+    #       threads << Thread.new do
+    #         lexer.advance do |*args|
+    #           p args
+    #         end
+    #       end
+    #     end
+    #
+    #     threads.each(&:join)
+    #
+    # However, it is perfectly safe to use different instances per thread.
+    # There is no _global_ state used by this lexer.
+    #
+    # ## Strict Mode
+    #
+    # By default the lexer is rather permissive regarding the input. For
+    # example, missing closing tags are inserted by default. To disable this
+    # behaviour the lexer can be run in "strict mode" by setting `:strict` to
+    # `true`:
+    #
+    #     lexer = Oga::XML::Lexer.new('...', :strict => true)
+    #
+    # Strict mode only applies to XML documents.
+    #
+    # @private
+    #
+    class Lexer
+      # These are all constant/frozen to remove the need for String allocations
+      # every time they are referenced in the lexer.
+      HTML_SCRIPT = 'script'.freeze
+      HTML_STYLE  = 'style'.freeze
+      # Elements that are allowed directly in a <table> element.
+      HTML_TABLE_ALLOWED = Whitelist.new(
+        %w{thead tbody tfoot tr caption colgroup col}
+      )
+      HTML_SCRIPT_ELEMENTS = Whitelist.new(%w{script template})
+      HTML_TABLE_ROW_ELEMENTS = Whitelist.new(%w{tr}) + HTML_SCRIPT_ELEMENTS
+      # Elements that should be closed automatically before a new opening tag is
+      # processed. The keys are the names of the currently open element, the
+      # values are asked (using `allow?`) whether the new element may be nested
+      # in it; if not, the open element is closed first (see
+      # #before_html_element_name).
+      HTML_CLOSE_SELF = {
+        'head' => Blacklist.new(%w{head body}),
+        'body' => Blacklist.new(%w{head body}),
+        'li'   => Blacklist.new(%w{li}),
+        'dt'   => Blacklist.new(%w{dt dd}),
+        'dd'   => Blacklist.new(%w{dt dd}),
+        'p'    => Blacklist.new(%w{
+          address article aside blockquote details div dl fieldset figcaption
+          figure footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav
+          ol p pre section table ul
+        }),
+        'rb'       => Blacklist.new(%w{rb rt rtc rp}),
+        'rt'       => Blacklist.new(%w{rb rt rtc rp}),
+        'rtc'      => Blacklist.new(%w{rb rtc}),
+        'rp'       => Blacklist.new(%w{rb rt rtc rp}),
+        'optgroup' => Blacklist.new(%w{optgroup}),
+        'option'   => Blacklist.new(%w{optgroup option}),
+        'colgroup' => Whitelist.new(%w{col template}),
+        'caption'  => HTML_TABLE_ALLOWED.to_blacklist,
+        'table'    => HTML_TABLE_ALLOWED + HTML_SCRIPT_ELEMENTS,
+        'thead'    => HTML_TABLE_ROW_ELEMENTS,
+        'tbody'    => HTML_TABLE_ROW_ELEMENTS,
+        'tfoot'    => HTML_TABLE_ROW_ELEMENTS,
+        'tr'       => Whitelist.new(%w{td th}) + HTML_SCRIPT_ELEMENTS,
+        'td'       => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED,
+        'th'       => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED
+      }
+      # Register every rule under its upper-cased key as well, so that an
+      # upper-cased element name finds the same rule as its lower-cased form.
+      HTML_CLOSE_SELF.keys.each do |key|
+        HTML_CLOSE_SELF[key.upcase] = HTML_CLOSE_SELF[key]
+      end
+      ##
+      # Names of HTML tags of which the content should be lexed as-is.
+      #
+      LITERAL_HTML_ELEMENTS = Whitelist.new([HTML_SCRIPT, HTML_STYLE])
+      ##
+      # @param [String|IO] data The data to lex. This can either be a String or
+      #  an IO instance.
+      #
+      # @param [Hash] options
+      #
+      # @option options [TrueClass|FalseClass] :html When set to `true` the
+      #  lexer will treat the input as HTML instead of XML. This makes it
+      #  possible to lex HTML void elements such as `<link href="">`.
+      #
+      # @option options [TrueClass|FalseClass] :strict Enables/disables strict
+      #  parsing of XML documents, disabled by default.
+      #
+      def initialize(data, options = {})
+        @data   = data
+        @html   = options[:html]
+        @strict = options[:strict] || false
+        reset
+      end
+      ##
+      # Resets the internal state of the lexer. Typically you don't need to
+      # call this method yourself as it is called by #lex after lexing a given
+      # String.
+      #
+      # The line counter and the stack of open elements are cleared, the input
+      # is rewound when it supports rewinding, and `reset_native` is called.
+      # NOTE(review): `reset_native` is not defined in this file, presumably it
+      # is provided by the native extension.
+      #
+      def reset
+        @line     = 1
+        @elements = []
+        @data.rewind if @data.respond_to?(:rewind)
+        reset_native
+      end
+      ##
+      # Yields the data to lex to the supplied block.
+      #
+      # A String is yielded in one go, input responding to `each_line` is
+      # yielded line by line and any other input responding to `each` is
+      # yielded chunk by chunk. Input that supports none of these yields
+      # nothing.
+      #
+      # @yieldparam [String]
+      #
+      def read_data
+        if @data.is_a?(String)
+          yield @data
+        # IO, StringIO, etc
+        # THINK: read(N) would be nice, but currently this screws up the C code
+        elsif @data.respond_to?(:each_line)
+          @data.each_line { |line| yield line }
+        # Enumerator, Array, etc
+        elsif @data.respond_to?(:each)
+          @data.each { |chunk| yield chunk }
+        end
+      end
+      ##
+      # Gathers all the tokens for the input and returns them as an Array.
+      # Every entry of the Array is itself an Array of the form
+      # `[type, value, line]`.
+      #
+      # This method resets the internal state of the lexer after consuming the
+      # input.
+      #
+      # @see #advance
+      # @return [Array]
+      #
+      def lex
+        tokens = []
+        advance do |type, value, line|
+          tokens << [type, value, line]
+        end
+        reset
+        tokens
+      end
+      ##
+      # Advances through the input and generates the corresponding tokens. Each
+      # token is yielded to the supplied block.
+      #
+      # Each token is yielded as three separate block arguments:
+      #
+      #     TYPE, VALUE, LINE
+      #
+      # The type is a symbol, the value is either nil or a String and the line
+      # is the line number the lexer was at when the token was emitted.
+      #
+      # This method stores the supplied block in `@block` and resets it after
+      # the lexer loop has finished, also when an error was raised.
+      #
+      # This method does *not* reset the internal state of the lexer.
+      #
+      # @yieldparam [Symbol] type
+      # @yieldparam [String] value
+      # @yieldparam [Fixnum] line
+      #
+      def advance(&block)
+        @block = block
+        # NOTE(review): `advance_native` is not defined in this file,
+        # presumably it is provided by the native extension and calls back
+        # into the `on_*` methods below.
+        read_data do |chunk|
+          advance_native(chunk)
+        end
+        # Add any missing closing tags
+        if !strict? and !@elements.empty?
+          @elements.length.times { on_element_end }
+        end
+      ensure
+        @block = nil
+      end
+      ##
+      # Returns `true` only when the `:html` option was exactly `true`, other
+      # truthy values do not enable HTML mode.
+      #
+      # @return [TrueClass|FalseClass]
+      #
+      def html?
+        @html == true
+      end
+      ##
+      # @return [TrueClass|FalseClass]
+      #
+      def strict?
+        @strict
+      end
+      ##
+      # Returns `true` when lexing HTML and the current element is named
+      # exactly "script" (the comparison is case sensitive).
+      #
+      # @return [TrueClass|FalseClass]
+      #
+      def html_script?
+        html? && current_element == HTML_SCRIPT
+      end
+      ##
+      # Returns `true` when lexing HTML and the current element is named
+      # exactly "style" (the comparison is case sensitive).
+      #
+      # @return [TrueClass|FalseClass]
+      #
+      def html_style?
+        html? && current_element == HTML_STYLE
+      end
+      private
+      ##
+      # @param [Fixnum] amount The amount of lines to advance.
+      #
+      def advance_line(amount = 1)
+        @line += amount
+      end
+      ##
+      # Calls the supplied block with the information of the current token.
+      #
+      # @param [Symbol] type The token type.
+      # @param [String] value The token value.
+      #
+      # @yieldparam [Symbol] type
+      # @yieldparam [String] value
+      # @yieldparam [Fixnum] line
+      #
+      def add_token(type, value = nil)
+        @block.call(type, value, @line)
+      end
+      ##
+      # Returns the name of the element we're currently in, or nil when no
+      # element is open.
+      #
+      # @return [String]
+      #
+      def current_element
+        @elements.last
+      end
+      ##
+      # Called when processing a single quote.
+      #
+      def on_string_squote
+        add_token(:T_STRING_SQUOTE)
+      end
+      ##
+      # Called when processing a double quote.
+      #
+      def on_string_dquote
+        add_token(:T_STRING_DQUOTE)
+      end
+      ##
+      # Called when processing the body of a string.
+      #
+      # @param [String] value The data between the quotes.
+      #
+      def on_string_body(value)
+        add_token(:T_STRING_BODY, value)
+      end
+      ##
+      # Called when a doctype starts.
+      #
+      def on_doctype_start
+        add_token(:T_DOCTYPE_START)
+      end
+      ##
+      # Called on the identifier specifying the type of the doctype.
+      #
+      # @param [String] value
+      #
+      def on_doctype_type(value)
+        add_token(:T_DOCTYPE_TYPE, value)
+      end
+      ##
+      # Called on the identifier specifying the name of the doctype.
+      #
+      # @param [String] value
+      #
+      def on_doctype_name(value)
+        add_token(:T_DOCTYPE_NAME, value)
+      end
+      ##
+      # Called on the end of a doctype.
+      #
+      def on_doctype_end
+        add_token(:T_DOCTYPE_END)
+      end
+      ##
+      # Called on an inline doctype block.
+      #
+      # @param [String] value
+      #
+      def on_doctype_inline(value)
+        add_token(:T_DOCTYPE_INLINE, value)
+      end
+      ##
+      # Called on the open CDATA tag.
+      #
+      def on_cdata_start
+        add_token(:T_CDATA_START)
+      end
+      ##
+      # Called on the closing CDATA tag.
+      #
+      def on_cdata_end
+        add_token(:T_CDATA_END)
+      end
+      ##
+      # Called for the body of a CDATA tag.
+      #
+      # @param [String] value
+      #
+      def on_cdata_body(value)
+        add_token(:T_CDATA_BODY, value)
+      end
+      ##
+      # Called on the open comment tag.
+      #
+      def on_comment_start
+        add_token(:T_COMMENT_START)
+      end
+      ##
+      # Called on the closing comment tag.
+      #
+      def on_comment_end
+        add_token(:T_COMMENT_END)
+      end
+      ##
+      # Called on a comment.
+      #
+      # @param [String] value
+      #
+      def on_comment_body(value)
+        add_token(:T_COMMENT_BODY, value)
+      end
+      ##
+      # Called on the start of an XML declaration tag.
+      #
+      def on_xml_decl_start
+        add_token(:T_XML_DECL_START)
+      end
+      ##
+      # Called on the end of an XML declaration tag.
+      #
+      def on_xml_decl_end
+        add_token(:T_XML_DECL_END)
+      end
+      ##
+      # Called on the start of a processing instruction.
+      #
+      def on_proc_ins_start
+        add_token(:T_PROC_INS_START)
+      end
+      ##
+      # Called on a processing instruction name.
+      #
+      # @param [String] value
+      #
+      def on_proc_ins_name(value)
+        add_token(:T_PROC_INS_NAME, value)
+      end
+      ##
+      # Called on the body of a processing instruction.
+      #
+      # @param [String] value
+      #
+      def on_proc_ins_body(value)
+        add_token(:T_PROC_INS_BODY, value)
+      end
+      ##
+      # Called on the end of a processing instruction.
+      #
+      def on_proc_ins_end
+        add_token(:T_PROC_INS_END)
+      end
+      ##
+      # Called on the name of an element. In HTML mode any open elements that
+      # can not contain the new element are closed first.
+      #
+      # @param [String] name The name of the element, including namespace.
+      #
+      def on_element_name(name)
+        before_html_element_name(name) if html?
+        add_element(name)
+      end
+      ##
+      # Handles inserting of any missing tags whenever a new HTML tag is opened.
+      #
+      # Open elements are closed, innermost first, until one is reached that
+      # either has no entry in HTML_CLOSE_SELF or whose entry allows `name`.
+      #
+      # @param [String] name
+      #
+      def before_html_element_name(name)
+        close_current = HTML_CLOSE_SELF[current_element]
+        if close_current and !close_current.allow?(name)
+          on_element_end
+        end
+        # Close remaining parent elements. This for example ensures that a
+        # "<tbody>" not only closes an unclosed "<th>" but also the surrounding,
+        # unclosed "<tr>".
+        while close_current = HTML_CLOSE_SELF[current_element]
+          if close_current.allow?(name)
+            break
+          else
+            on_element_end
+          end
+        end
+      end
+      ##
+      # Pushes the element onto the stack of open elements and emits a
+      # T_ELEM_NAME token for it.
+      #
+      # @param [String] name
+      #
+      def add_element(name)
+        @elements << name
+        add_token(:T_ELEM_NAME, name)
+      end
+      ##
+      # Called on the element namespace.
+      #
+      # @param [String] namespace
+      #
+      def on_element_ns(namespace)
+        add_token(:T_ELEM_NS, namespace)
+      end
+      ##
+      # Called on the closing `>` of the open tag of an element. This is a
+      # no-op for XML; for HTML a void element is closed right away.
+      #
+      def on_element_open_end
+        return unless html?
+        # Only downcase the name if we can't find an all lower/upper version of
+        # the element name. This can save us a *lot* of String allocations.
+        if HTML_VOID_ELEMENTS.allow?(current_element) \
+        or HTML_VOID_ELEMENTS.allow?(current_element.downcase)
+          add_token(:T_ELEM_END)
+          @elements.pop
+        end
+      end
+      ##
+      # Called on the closing tag of an element. Does nothing when there are
+      # no open elements.
+      #
+      # @param [String] name The name of the element (minus namespace
+      #  prefix). This is not set for self closing tags.
+      #
+      def on_element_end(name = nil)
+        return if @elements.empty?
+        # In HTML mode, when the named element is somewhere on the stack, first
+        # close every element that was opened inside of it and left unclosed.
+        if html? and name and @elements.include?(name)
+          while current_element != name
+            add_token(:T_ELEM_END)
+            @elements.pop
+          end
+        end
+        # Close the (now) current element.
+        add_token(:T_ELEM_END)
+        @elements.pop
+      end
+      ##
+      # Called on regular text values. Empty values do not produce a token.
+      #
+      # @param [String] value
+      #
+      def on_text(value)
+        return if value.empty?
+        add_token(:T_TEXT, value)
+      end
+      ##
+      # Called on attribute namespaces.
+      #
+      # @param [String] value
+      #
+      def on_attribute_ns(value)
+        add_token(:T_ATTR_NS, value)
+      end
+      ##
+      # Called on tag attributes.
+      #
+      # @param [String] value
+      #
+      def on_attribute(value)
+        add_token(:T_ATTR, value)
+      end
+    end # Lexer
+  end # XML
+end # Oga