RubyGems - oga - Versions diffs - 0.1.1-java - Mend

oga 0.1.1-java

Files changed (47) hide show

checksums.yaml +7 -0
data/.yardopts +13 -0
data/LICENSE +19 -0
data/README.md +179 -0
data/doc/DCO.md +25 -0
data/doc/changelog.md +20 -0
data/doc/css/common.css +76 -0
data/doc/migrating_from_nokogiri.md +169 -0
data/ext/c/extconf.rb +13 -0
data/ext/c/lexer.c +1518 -0
data/ext/c/lexer.h +8 -0
data/ext/c/lexer.rl +121 -0
data/ext/c/liboga.c +6 -0
data/ext/c/liboga.h +11 -0
data/ext/java/Liboga.java +14 -0
data/ext/java/org/liboga/xml/Lexer.java +829 -0
data/ext/java/org/liboga/xml/Lexer.rl +151 -0
data/ext/ragel/base_lexer.rl +323 -0
data/lib/liboga.jar +0 -0
data/lib/oga.rb +43 -0
data/lib/oga/html/parser.rb +25 -0
data/lib/oga/oga.rb +27 -0
data/lib/oga/version.rb +3 -0
data/lib/oga/xml/attribute.rb +111 -0
data/lib/oga/xml/cdata.rb +17 -0
data/lib/oga/xml/character_node.rb +39 -0
data/lib/oga/xml/comment.rb +17 -0
data/lib/oga/xml/doctype.rb +84 -0
data/lib/oga/xml/document.rb +99 -0
data/lib/oga/xml/element.rb +331 -0
data/lib/oga/xml/lexer.rb +399 -0
data/lib/oga/xml/namespace.rb +42 -0
data/lib/oga/xml/node.rb +168 -0
data/lib/oga/xml/node_set.rb +313 -0
data/lib/oga/xml/parser.rb +556 -0
data/lib/oga/xml/processing_instruction.rb +39 -0
data/lib/oga/xml/pull_parser.rb +180 -0
data/lib/oga/xml/querying.rb +32 -0
data/lib/oga/xml/text.rb +11 -0
data/lib/oga/xml/traversal.rb +48 -0
data/lib/oga/xml/xml_declaration.rb +69 -0
data/lib/oga/xpath/evaluator.rb +1748 -0
data/lib/oga/xpath/lexer.rb +2043 -0
data/lib/oga/xpath/node.rb +10 -0
data/lib/oga/xpath/parser.rb +537 -0
data/oga.gemspec +45 -0
metadata +221 -0

data/ext/java/org/liboga/xml/Lexer.rl ADDED Viewed

@@ -0,0 +1,151 @@
+package org.liboga.xml;
+%%machine java_lexer;
+import java.io.IOException;
+import org.jcodings.Encoding;
+import org.jruby.Ruby;
+import org.jruby.RubyModule;
+import org.jruby.RubyClass;
+import org.jruby.RubyObject;
+import org.jruby.RubyString;
+import org.jruby.RubyFixnum;
+import org.jruby.util.ByteList;
+import org.jruby.anno.JRubyClass;
+import org.jruby.anno.JRubyMethod;
+import org.jruby.runtime.ThreadContext;
+import org.jruby.runtime.ObjectAllocator;
+import org.jruby.runtime.builtin.IRubyObject;
+/**
+ * Lexer support class for JRuby.
+ *
+ * The Lexer class contains the raw Ragel loop and calls back in to Ruby land
+ * whenever a Ragel action is needed similar to the C extension setup.
+ *
+ * This class requires Ruby land to first define the `Oga::XML` namespace.
+ *
+ * Note that this file is Ragel input rather than plain Java: the `%%` lines
+ * are Ragel directives that are expanded when the Java source is generated.
+ */
+@JRubyClass(name="Oga::XML::Lexer", parent="Object")
+public class Lexer extends RubyObject
+{
+    /**
+     * The current Ruby runtime. It is used by the callback methods to create
+     * Ruby Strings and to obtain the current thread context.
+     */
+    private Ruby runtime;
+    %% write data;
+    /*
+     * Used by Ragel to keep track of the current state. These are instance
+     * fields and `advance_native` does not initialise them itself, so
+     * `reset_native` has to be called to put them in their starting state.
+     */
+    int act;
+    int cs;
+    /**
+     * Sets up the current class in the Ruby runtime.
+     *
+     * The class is defined as `Lexer` under the existing `Oga::XML` module
+     * with Object as its parent class, after which the annotated methods of
+     * this class are registered on it.
+     */
+    public static void load(Ruby runtime)
+    {
+        RubyModule xml = (RubyModule) runtime.getModule("Oga")
+            .getConstant("XML");
+        RubyClass lexer = xml.defineClassUnder(
+            "Lexer",
+            runtime.getObject(),
+            ALLOCATOR
+        );
+        lexer.defineAnnotatedMethods(Lexer.class);
+    }
+    private static final ObjectAllocator ALLOCATOR = new ObjectAllocator()
+    {
+        public IRubyObject allocate(Ruby runtime, RubyClass klass)
+        {
+            return new org.liboga.xml.Lexer(runtime, klass);
+        }
+    };
+    public Lexer(Ruby runtime, RubyClass klass)
+    {
+        super(runtime, klass);
+        this.runtime = runtime;
+    }
+    /**
+     * Runs the bulk of the Ragel loop and calls back in to Ruby.
+     *
+     * This method lexes the bytes of the String passed in as `rb_str`. The
+     * encoding of this String is passed along to the callbacks to make sure
+     * that token values share the same encoding as the input.
+     *
+     * The local variables declared here (`data`, `encoding`, `ts`, `te`, `p`,
+     * `mark`, `pe` and `eof`) are referred to by name from the Ragel generated
+     * code and from the actions in base_lexer.rl, so they should not be
+     * renamed.
+     *
+     * This method always returns nil.
+     */
+    @JRubyMethod
+    public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
+    {
+        Encoding encoding = rb_str.getEncoding();
+        byte[] data = rb_str.getBytes();
+        int ts   = 0;
+        int te   = 0;
+        int p    = 0;
+        int mark = 0;
+        int pe   = data.length;
+        int eof  = data.length;
+        %% write exec;
+        return context.nil;
+    }
+    /**
+     * Resets the internal state of the lexer.
+     *
+     * `act` is set to 0 and `cs` is set to `java_lexer_start`, the start
+     * state of the `java_lexer` machine. This method always returns nil.
+     */
+    @JRubyMethod
+    public IRubyObject reset_native(ThreadContext context)
+    {
+        this.act = 0;
+        this.cs  = java_lexer_start;
+        return context.nil;
+    }
+    /**
+     * Calls back in to Ruby land passing the current token value along.
+     *
+     * This method calls back in to Ruby land based on the method name
+     * specified in `name`. The Ruby callback should take one argument. This
+     * argument will be a String containing the value of the current token.
+     *
+     * The token value consists of the bytes of `data` starting at offset `ts`
+     * up to (but not including) offset `te`, tagged with the encoding `enc`.
+     * The Ruby method is called on this Lexer instance.
+     */
+    public void callback(String name, byte[] data, Encoding enc, int ts, int te)
+    {
+        ByteList bytelist = new ByteList(data, ts, te - ts, enc, true);
+        RubyString value = this.runtime.newString(bytelist);
+        ThreadContext context = this.runtime.getCurrentContext();
+        this.callMethod(context, name, value);
+    }
+    /**
+     * Calls back in to Ruby land without passing any arguments.
+     *
+     * The Ruby method specified in `name` is called on this Lexer instance.
+     */
+    public void callback_simple(String name)
+    {
+        ThreadContext context = this.runtime.getCurrentContext();
+        this.callMethod(context, name);
+    }
+}
+%%{
+    variable act this.act; # make Ragel use the `act` instance field of the Lexer class
+    variable cs this.cs; # make Ragel use the `cs` instance field of the Lexer class
+    include base_lexer "base_lexer.rl"; # the grammar shared with the C extension
+}%%

data/ext/ragel/base_lexer.rl ADDED Viewed

@@ -0,0 +1,323 @@
+%%machine base_lexer;
+%%{
+    ##
+    # Base grammar for the XML lexer.
+    #
+    # This grammar is shared between the C and Java extensions. As a result of
+    # this you should **not** include language specific code in Ragel
+    # actions/callbacks.
+    #
+    # To call back in to Ruby you can use one of the following two functions:
+    #
+    # * callback
+    # * callback_simple
+    #
+    # The first function takes 5 arguments:
+    #
+    # * The name of the Ruby method to call.
+    # * The input data.
+    # * The encoding of the input data.
+    # * The offset in the input data at which the token value starts.
+    # * The offset in the input data at which the token value ends (exclusive).
+    #
+    # The function callback_simple only takes one argument: the name of the
+    # method to call. This function should be used for callbacks that don't
+    # require any values.
+    #
+    # When you call a method in Ruby make sure that said method is defined as
+    # an instance method in the `Oga::XML::Lexer` class.
+    #
+    # ## Machine Transitions
+    #
+    # To transition from one machine to another always use `fnext` instead of
+    # `fcall` and `fret`. This removes the need for the code to keep track of a
+    # stack.
+    #
+    newline    = '\n' | '\r\n';
+    whitespace = [ \t]; # spaces and tabs only, newlines are matched by `newline`
+    ident_char = [a-zA-Z0-9\-_]; # a single character of an identifier
+    identifier = ident_char+; # used for element, attribute and namespace names
+    # Comments
+    #
+    # http://www.w3.org/TR/html-markup/syntax.html#comments
+    #
+    # Unlike the W3 specification these rules *do* allow character sequences
+    # such as `--` and `->`. Putting extra checks in for these sequences would
+    # actually make the rules/actions more complex.
+    #
+    comment_start = '<!--';
+    comment_end   = '-->';
+    comment       = comment_start (any* -- comment_end) comment_end; # body can't contain the end tag
+    action start_comment {
+        callback("on_comment", data, encoding, ts + 4, te - 3); /* excludes the start/end tags */
+    }
+    # CDATA
+    #
+    # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
+    #
+    # In HTML CDATA tags have no meaning/are not supported. Oga does
+    # support them but treats their contents as plain text.
+    #
+    cdata_start = '<![CDATA[';
+    cdata_end   = ']]>';
+    cdata       = cdata_start (any* -- cdata_end) cdata_end; # body can't contain the end tag
+    action start_cdata {
+        callback("on_cdata", data, encoding, ts + 9, te - 3); /* excludes the start/end tags */
+    }
+    # Processing Instructions
+    #
+    # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
+    # http://en.wikipedia.org/wiki/Processing_Instruction
+    #
+    # These are tags meant to be used by parsers/libraries for custom behaviour.
+    # One example is the pair of tags used by PHP: <?php and ?>. Note that the
+    # XML declaration tags (<?xml ?>) are not considered to be a processing
+    # instruction.
+    #
+    proc_ins_start = '<?' identifier;
+    proc_ins_end   = '?>';
+    action start_proc_ins {
+        callback_simple("on_proc_ins_start");
+        callback("on_proc_ins_name", data, encoding, ts + 2, te); /* excludes the start tag */
+        mark = te; /* the body text starts right after the name */
+        fnext proc_ins_body;
+    }
+    proc_ins_body := |*
+        proc_ins_end => {
+            callback("on_text", data, encoding, mark, ts); /* from the name up to the end tag */
+            callback_simple("on_proc_ins_end");
+            fnext main;
+        };
+        any; # the body is skipped here, it is emitted as a single text token at the end
+    *|;
+    # Strings
+    #
+    # Strings in HTML can either be single or double quoted. If a string
+    # starts with one of these quotes it must be closed with the same type
+    # of quote.
+    #
+    dquote = '"';
+    squote = "'";
+    string_dquote = (dquote ^dquote* dquote); # the body is anything but a double quote
+    string_squote = (squote ^squote* squote); # the body is anything but a single quote
+    string = string_dquote | string_squote;
+    action emit_string {
+        callback("on_string", data, encoding, ts + 1, te - 1); /* excludes the quotes */
+    }
+    # DOCTYPES
+    #
+    # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
+    #
+    # These rules support the 3 flavours of doctypes:
+    #
+    # 1. Normal doctypes, as introduced in the HTML5 specification.
+    # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
+    # 3. Legacy doctypes.
+    #
+    doctype_start = '<!DOCTYPE'i whitespace+; # the "i" suffix makes the match case insensitive
+    action start_doctype {
+        callback_simple("on_doctype_start");
+        fnext doctype;
+    }
+    # Machine for processing doctypes. Doctype values such as the public
+    # and system IDs are emitted using emit_string (T_STRING tokens).
+    doctype := |*
+        'PUBLIC' | 'SYSTEM' => {
+            callback("on_doctype_type", data, encoding, ts, te);
+        };
+        # Consumes everything between the [ and ]. Due to the use of :> the ]
+        # is not consumed by any+.
+        '[' any+ :> ']' => {
+            callback("on_doctype_inline", data, encoding, ts + 1, te - 1); /* excludes the brackets */
+        };
+        # Lex the public/system IDs as regular strings.
+        string => emit_string;
+        # Whitespace inside doctypes is ignored since there's no point in
+        # including it.
+        whitespace; # NOTE(review): spaces/tabs only, a newline matches no rule of this machine
+        identifier => {
+            callback("on_doctype_name", data, encoding, ts, te);
+        };
+        '>' => {
+            callback_simple("on_doctype_end");
+            fnext main;
+        };
+    *|;
+    # XML declaration tags
+    #
+    # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
+    #
+    xml_decl_start = '<?xml';
+    xml_decl_end   = '?>';
+    action start_xml_decl {
+        callback_simple("on_xml_decl_start");
+        fnext xml_decl;
+    }
+    # Machine that processes the contents of an XML declaration tag.
+    xml_decl := |*
+        xml_decl_end => {
+            callback_simple("on_xml_decl_end");
+            fnext main;
+        };
+        # Attribute names (e.g. the `version` part of version="1.0").
+        identifier => {
+            callback("on_attribute", data, encoding, ts, te);
+        };
+        string => emit_string; # attribute values
+        any; # everything else, such as whitespace and "=", is skipped
+    *|;
+    # Elements
+    #
+    # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
+    #
+    # Lexing of elements is broken up into different machines that handle the
+    # name/namespace, contents of the open tag and the body of an element. The
+    # body of an element is lexed using the `main` machine.
+    #
+    element_start = '<' ident_char; # only the first character of the name, see start_element
+    element_end   = '</' identifier (':' identifier)* '>';
+    action start_element {
+        callback_simple("on_element_start");
+        fhold; /* step back so element_name gets to lex the whole name */
+        fnext element_name;
+    }
+    action close_element {
+        callback_simple("on_element_end"); /* no value is passed, the closing name is not emitted */
+    }
+    # Machine used for lexing the name/namespace of an element.
+    element_name := |*
+        identifier ':' => {
+            callback("on_element_ns", data, encoding, ts, te - 1); /* excludes the colon */
+        };
+        identifier => {
+            callback("on_element_name", data, encoding, ts, te);
+            fnext element_head;
+        };
+    *|;
+    # Machine used for processing the rest of an element's starting tag, once
+    # element_name has lexed the name and namespace: attributes and the tag's end.
+    element_head := |*
+        whitespace | '='; # skipped, no callback is needed for these
+        newline => {
+            callback_simple("advance_line");
+        };
+        # Attribute names and namespaces.
+        identifier ':' => {
+            callback("on_attribute_ns", data, encoding, ts, te - 1); /* excludes the colon */
+        };
+        identifier => {
+            callback("on_attribute", data, encoding, ts, te);
+        };
+        # Attribute values.
+        string => emit_string;
+        # We're done with the open tag of the element.
+        '>' => {
+            callback_simple("on_element_open_end");
+            fnext main;
+        };
+        # Self closing tags.
+        '/>' => {
+            callback_simple("on_element_end");
+            fnext main;
+        };
+    *|;
+    # Text
+    #
+    # http://www.w3.org/TR/xml/#syntax
+    # http://www.w3.org/TR/html-markup/syntax.html#text-syntax
+    #
+    # Text content is everything leading up to certain special tags such as "</"
+    # and "<?".
+    action start_text {
+        fhold; /* step back so the text machine also sees this character */
+        fnext text;
+    }
+    # These characters terminate a T_TEXT sequence and instruct Ragel to jump
+    # back to the main machine.
+    #
+    # Note that this only works if each sequence is exactly 2 characters
+    # long. Because of this "<!" is used instead of "<!--".
+    terminate_text = '</' | '<!' | '<?' | element_start;
+    allowed_text   = any* -- terminate_text;
+    text := |*
+        # Text followed by a special tag, such as "foo<!--"
+        allowed_text @{ mark = p; } terminate_text => {
+            callback("on_text", data, encoding, ts, mark); /* the text ends at mark */
+            p    = mark - 1; /* rewind so that main gets to lex the special tag */
+            mark = 0;
+            fnext main;
+        };
+        # Just regular text.
+        allowed_text => {
+            callback("on_text", data, encoding, ts, te);
+            fnext main;
+        };
+    *|;
+    # The main machine aka the entry point of Ragel, other machines return here using `fnext main`.
+    main := |*
+        doctype_start  => start_doctype;
+        xml_decl_start => start_xml_decl; # also matched by proc_ins_start; presumably wins as it is listed first
+        comment        => start_comment;
+        cdata          => start_cdata;
+        proc_ins_start => start_proc_ins;
+        element_start  => start_element;
+        element_end    => close_element;
+        any            => start_text; # anything else is the start of a text node
+    *|;
+}%%