RubyGems - rehtml - Versions diffs - 0.0.1 - Mend

rehtml 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

data/.gitignore +18 -0
data/.rspec +2 -0
data/.travis.yml +8 -0
data/Gemfile +8 -0
data/LICENSE.txt +22 -0
data/README.md +40 -0
data/Rakefile +7 -0
data/gen_entities.rb +40 -0
data/lib/rehtml.rb +12 -0
data/lib/rehtml/builder.rb +96 -0
data/lib/rehtml/elements.rb +45 -0
data/lib/rehtml/entities.rb +2133 -0
data/lib/rehtml/scanner.rb +22 -0
data/lib/rehtml/tokenizer.rb +130 -0
data/lib/rehtml/version.rb +3 -0
data/rehtml.gemspec +25 -0
data/spec/rehtml_parser_spec.rb +47 -0
data/spec/rehtml_scanner_spec.rb +21 -0
data/spec/rehtml_tokenizer_spec.rb +122 -0
data/spec/spec_helper.rb +12 -0
metadata +127 -0

data/lib/rehtml/scanner.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+require 'strscan'
+module REHTML
+  class Scanner < StringScanner
+    def scan_before_or_eos(regex, move_after=false)
+      self.scan_before(regex, true, move_after)
+    end
+    def scan_before(regex, or_eos=false, move_after=false)
+      text = self.scan_until(regex)
+      if text
+        size = self.matched.size
+        self.pos -= size unless move_after
+        return text[0...(-size)]
+      end
+      if or_eos
+        text = self.rest
+        self.terminate
+      end
+      text
+    end
+  end
+end

data/lib/rehtml/tokenizer.rb ADDED Viewed

@@ -0,0 +1,130 @@
+# -*- encoding: utf-8 -*-
+require 'rehtml/scanner'
+require 'rehtml/elements'
+require 'rehtml/entities'
+module REHTML
+  module TokenInfo
+    attr_reader :raw, :start_pos, :end_pos
+    def set_token_info(bpos,scanner)
+      @start_pos=bpos
+      @end_pos= scanner.pos
+      @raw = scanner.string[@start_pos...(@end_pos)]
+    end
+  end
+  class Tokenizer
+    # Create a new Tokenizer for the given text.
+    def initialize(html)
+      @scanner = Scanner.new(html)
+      @bpos = 0
+    end
+    # Return the next token in the sequence, or +nil+ if there are no more tokens in
+    # the stream.
+    def next
+      return nil if @scanner.eos?
+      add_parse_info(@scanner.check(/<\S/) ? scan_element : scan_text)
+    end
+    private
+    def add_parse_info(node)
+      node.extend(TokenInfo)
+      node.set_token_info(@bpos,@scanner)
+      @bpos = @scanner.pos
+      node
+    end
+    def scan_text
+      Text.new(decode("#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"))
+    end
+    # decode html entity
+    def decode(html)
+      html.gsub(ENTITIES::REGEXP){
+        if $1
+          if ENTITIES::MAP[$1]
+            ENTITIES::MAP[$1]
+          else
+            $&
+          end
+        elsif $2
+          [$2.to_i(10)].pack('U')
+        elsif $3
+          [$3.to_i(16)].pack('U')
+        else
+          $&
+        end
+      }
+    end
+    def scan_element
+      if @scanner.scan(/<!--/) # comment
+        comment = @scanner.scan_before_or_eos(/-->/,true)
+        Comment.new(comment)
+      elsif @scanner.scan(/<!\[CDATA\[/)
+        CData.new(@scanner.scan_before_or_eos(/\]\]>/,true))
+      elsif @scanner.scan(/<!DOCTYPE[\x20\x09\x0A\x0C\x0D]+/i)
+        scan_doctype
+      elsif @scanner.scan(/<!/) # comment
+        comment = @scanner.scan_before_or_eos(/>/,true)
+        Comment.new(comment)
+      elsif @scanner.scan(/<\?/) # PI or xml decl
+        scan_pi
+      else
+        scan_tag
+      end
+    end
+    def scan_tag
+      @scanner.scan(/<(\/)?([^\x20\x09\x0A\x0C\x0D>]*)/)
+      is_end = @scanner[1] ? true : false
+      name = @scanner[2]
+      attrs = {}
+      loop do
+        @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/)
+        attr = @scanner.scan_before_or_eos(/[=>\x20\x09\x0A\x0C\x0D]|\/>/)
+        matched = @scanner.matched
+        if matched == '>' || matched.nil? || matched == '/>'
+          attrs[attr.downcase]="" unless attr.empty?
+          break
+        end
+        @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/)
+        if @scanner.scan(/=/)
+          @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/)
+          if @scanner.scan(/['"]/)
+            m = Regexp.compile(Regexp.quote(@scanner.matched))
+            value = @scanner.scan_before_or_eos(m, true)
+          else
+            value = @scanner.scan_before_or_eos(/[>\x20\x09\x0A\x0C\x0D]|\/>/)
+          end
+        else
+          value = ""
+        end
+        attrs[attr.downcase]=decode(value) unless attr.empty?
+      end
+      empty = !@scanner.scan(/\//).nil?
+      @scanner.skip(/>/)
+      if is_end
+        EndTag.new(name.downcase,attrs,empty)
+      else
+        Tag.new(name.downcase,attrs,empty)
+      end
+    end
+    def scan_pi
+      # http://www.w3.org/TR/REC-xml/#NT-Name
+      name = @scanner.scan(/([-:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD0-9\u00B7\u0300-\u036F\u203F-\u2040]+)/) || ""
+      body = @scanner.scan_before_or_eos(/\?>/,true)
+      Instruction.new(name,body)
+    end
+    def scan_doctype
+      # TODO complex doctype
+      # https://github.com/ruby/ruby/blob/master/lib/rexml/parsers/baseparser.rb#L258
+      # source = REXML::Source.new(doctype)
+      # parser = REXML::Parsers::BaseParser.new(soucre)
+      # while parser.document_status == in_doctype
+      #   parser.pull_event
+      doctype = @scanner.scan_before_or_eos(/>/,true)
+      DocType.new
+    end
+  end
+end

data/lib/rehtml/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module REHTML
+  # Version of the gem; rehtml.gemspec reads it as REHTML::VERSION.
+  VERSION = "0.0.1"
+end

data/rehtml.gemspec ADDED Viewed

@@ -0,0 +1,25 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'rehtml/version'
+description = open(File.dirname(__FILE__)+"/README.md").read.gsub(/^.*\n(Pure Ruby)/m,'\1').gsub(/\n##.*/m,"")
+Gem::Specification.new do |spec|
+  spec.name          = "rehtml"
+  spec.version       = REHTML::VERSION
+  spec.authors       = ["nazoking"]
+  spec.email         = ["nazoking@gmail.com"]
+  spec.summary       = description.split(/\n/)[0].strip
+  spec.description   = description
+  spec.homepage      = "https://github.com/nazoking/rehtml"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.5"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "rspec"
+end

data/spec/rehtml_parser_spec.rb ADDED Viewed

@@ -0,0 +1,47 @@
+require 'spec_helper'
+require 'rehtml'
+class ReHTML
+  def initialize(str)
+    @str = str
+  end
+  def to_s
+    "parse [#{@str}]"
+  end
+  def to_rexml
+    REHTML.to_rexml(@str).to_s
+  end
+  def doc
+    REHTML.to_rexml(@str)
+  end
+end
+# Wrap +str+ in a ReHTML subject for use with describe/its.
+def parse(str)
+  ReHTML.new(str)
+end
+# Several top-level elements: the expected output wraps them in <html>.
+describe parse(%[ <title>html</title> <a>a</a>]) do
+  its(:to_rexml){ should eq(%[ <html><title>html</title> <a>a</a></html>]) }
+end
+# A single top-level element is expected to come back unchanged.
+describe parse(%[<a>html</a>]) do
+  its(:to_rexml){ should eq(%[<a>html</a>]) }
+end
+# No XML declaration in the input: none is expected to be written.
+describe parse(%[<title>html</title><a>a</a>]) do
+  its(:to_rexml){ should eq(%[<html><title>html</title><a>a</a></html>]) }
+  its("doc.xml_decl.writethis"){ should be_false }
+end
+# An XML declaration preceded by whitespace is expected first in the output,
+# and the unclosed <html> is expected to be closed.
+describe parse(%[  <?xml version="1.0" ?><html><a>a</a>]) do
+  its(:to_rexml){ should eq(%[<?xml version='1.0'?>  <html><a>a</a></html>]) }
+  its("doc.xml_decl.writethis"){ should be_true }
+end
+# An empty-element tag inside an unclosed <html>.
+describe parse(%[<html><a />]) do
+  its(:to_rexml){ should eq(%[<html><a/></html>]) }
+end
+=begin
+  describe %[index.jsp] do
+    it{
+       doc = REHTML.to_rexml(open(File.join(File.dirname(__FILE__),'files','login.jsp')).read)
+       formatter = REXML::Formatters::Pretty.new
+       formatter.write(doc.root, $stdout)
+    }
+  end
+=end

data/spec/rehtml_scanner_spec.rb ADDED Viewed

@@ -0,0 +1,21 @@
+require 'spec_helper'
+require 'rehtml/scanner'
+# Examples for the delimiter helpers REHTML::Scanner adds to StringScanner.
+describe REHTML::Scanner do
+  describe "scan aabcd" do
+    let(:scanner){ REHTML::Scanner.new("aabcd") }
+    # scan_before returns the text before the delimiter and leaves the
+    # delimiter itself unread.
+    it "scan_before" do
+      expect(scanner.scan_before(/b/)).to eq("aa")
+      expect(scanner.check(/b/)).to eq("b")
+      expect(scanner.scan(/b/)).to eq("b")
+      expect(scanner.scan(/b/)).to eq(nil)
+    end
+    # With no match, the whole remaining input is returned and the scanner
+    # ends up at the end of the string.
+    it "scan_before_or_eos" do
+      expect(scanner.scan_before_or_eos(/z/)).to eq("aabcd")
+      expect(scanner.eos?).to eq(true)
+    end
+    # move_after=true also consumes the delimiter.
+    it "scan_before_or_eos move_after" do
+      expect(scanner.scan_before_or_eos(/b/,true)).to eq("aa")
+      expect(scanner.rest).to eq("cd")
+    end
+  end
+end

data/spec/rehtml_tokenizer_spec.rb ADDED Viewed

@@ -0,0 +1,122 @@
+# -*- encoding: utf-8 -*-
+require 'spec_helper'
+require 'rehtml/tokenizer'
+class TokenizeHelper
+  def initialize(msg,str=nil);
+    @msg = str.nil? ? "" : " #{msg}"
+    @str = str.nil? ? msg : str
+  end
+  def to_s; "tokenize#{@msg} {#{@str}}"; end
+  def first_token; REHTML::Tokenizer.new(@str).next; end
+  def token_size
+    t = REHTML::Tokenizer.new(@str)
+    i = 0
+    i += 1 until t.next.nil?
+    i
+  end
+  def token(num)
+    t = REHTML::Tokenizer.new(@str)
+    num.times{|ii|
+      token = t.next
+      raise "token size is #{ii}" if token.nil?
+    }
+    t.next
+  end
+  def method_missing(name, *args)
+    if name.to_s =~ /^token(\d+)$/
+      token($1.to_i-1)
+    else
+      first_token.send(name, *args)
+    end
+  end
+end
+# Build a TokenizeHelper subject: tokenize(html) or tokenize(label, html).
+def tokenize(msg,str=nil); TokenizeHelper.new(msg,str); end
+# Start tag with double-quoted, single-quoted, unquoted and value-less attributes.
+describe tokenize(%[<a name="be evil" type='checkbox' value=yes disabled>]) do
+  its("first_token.raw"){ should eq(%[<a name="be evil" type='checkbox' value=yes disabled>]) }
+  its("first_token"){ should be_a(REHTML::Tag) }
+  its(:name){ should eq("a") }
+  its(:attributes){ should eq({
+    "type"=>"checkbox",
+    "name"=>"be evil",
+    "value"=>"yes",
+    "disabled"=>""}) }
+  its(:token_size){ should eq(1) }
+end
+# XML declaration and processing instructions, with and without a target name.
+describe tokenize(%[<?xml version="1.0"?>]) do
+  its(:first_token){ should be_a(REHTML::Instruction) }
+  its(:first_token){ should be_is_xml_decl }
+  its(:token_size){ should eq(1) }
+end
+describe tokenize(%[<?php hoge?>]) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::Instruction) }
+  its(:target){ should eq("php") }
+  its(:content){ should eq(" hoge") }
+  it{ should_not be_is_xml_decl }
+end
+describe tokenize(%[<? huga?>]) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::Instruction) }
+  its(:target){ should eq("") }
+  its(:content){ should eq(" huga") }
+  it{ should_not be_is_xml_decl }
+end
+describe tokenize(%{<!-- comment -->}) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::Comment) }
+  its("first_token.string"){ should eq(" comment ") }
+end
+# Character references: the unknown name (&a;) and the reference without a
+# semicolon (&amp) are expected to stay as written.
+describe tokenize(%{abc &a; &amp; &amp &#x2212; &#39; }) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::Text) }
+  its(:value){ should eq(%[abc &a; & &amp − ' ]) }
+end
+describe tokenize(%{<![CDATA[ cdata ]]>}) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::CData) }
+  its(:value){ should eq(" cdata ") }
+end
+# Input that ends in the middle of a construct still yields a single token.
+describe tokenize("unclosed comment",%[<!-- comment]) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::Comment) }
+  its(:string){ should eq(" comment") }
+end
+describe tokenize("unclosed tag",%{<A }) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::Tag) }
+  its(:name){ should eq("a") }
+  its(:attributes){ should be_empty }
+end
+# Malformed attributes: stray "=", spaces around "=", ">" inside a quoted
+# value and an unclosed quote.
+describe tokenize(%{<A =A=B ATTR x=">" A =A=B hoge = ' huge}) do
+  its(:first_token){ should be_a(REHTML::Tag) }
+  its(:name){ should eq("a") }
+  its(:attributes){ should eq({"attr"=>"", "hoge"=>" huge", "a"=>"A=B","x"=>">"}) }
+end
+describe tokenize(%{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">}) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::DocType) }
+  its("token1.raw"){ should eq(%{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">}) }
+end
+# A "<!" declaration that is not a doctype is expected to become a comment.
+describe tokenize(%{<!BAD "//www.w3.org/TR/html4/loose.dtd">}) do
+  its(:token_size){ should eq(1) }
+  its(:first_token){ should be_a(REHTML::Comment) }
+  its(:string){ should eq('BAD "//www.w3.org/TR/html4/loose.dtd"') }
+end
+# Text and tags mixed: token order, token types and the end of the stream.
+describe tokenize(%[a<b>c</b>d]) do
+  its("token1.raw"){ should eq("a") }
+  its("token1"){ should be_a(REHTML::Text) }
+  its("token1.value"){ should eq("a") }
+  its("token2.raw"){ should eq("<b>") }
+  its("token2"){ should be_a(REHTML::Tag) }
+  its("token2.name"){ should eq("b") }
+  its("token2.attributes"){ should be_empty }
+  its("token3"){ should be_a(REHTML::Text) }
+  its("token3.value"){ should eq("c") }
+  its("token4"){ should be_a(REHTML::EndTag) }
+  its("token4.name"){ should eq("b") }
+  its("token5"){ should be_a(REHTML::Text) }
+  its("token5.value"){ should eq("d") }
+  its("token6"){ should be_nil }
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,12 @@
+# Put the gem's lib directory at the front of the load path.
+$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+require 'rubygems'
+require 'rehtml'
+# Coveralls is optional: when the library cannot be loaded the LoadError is
+# ignored and the specs run without it.
+begin
+  require 'coveralls'
+  Coveralls.wear!
+rescue LoadError
+end
+require 'rspec/expectations'

metadata ADDED Viewed

@@ -0,0 +1,127 @@
+--- !ruby/object:Gem::Specification
+name: rehtml
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- nazoking
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-05-26 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.5'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.5'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: ! 'Pure Ruby html parser.
+  This library parse html and build rexml document.
+  Nokogiri is very convenient, but the installation is complex because it do I need
+  to build a native library, it is not suitable for chef.
+'
+email:
+- nazoking@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .rspec
+- .travis.yml
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- gen_entities.rb
+- lib/rehtml.rb
+- lib/rehtml/builder.rb
+- lib/rehtml/elements.rb
+- lib/rehtml/entities.rb
+- lib/rehtml/scanner.rb
+- lib/rehtml/tokenizer.rb
+- lib/rehtml/version.rb
+- rehtml.gemspec
+- spec/rehtml_parser_spec.rb
+- spec/rehtml_scanner_spec.rb
+- spec/rehtml_tokenizer_spec.rb
+- spec/spec_helper.rb
+homepage: https://github.com/nazoking/rehtml
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Pure Ruby html parser.
+test_files:
+- spec/rehtml_parser_spec.rb
+- spec/rehtml_scanner_spec.rb
+- spec/rehtml_tokenizer_spec.rb
+- spec/spec_helper.rb