rehtml 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'strscan'
3
+ module REHTML
4
# StringScanner that can consume input up to, but not including, the next
# occurrence of a pattern.
class Scanner < StringScanner
  # Like #scan_before, but when +regex+ never matches, the rest of the input
  # is returned and the scanner is moved to the end of the string.
  #
  # regex::      pattern marking the end of the wanted text
  # move_after:: when true the scan pointer is left after the matched
  #              delimiter instead of in front of it
  #
  # Always returns a String (possibly empty).
  def scan_before_or_eos(regex, move_after=false)
    scan_before(regex, true, move_after)
  end

  # Returns the text between the current position and the next match of
  # +regex+, without the matched delimiter itself.
  #
  # regex::      pattern marking the end of the wanted text
  # or_eos::     when true and +regex+ does not match, return the rest of
  #              the input and terminate the scanner; otherwise return nil
  #              and leave the scanner untouched
  # move_after:: when true the scan pointer is left after the delimiter,
  #              otherwise it is put back in front of it
  #
  # After a successful match #matched still returns the delimiter, so callers
  # can inspect what stopped the scan.
  def scan_before(regex, or_eos=false, move_after=false)
    text = scan_until(regex)
    if text
      # #pos counts bytes, so the delimiter has to be measured in bytes too;
      # a character count would leave the pointer inside a multibyte character.
      delimiter_bytes = matched_size
      self.pos -= delimiter_bytes unless move_after
      # Cut by length rather than with a negative end index: 0...-0 would
      # wrongly produce "" for a zero-length match.
      return text.byteslice(0, text.bytesize - delimiter_bytes)
    end
    if or_eos
      text = rest
      terminate
    end
    text
  end
end
22
+ end
@@ -0,0 +1,130 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rehtml/scanner'
3
+ require 'rehtml/elements'
4
+ require 'rehtml/entities'
5
+
6
+ module REHTML
7
# Mixin that records where in the source a token was found.
#
# raw::       the exact source text the token was built from
# start_pos:: byte offset of the first byte of the token
# end_pos::   byte offset just past the token
module TokenInfo
  attr_reader :raw, :start_pos, :end_pos

  # Stores the span from +bpos+ to the scanner's current position.
  #
  # bpos::    byte offset at which the token started
  # scanner:: StringScanner positioned just after the token
  #
  # Returns the raw source text of the token.
  def set_token_info(bpos,scanner)
    @start_pos = bpos
    @end_pos = scanner.pos
    # StringScanner#pos is a byte offset, so the source must be sliced by
    # bytes; character indexing returns the wrong text after multibyte input.
    @raw = scanner.string.byteslice(@start_pos, @end_pos - @start_pos)
  end
end
15
# Splits an HTML string into a stream of token objects: Text, Tag, EndTag,
# Comment, CData, Instruction and DocType. Every token returned by #next is
# extended with TokenInfo, so it also carries its source span.
#
# The tokenizer never raises on malformed markup; unterminated constructs
# simply run to the end of the input.
class Tokenizer
  # Create a new Tokenizer for the given text.
  def initialize(html)
    @scanner = Scanner.new(html)
    # Scanner position at which the token currently being read started.
    @bpos = 0
  end

  # Return the next token in the sequence, or +nil+ if there are no more tokens in
  # the stream.
  def next
    return nil if @scanner.eos?
    # "<" directly followed by a non-space character opens markup;
    # everything else is treated as text.
    add_parse_info(@scanner.check(/<\S/) ? scan_element : scan_text)
  end

  private

  # Attaches the source span of the token that was just scanned to +node+
  # and remembers where the next token starts. Returns +node+.
  def add_parse_info(node)
    node.extend(TokenInfo)
    node.set_token_info(@bpos,@scanner)
    @bpos = @scanner.pos
    node
  end

  # Reads one character unconditionally (it may be a "<" that does not open
  # markup) plus everything up to the next "<", and decodes entities in it.
  def scan_text
    Text.new(decode("#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"))
  end

  # decode html entity
  #
  # Named references are looked up in ENTITIES::MAP; numeric references are
  # converted to the corresponding UTF-8 character. References that cannot
  # be resolved are left in the text unchanged.
  def decode(html)
    html.gsub(ENTITIES::REGEXP){
      reference, name, decimal, hex = $&, $1, $2, $3
      if name
        ENTITIES::MAP[name] || reference
      elsif decimal
        codepoint_to_text(decimal.to_i(10), reference)
      elsif hex
        codepoint_to_text(hex.to_i(16), reference)
      else
        reference
      end
    }
  end

  # Returns the UTF-8 character for +codepoint+, or +fallback+ when the
  # number is not a valid Unicode scalar value (surrogate or out of range).
  # Without this guard such references would raise or yield invalid UTF-8.
  def codepoint_to_text(codepoint, fallback)
    codepoint.chr(Encoding::UTF_8)
  rescue RangeError
    fallback
  end

  # Dispatches on the markup opener at the current position.
  def scan_element
    if @scanner.scan(/<!--/) # comment
      comment = @scanner.scan_before_or_eos(/-->/,true)
      Comment.new(comment)
    elsif @scanner.scan(/<!\[CDATA\[/)
      CData.new(@scanner.scan_before_or_eos(/\]\]>/,true))
    elsif @scanner.scan(/<!DOCTYPE[\x20\x09\x0A\x0C\x0D]+/i)
      scan_doctype
    elsif @scanner.scan(/<!/) # any other "<!" declaration is treated as a comment
      comment = @scanner.scan_before_or_eos(/>/,true)
      Comment.new(comment)
    elsif @scanner.scan(/<\?/) # PI or xml decl
      scan_pi
    else
      scan_tag
    end
  end

  # Scans a start or end tag with its attributes. Tag and attribute names
  # are lower-cased and attribute values are entity-decoded.
  def scan_tag
    # A "/" that is directly followed by ">" closes the tag and is not part
    # of the name, so "<br/>" is named "br" just like "<br />".
    @scanner.scan(/<(\/)?((?:[^\x20\x09\x0A\x0C\x0D>\/]|\/(?!>))*)/)
    is_end = @scanner[1] ? true : false
    name = @scanner[2]
    attrs = {}
    loop do
      @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/)
      attr = @scanner.scan_before_or_eos(/[=>\x20\x09\x0A\x0C\x0D]|\/>/)
      # The delimiter that ended the name; nil means end of input.
      matched = @scanner.matched
      if matched == '>' || matched.nil? || matched == '/>'
        attrs[attr.downcase]="" unless attr.empty?
        break
      end
      @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/)
      if @scanner.scan(/=/)
        @scanner.skip(/[\x20\x09\x0A\x0C\x0D]/)
        if @scanner.scan(/['"]/)
          # Quoted value: runs to the same quote character (or end of input).
          m = Regexp.compile(Regexp.quote(@scanner.matched))
          value = @scanner.scan_before_or_eos(m, true)
        else
          value = @scanner.scan_before_or_eos(/[>\x20\x09\x0A\x0C\x0D]|\/>/)
        end
      else
        value = ""
      end
      # An empty name happens for stray "=" or repeated whitespace; skip it.
      attrs[attr.downcase]=decode(value) unless attr.empty?
    end
    empty = !@scanner.scan(/\//).nil?
    @scanner.skip(/>/)
    if is_end
      EndTag.new(name.downcase,attrs,empty)
    else
      Tag.new(name.downcase,attrs,empty)
    end
  end

  # Scans a processing instruction; the leading "<?" is already consumed.
  # The target may be empty when the text after "<?" is not an XML name.
  def scan_pi
    # http://www.w3.org/TR/REC-xml/#NT-Name
    name = @scanner.scan(/([-:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD0-9\u00B7\u0300-\u036F\u203F-\u2040]+)/) || ""
    body = @scanner.scan_before_or_eos(/\?>/,true)
    Instruction.new(name,body)
  end

  # Skips the rest of a doctype declaration; its content is discarded.
  def scan_doctype
    # TODO complex doctype
    # https://github.com/ruby/ruby/blob/master/lib/rexml/parsers/baseparser.rb#L258
    # source = REXML::Source.new(doctype)
    # parser = REXML::Parsers::BaseParser.new(source)
    # while parser.document_status == in_doctype
    #   parser.pull_event
    @scanner.scan_before_or_eos(/>/,true)
    DocType.new
  end
end
130
+ end
@@ -0,0 +1,3 @@
1
module REHTML
  # Version of the rehtml gem. Frozen so the shared constant cannot be
  # modified in place by callers.
  VERSION = "0.0.1".freeze
end
data/rehtml.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'rehtml/version'
# The gem description is taken from README.md: everything in front of the
# "Pure Ruby" line is dropped, and so is everything from the first "##"
# heading onwards. File.read closes the file handle, unlike a bare open.
description = File.read(File.dirname(__FILE__)+"/README.md").gsub(/^.*\n(Pure Ruby)/m,'\1').gsub(/\n##.*/m,"")

Gem::Specification.new do |spec|
  spec.name          = "rehtml"
  spec.version       = REHTML::VERSION
  spec.authors       = ["nazoking"]
  spec.email         = ["nazoking@gmail.com"]
  # The summary is the first line of the description.
  spec.summary       = description.split(/\n/)[0].strip
  spec.description   = description
  spec.homepage      = "https://github.com/nazoking/rehtml"
  spec.license       = "MIT"

  # Package every file tracked by git.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
end
@@ -0,0 +1,47 @@
1
+ require 'spec_helper'
2
+ require 'rehtml'
3
+
4
# Spec subject that wraps an HTML source string and exposes the result of
# REHTML.to_rexml for it.
class ReHTML
  def initialize(str)
    @str = str
  end

  # Label used when the subject is printed.
  def to_s
    'parse [' + @str.to_s + ']'
  end

  # The object returned by REHTML.to_rexml for the wrapped source.
  def doc
    REHTML.to_rexml(@str)
  end

  # String form of the converted document.
  def to_rexml
    doc.to_s
  end
end
18
# Wraps +str+ in a ReHTML spec subject so it can be handed to +describe+.
def parse(str)
  ReHTML.new(str)
end
21
+
22
# Several top-level nodes with leading whitespace: an <html> root is added
# after the leading whitespace.
describe parse(%[ <title>html</title> <a>a</a>]) do
  its(:to_rexml) { should eq(%[ <html><title>html</title> <a>a</a></html>]) }
end

# A single element is returned unchanged.
describe parse(%[<a>html</a>]) do
  its(:to_rexml) { should eq(%[<a>html</a>]) }
end

# Several top-level elements are wrapped in <html>; no XML declaration is written.
describe parse(%[<title>html</title><a>a</a>]) do
  its(:to_rexml) { should eq(%[<html><title>html</title><a>a</a></html>]) }
  its("doc.xml_decl.writethis") { should be_false }
end

# An XML declaration in the source is kept and written first.
describe parse(%[ <?xml version="1.0" ?><html><a>a</a>]) do
  its(:to_rexml) { should eq(%[<?xml version='1.0'?> <html><a>a</a></html>]) }
  its("doc.xml_decl.writethis") { should be_true }
end

# A self-closing tag stays empty and the unclosed <html> is closed.
describe parse(%[<html><a />]) do
  its(:to_rexml) { should eq(%[<html><a/></html>]) }
end
39
+ =begin
40
+ describe %[index.jsp] do
41
+ it{
42
+ doc = REHTML.to_rexml(open(File.join(File.dirname(__FILE__),'files','login.jsp')).read)
43
+ formatter = REXML::Formatters::Pretty.new
44
+ formatter.write(doc.root, $stdout)
45
+ }
46
+ end
47
+ =end
@@ -0,0 +1,21 @@
1
+ require 'spec_helper'
2
+ require 'rehtml/scanner'
3
describe REHTML::Scanner do
  describe "scan aabcd" do
    let(:scanner) { REHTML::Scanner.new("aabcd") }

    # The delimiter itself is left in the input.
    it "scan_before" do
      expect(scanner.scan_before(/b/)).to eq("aa")
      expect(scanner.check(/b/)).to eq("b")
      expect(scanner.scan(/b/)).to eq("b")
      expect(scanner.scan(/b/)).to eq(nil)
    end

    # Without a match the whole rest is consumed.
    it "scan_before_or_eos" do
      expect(scanner.scan_before_or_eos(/z/)).to eq("aabcd")
      expect(scanner.eos?).to eq(true)
    end

    # With move_after the delimiter is consumed as well.
    it "scan_before_or_eos move_after" do
      expect(scanner.scan_before_or_eos(/b/,true)).to eq("aa")
      expect(scanner.rest).to eq("cd")
    end
  end
end
@@ -0,0 +1,122 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'spec_helper'
3
+ require 'rehtml/tokenizer'
4
+
5
# Spec subject that tokenizes a source string on demand.
#
# Besides its own helpers it answers +tokenN+ (the N-th token, 1-based) and
# forwards every other message to the first token, so examples can write
# <tt>its(:name)</tt> or <tt>its("token2.raw")</tt>.
class TokenizeHelper
  # Called either as new(source) or as new(message, source); the optional
  # message only shows up in #to_s.
  def initialize(msg,str=nil)
    @msg = str.nil? ? "" : " #{msg}"
    @str = str.nil? ? msg : str
  end

  def to_s; "tokenize#{@msg} {#{@str}}"; end

  # First token of the source, or nil when there is none.
  def first_token; REHTML::Tokenizer.new(@str).next; end

  # Number of tokens in the source.
  def token_size
    t = REHTML::Tokenizer.new(@str)
    i = 0
    i += 1 until t.next.nil?
    i
  end

  # Returns the token after skipping +num+ tokens (nil when the stream ends
  # exactly there). Raises RuntimeError when fewer than +num+ tokens exist.
  def token(num)
    t = REHTML::Tokenizer.new(@str)
    num.times{|ii|
      token = t.next
      raise "token size is #{ii}" if token.nil?
    }
    t.next
  end

  def method_missing(name, *args, &block)
    if (m = /\Atoken(\d+)\z/.match(name.to_s))
      token(m[1].to_i-1)
    else
      first_token.send(name, *args, &block)
    end
  end

  # Keeps respond_to? in line with what method_missing actually handles.
  def respond_to_missing?(name, include_private=false)
    return true if name.to_s =~ /\Atoken\d+\z/
    first_token.respond_to?(name, include_private) || super
  end
end
34
# Builds the spec subject for a source string; see TokenizeHelper#initialize
# for the meaning of the two calling forms.
def tokenize(msg,str=nil)
  TokenizeHelper.new(msg,str)
end
35
+
36
# Start tag with double-quoted, single-quoted, unquoted and valueless attributes.
describe tokenize(%[<a name="be evil" type='checkbox' value=yes disabled>]) do
  its("first_token.raw") { should eq(%[<a name="be evil" type='checkbox' value=yes disabled>]) }
  its("first_token") { should be_a(REHTML::Tag) }
  its(:name) { should eq("a") }
  its(:attributes) { should eq({
    "type"=>"checkbox",
    "name"=>"be evil",
    "value"=>"yes",
    "disabled"=>""}) }
  its(:token_size) { should eq(1) }
end

# XML declaration.
describe tokenize(%[<?xml version="1.0"?>]) do
  its(:first_token) { should be_a(REHTML::Instruction) }
  its(:first_token) { should be_is_xml_decl }
  its(:token_size) { should eq(1) }
end

# Processing instruction with a target.
describe tokenize(%[<?php hoge?>]) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::Instruction) }
  its(:target) { should eq("php") }
  its(:content) { should eq(" hoge") }
  it { should_not be_is_xml_decl }
end

# Processing instruction without a target.
describe tokenize(%[<? huga?>]) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::Instruction) }
  its(:target) { should eq("") }
  its(:content) { should eq(" huga") }
  it { should_not be_is_xml_decl }
end

# Comment.
describe tokenize(%{<!-- comment -->}) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::Comment) }
  its("first_token.string") { should eq(" comment ") }
end

# Text: known and numeric references are decoded, the others are kept.
describe tokenize(%{abc &a; &amp; &amp &#x2212; &#39; }) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::Text) }
  its(:value) { should eq(%[abc &a; & &amp − ' ]) }
end

# CDATA section.
describe tokenize(%{<![CDATA[ cdata ]]>}) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::CData) }
  its(:value) { should eq(" cdata ") }
end

describe tokenize("unclosed comment",%[<!-- comment]) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::Comment) }
  its(:string) { should eq(" comment") }
end

describe tokenize("unclosed tag",%{<A }) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::Tag) }
  its(:name) { should eq("a") }
  its(:attributes) { should be_empty }
end

# Malformed attribute list: stray "=", a quoted ">" and an unterminated quote.
describe tokenize(%{<A =A=B ATTR x=">" A =A=B hoge = ' huge}) do
  its(:first_token) { should be_a(REHTML::Tag) }
  its(:name) { should eq("a") }
  its(:attributes) { should eq({"attr"=>"", "hoge"=>" huge", "a"=>"A=B","x"=>">"}) }
end

# Doctype declaration.
describe tokenize(%{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">}) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::DocType) }
  its("token1.raw") { should eq(%{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">}) }
end

# An unknown "<!" declaration becomes a comment.
describe tokenize(%{<!BAD "//www.w3.org/TR/html4/loose.dtd">}) do
  its(:token_size) { should eq(1) }
  its(:first_token) { should be_a(REHTML::Comment) }
  its(:string) { should eq('BAD "//www.w3.org/TR/html4/loose.dtd"') }
end

# Mixed text and tags, token by token.
describe tokenize(%[a<b>c</b>d]) do
  its("token1.raw") { should eq("a") }
  its("token1") { should be_a(REHTML::Text) }
  its("token1.value") { should eq("a") }
  its("token2.raw") { should eq("<b>") }
  its("token2") { should be_a(REHTML::Tag) }
  its("token2.name") { should eq("b") }
  its("token2.attributes") { should be_empty }
  its("token3") { should be_a(REHTML::Text) }
  its("token3.value") { should eq("c") }
  its("token4") { should be_a(REHTML::EndTag) }
  its("token4.name") { should eq("b") }
  its("token5") { should be_a(REHTML::Text) }
  its("token5.value") { should eq("d") }
  its("token6") { should be_nil }
end
@@ -0,0 +1,12 @@
1
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2
+ require 'rubygems'
3
+ require 'rehtml'
4
# Coverage reporting is optional: when the coveralls gem cannot be loaded
# the specs simply run without it.
begin
  require 'coveralls'
  Coveralls.wear!
rescue LoadError
  # Deliberately ignored; see the note above.
end
9
+
10
+ require 'rspec/expectations'
11
+
12
+
metadata ADDED
@@ -0,0 +1,127 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rehtml
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - nazoking
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-05-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.5'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.5'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: ! 'Pure Ruby html parser.
63
+
64
+
65
+ This library parse html and build rexml document.
66
+
67
+
68
+ Nokogiri is very convenient, but its installation is complex because it needs
69
+ to build a native library, so it is not suitable for chef.
70
+
71
+ '
72
+ email:
73
+ - nazoking@gmail.com
74
+ executables: []
75
+ extensions: []
76
+ extra_rdoc_files: []
77
+ files:
78
+ - .gitignore
79
+ - .rspec
80
+ - .travis.yml
81
+ - Gemfile
82
+ - LICENSE.txt
83
+ - README.md
84
+ - Rakefile
85
+ - gen_entities.rb
86
+ - lib/rehtml.rb
87
+ - lib/rehtml/builder.rb
88
+ - lib/rehtml/elements.rb
89
+ - lib/rehtml/entities.rb
90
+ - lib/rehtml/scanner.rb
91
+ - lib/rehtml/tokenizer.rb
92
+ - lib/rehtml/version.rb
93
+ - rehtml.gemspec
94
+ - spec/rehtml_parser_spec.rb
95
+ - spec/rehtml_scanner_spec.rb
96
+ - spec/rehtml_tokenizer_spec.rb
97
+ - spec/spec_helper.rb
98
+ homepage: https://github.com/nazoking/rehtml
99
+ licenses:
100
+ - MIT
101
+ post_install_message:
102
+ rdoc_options: []
103
+ require_paths:
104
+ - lib
105
+ required_ruby_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ none: false
113
+ requirements:
114
+ - - ! '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 1.8.24
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: Pure Ruby html parser.
123
+ test_files:
124
+ - spec/rehtml_parser_spec.rb
125
+ - spec/rehtml_scanner_spec.rb
126
+ - spec/rehtml_tokenizer_spec.rb
127
+ - spec/spec_helper.rb