html_cruncher 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6b52f018b73d713860ade94ef48d23806ec5e053
4
+ data.tar.gz: 5d1ddd2d216635668dfed1c2adf19c20cd5c6770
5
+ SHA512:
6
+ metadata.gz: dea287e7ab8d8e911e90adcabf58148863c72dd9469f206e8d17e9f1f2d4e62b1c5e90df8e0ccab17473f5ea9c53f31a5836ea3f6b2798a5a00999be68ca9651
7
+ data.tar.gz: 3bc59bce9a64600f1f7c32331d61441a6d6c046033101bcabdb1e2756bdeada4cd07717273d8116b5ea0cb4146388a382ea64c54440ac27d7d545289dd2ff974
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in html_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Mathias Biilmann Christensen
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # HtmlCruncher
2
+
3
+ Crunches through HTML
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'html_cruncher'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install html_cruncher
20
+
21
+ ## Usage
22
+
23
+ See the tests for usage.
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/html_parser/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
require "bundler/gem_tasks"
require "rake/extensiontask"
require "rake/testtask"

# Unit tests: every test/*_test.rb, with test/ added to the load path.
Rake::TestTask.new do |t|
  t.libs << 'test'
  t.pattern = "test/*_test.rb"
end

# Native extension: build html_parser into lib/html_cruncher, the directory
# lib/html_cruncher.rb requires it from ("html_cruncher/html_parser").
Rake::ExtensionTask.new "html_parser" do |ext|
  ext.lib_dir = "lib/html_cruncher"
end

# The tests load the compiled extension, so make sure it is built first.
task :test => :compile
@@ -0,0 +1,9 @@
1
require 'mkmf'

# Name of the native extension (matches the Init_html_parser entry point).
extension_name = 'html_parser'

# The extension calls into RE2 throughout, so stop with a clear message when
# the library cannot be linked instead of generating a Makefile anyway.
abort "html_parser: the re2 library is required but was not found" unless have_library("re2")

# Probe for the header the extension actually includes (<re2/re2.h>).
# NOTE(review): the result is intentionally not fatal; confirm how mkmf
# probes C++ headers on the platforms you build for.
have_header("re2/re2.h")

dir_config(extension_name)      # The destination
create_makefile(extension_name) # Create Makefile
@@ -0,0 +1,199 @@
1
+ #include "ruby.h"
2
+ #include <re2/re2.h>
3
+ #include <iostream>
4
+ #include <string>
5
+ #include <new>
6
+
7
+ using std::nothrow;
8
+ using re2::StringPiece;
9
+
10
+ static VALUE HtmlParser = Qnil;
11
+ static RE2 *WHITESPACE_RE;
12
+ static RE2 *START_TAG_RE;
13
+ static RE2 *ATTRIBUTES_RE;
14
+ static RE2 *END_TAG_RE;
15
+ static StringPiece COMMENT_TAG;
16
+ static StringPiece COMMENT_TAG_END;
17
+ static StringPiece SELF_CLOSER;
18
+
19
+ static void parse_attributes(VALUE attrs, StringPiece data) {
20
+ long offset = 0;
21
+ long length = data.size();
22
+ int ret, n_groups = 0, pos = 0;
23
+ StringPiece *groups = NULL;
24
+ StringPiece value;
25
+
26
+ while (offset < length) {
27
+ // Match Whitespace
28
+ n_groups = WHITESPACE_RE->NumberOfCapturingGroups() + 1;
29
+ groups = new(nothrow) StringPiece[n_groups];
30
+ ret = WHITESPACE_RE->Match(data, offset, length, RE2::ANCHOR_START, groups, n_groups);
31
+ if (ret == 1) {
32
+ offset += groups[0].size();
33
+ }
34
+ delete[] groups;
35
+
36
+ n_groups = ATTRIBUTES_RE->NumberOfCapturingGroups() + 1;
37
+ groups = new(nothrow) StringPiece[n_groups];
38
+ ret = ATTRIBUTES_RE->Match(
39
+ data,
40
+ offset,
41
+ length,
42
+ RE2::ANCHOR_START,
43
+ groups,
44
+ n_groups
45
+ );
46
+ if (ret == 1) {
47
+ if (groups[2].size()) {
48
+ value = groups[2];
49
+ } else if (groups[3].size()) {
50
+ value = groups[3];
51
+ } else if (groups[4].size()) {
52
+ value = groups[4];
53
+ } else {
54
+ value = StringPiece("");
55
+ }
56
+ rb_hash_aset(
57
+ attrs,
58
+ rb_str_new(groups[1].data(), groups[1].size()),
59
+ rb_str_new(value.data(), value.size())
60
+ );
61
+ offset += groups[0].size();
62
+ delete[] groups;
63
+ } else {
64
+ delete[] groups;
65
+ return;
66
+ }
67
+ }
68
+ }
69
+
70
+ extern "C" VALUE
71
+ parse_html(VALUE self, VALUE string, VALUE handler)
72
+ {
73
+ long offset = 0;
74
+ long length = RSTRING_LEN(string);
75
+ int tag_match = 0;
76
+ int ret, n_groups = 0, pos = 0;
77
+ StringPiece html = StringPiece(RSTRING_PTR(string), length);
78
+ StringPiece *groups = NULL;
79
+
80
+ ID m_text = rb_intern("text");
81
+ ID m_whitespace = rb_intern("whitespace");
82
+ ID m_start_tag = rb_intern("start_tag");
83
+ ID m_end_tag = rb_intern("end_tag");
84
+ ID m_done = rb_intern("done");
85
+
86
+ while (offset < length) {
87
+ tag_match = 0;
88
+ ret = 0;
89
+ n_groups = WHITESPACE_RE->NumberOfCapturingGroups() + 1;
90
+ groups = new(nothrow) StringPiece[n_groups];
91
+
92
+ // Match Whitespace
93
+ ret = WHITESPACE_RE->Match(html, offset, length, RE2::ANCHOR_START, groups, n_groups);
94
+ if (ret == 1) {
95
+ tag_match = 1;
96
+ rb_funcall(handler, m_whitespace, 1, rb_str_new(groups[0].data(), groups[0].size()));
97
+ offset += groups[0].size();
98
+ }
99
+ delete[] groups;
100
+
101
+ // Match Comment
102
+ if (html.substr(offset, COMMENT_TAG.size()) == COMMENT_TAG) {
103
+ tag_match = 1;
104
+ pos = html.find(COMMENT_TAG_END, offset + COMMENT_TAG.size());
105
+ rb_funcall(
106
+ handler,
107
+ m_text,
108
+ 1,
109
+ rb_str_new(html.data()+offset, pos == -1 ? length-offset : (pos - offset) + COMMENT_TAG_END.size())
110
+ );
111
+ offset = pos == -1 ? length : pos + COMMENT_TAG_END.size();
112
+ }
113
+
114
+ // Match Start tag
115
+ if (tag_match == 0) {
116
+ n_groups = START_TAG_RE->NumberOfCapturingGroups() + 1;
117
+ groups = new(nothrow) StringPiece[n_groups];
118
+ ret = START_TAG_RE->Match(html, offset, length, RE2::ANCHOR_START, groups, n_groups);
119
+ if (ret == 1) {
120
+ tag_match = 1;
121
+ VALUE attrs = rb_hash_new();
122
+
123
+ if (groups[2].size() > 0) {
124
+ parse_attributes(attrs, groups[2]);
125
+ }
126
+
127
+ rb_funcall(
128
+ handler,
129
+ m_start_tag,
130
+ 3,
131
+ rb_str_new(groups[1].data(), groups[1].size()),
132
+ attrs,
133
+ rb_str_new(groups[0].data(), groups[0].size())
134
+ );
135
+ offset += groups[0].size();
136
+ if (groups[0].ends_with(SELF_CLOSER)) {
137
+ rb_funcall(
138
+ handler,
139
+ m_end_tag,
140
+ 2,
141
+ rb_str_new(groups[1].data(), groups[1].size()),
142
+ rb_str_new("", 0)
143
+ );
144
+ }
145
+ }
146
+ delete[] groups;
147
+ }
148
+
149
+ // Match end tag
150
+ if (tag_match == 0) {
151
+ n_groups = END_TAG_RE->NumberOfCapturingGroups() + 1;
152
+ groups = new(nothrow) StringPiece[n_groups];
153
+ ret = END_TAG_RE->Match(html, offset, length, RE2::ANCHOR_START, groups, n_groups);
154
+ if (ret == 1) {
155
+ tag_match = 1;
156
+ rb_funcall(
157
+ handler,
158
+ m_end_tag,
159
+ 2,
160
+ rb_str_new(groups[1].data(), groups[1].size()),
161
+ rb_str_new(groups[0].data(), groups[0].size())
162
+ );
163
+ offset += groups[0].size();
164
+ }
165
+ delete[] groups;
166
+ }
167
+
168
+ // Match text
169
+ if (tag_match == 0) {
170
+ pos = html.find('<', offset + 1);
171
+ rb_funcall(
172
+ handler,
173
+ m_text,
174
+ 1,
175
+ rb_str_new(html.data()+offset, pos == -1 ? length-offset : pos-offset)
176
+ );
177
+ offset = pos == -1 ? length : pos;
178
+ }
179
+ }
180
+
181
+ rb_funcall(handler, m_done, 0);
182
+
183
+ return handler;
184
+ }
185
+
186
+ extern "C" void
187
+ Init_html_parser()
188
+ {
189
+ WHITESPACE_RE = new(nothrow) RE2("\\s+");
190
+ START_TAG_RE = new(nothrow) RE2("<(\\w+[a-zA-Z0-9_\\.-]*)((?:\\s+[\\w-]+(?:\\s*=\\s*(?:(?:\"[^\"]*\")|(?:'[^']*')|(?:[^'\">\\s]*)))?)*)\\s*\\/?>");
191
+ END_TAG_RE = new(nothrow) RE2("<\\/(\\w+[a-zA-Z0-9_:\\.-]*)\\s*>");
192
+ ATTRIBUTES_RE = new(nothrow) RE2("([\\w_-]+)(?:\\s*=\\s*(?:(?:\"((?:\\.|[^\"])*)\")|(?:'((?:\\.|[^'])*)')|([^'\">\\s]*)))?");
193
+ COMMENT_TAG = StringPiece("<!--");
194
+ COMMENT_TAG_END = StringPiece("-->");
195
+ SELF_CLOSER = StringPiece("/>");
196
+
197
+ HtmlParser = rb_const_get(rb_cObject, rb_intern("HtmlCruncher"));
198
+ rb_define_method(HtmlParser, "parse_html", (VALUE (*)(...))parse_html, 2);
199
+ }
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'html_cruncher/version'
5
+
6
# Gem manifest: identity, packaged files, the native extension to build on
# install, and the development-only dependencies.
Gem::Specification.new do |gem|
  gem.name        = "html_cruncher"
  gem.version     = HtmlCruncher::VERSION
  gem.authors     = ["Mathias Biilmann Christensen"]
  gem.email       = ["info@mathias-biilmann.net"]
  gem.summary     = "Will crunch through HTML and trigger callbacks on tags and text"
  gem.description = "HTML Cruncher is a streaming parser that'll gladly crunch through anything mildly resembling HTML."
  gem.homepage    = ""
  gem.license     = "MIT"

  # Package everything tracked by git; executables and tests are picked out
  # of that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = %w[lib ext]

  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rake-compiler", "~> 0.9"

  gem.extensions = %w[ext/html_parser/extconf.rb]
end
@@ -0,0 +1,3 @@
1
# Top-level class of the gem; this file only contributes the version constant.
class HtmlCruncher
  # Release number of the gem.
  VERSION = '1.0.0'
end
@@ -0,0 +1,15 @@
1
+ require "html_cruncher/version"
2
+ require "html_cruncher/html_parser"
3
+
4
# Ruby front end for the native parser: hands a document to the extension's
# parse_html together with the handler given at construction time.
class HtmlCruncher
  # handler - the object passed to parse_html on every crunch.
  def initialize(handler)
    @handler = handler
  end

  # Parse +source+. The parser receives a frozen copy, so the caller's own
  # string is left untouched. Returns whatever parse_html returns.
  def crunch(source)
    snapshot = source.dup.freeze
    parse_html(snapshot, @handler)
  end
  alias_method :parse, :crunch

  # parse_html comes from the native extension; keep it out of the public API.
  private :parse_html
end
@@ -0,0 +1,98 @@
1
+ require 'minitest/autorun'
2
+ require 'html_cruncher'
3
+
4
+
5
# End-to-end tests for HtmlCruncher: each test parses a small document with a
# recording handler and checks the sequence of events it received.
#
# The base class is Minitest::Test where available, falling back to the
# legacy MiniTest::Unit::TestCase name on older Minitest versions.
class TestHtmlCruncher < (defined?(Minitest::Test) ? Minitest::Test : MiniTest::Unit::TestCase)
  # Handler that records every parser callback, in order, on #stack.
  class Handler
    attr_reader :stack

    def initialize
      @stack = []
    end

    def whitespace(w)
      stack << Whitespace.new(w)
    end

    def text(t)
      stack << Text.new(t)
    end

    def start_tag(t, a, r)
      stack << StartTag.new(r, t, a)
    end

    def end_tag(t, r)
      stack << EndTag.new(r, t)
    end

    def done
      self
    end

    # Recorded events; +content+ is always the raw source text of the event.
    class Whitespace < Struct.new(:content); end
    class Text < Struct.new(:content); end
    class StartTag < Struct.new(:content, :tag, :attributes); end
    class EndTag < Struct.new(:content, :tag); end
  end

  # A fresh parser wired to a fresh recording handler.
  def parser
    HtmlCruncher.new(Handler.new)
  end

  def test_html_with_whitespace
    html = %[ Hello]
    result = parser.parse(html)

    assert_equal 2, result.stack.length
    assert_equal Handler::Whitespace, result.stack.first.class
    assert_equal Handler::Text, result.stack.last.class
    assert_equal "Hello", result.stack.last.content
  end

  def test_html_with_simple_tag
    html = %[<title>Hello</title>]
    result = parser.parse(html)
    assert_equal [Handler::StartTag, Handler::Text, Handler::EndTag], result.stack.map(&:class)
    assert_equal "<title>", result.stack.first.content
  end

  def test_html_with_selfclosing_tag
    html = %[<img src="img.png"/>]
    result = parser.parse(html)
    assert_equal Handler::StartTag, result.stack.first.class
    assert_equal Handler::EndTag, result.stack.last.class
    # The synthesised end tag has empty content, so the join round-trips.
    assert_equal html, result.stack.map(&:content).join
    assert_equal({"src" => "img.png"}, result.stack.first.attributes)
  end

  def test_a_comment_tag
    html = %[<!-- Hello --> hello]
    result = parser.parse(html)
    assert_equal [Handler::Text, Handler::Whitespace, Handler::Text], result.stack.map(&:class)
    assert_equal ["<!-- Hello -->", " ", "hello"], result.stack.map(&:content)
  end

  def test_html_with_doctype
    html = %[<!doctype html>
<html>
<head><title>Test</title></head>
<body><h1>This is the body</h1></body>
</html>]
    result = parser.parse(html)

    assert_equal html, result.stack.map(&:content).join
  end

  def test_unclosed_comment_tag
    html = %[<!doctype html>
<html>
<!-- Comment!
<head><title>Test</title></head>
<body><h1>This is the body</h1></body>
</html>]
    result = parser.parse(html)

    assert_equal html, result.stack.map(&:content).join
  end
end
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_cruncher
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Mathias Biilmann Christensen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-10-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.9'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.9'
55
+ description: HTML Cruncher is a streaming parser that'll gladly crunch through anything
56
+ mildly resembling HTML.
57
+ email:
58
+ - info@mathias-biilmann.net
59
+ executables: []
60
+ extensions:
61
+ - ext/html_parser/extconf.rb
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ".gitignore"
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - README.md
68
+ - Rakefile
69
+ - ext/html_parser/extconf.rb
70
+ - ext/html_parser/html_parser.cc
71
+ - html_parser.gemspec
72
+ - lib/html_cruncher.rb
73
+ - lib/html_cruncher/version.rb
74
+ - test/html_cruncher_test.rb
75
+ homepage: ''
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ - ext
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ requirements: []
95
+ rubyforge_project:
96
+ rubygems_version: 2.2.2
97
+ signing_key:
98
+ specification_version: 4
99
+ summary: Will crunch through HTML and trigger callbacks on tags and text
100
+ test_files:
101
+ - test/html_cruncher_test.rb
102
+ has_rdoc: