html_cruncher 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6b52f018b73d713860ade94ef48d23806ec5e053
4
+ data.tar.gz: 5d1ddd2d216635668dfed1c2adf19c20cd5c6770
5
+ SHA512:
6
+ metadata.gz: dea287e7ab8d8e911e90adcabf58148863c72dd9469f206e8d17e9f1f2d4e62b1c5e90df8e0ccab17473f5ea9c53f31a5836ea3f6b2798a5a00999be68ca9651
7
+ data.tar.gz: 3bc59bce9a64600f1f7c32331d61441a6d6c046033101bcabdb1e2756bdeada4cd07717273d8116b5ea0cb4146388a382ea64c54440ac27d7d545289dd2ff974
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
# Resolve gems against the public RubyGems index.
source "https://rubygems.org"

# The dependency list itself lives in html_parser.gemspec.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Mathias Biilmann Christensen
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # HtmlCruncher
2
+
3
+ Crunches through HTML
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'html_cruncher'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install html_cruncher
20
+
21
+ ## Usage
22
+
23
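+ Create a handler object that responds to `whitespace(text)`, `text(text)`, `start_tag(name, attributes, raw)`, `end_tag(name, raw)` and `done`, pass it to `HtmlCruncher.new`, and call `crunch` (aliased as `parse`) with the HTML string. See the tests for a complete example.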
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/html_parser/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,12 @@
require "bundler/gem_tasks"
require "rake/extensiontask"
require "rake/testtask"

# Runs every test/*_test.rb file with the test/ directory on the load path.
Rake::TestTask.new do |t|
  t.libs << "test"
  t.pattern = "test/*_test.rb"
end

# Builds the html_parser native extension; lib_dir tells rake-compiler to
# place the compiled library under lib/html_cruncher.
Rake::ExtensionTask.new "html_parser" do |ext|
  ext.lib_dir = "lib/html_cruncher"
end

# The tests require the native extension, so it must be compiled before they
# run; without this, `rake test` fails on a fresh checkout.
task :test => :compile

task :default => :test
@@ -0,0 +1,9 @@
require 'mkmf'

# Name of the native extension. It matches the Init_html_parser entry point
# defined in html_parser.cc.
# NOTE(review): lib/html_cruncher.rb requires "html_cruncher/html_parser";
# confirm that an installed gem ends up with the library under that path
# (create_makefile("html_cruncher/html_parser") is the usual way to get it).
extension_name = 'html_parser'

# Probe for the RE2 library and its header. The C++ source includes
# <re2/re2.h>, so that is the path that has to be checked. The results are
# not acted upon here; they only influence the flags and macros mkmf writes
# into the Makefile.
have_library("re2")
have_header("re2/re2.h")

dir_config(extension_name)      # The destination
create_makefile(extension_name) # Create Makefile
@@ -0,0 +1,199 @@
1
+ #include "ruby.h"
2
+ #include <re2/re2.h>
3
+ #include <iostream>
4
+ #include <string>
5
+ #include <new>
6
+
7
+ using std::nothrow;
8
+ using re2::StringPiece;
9
+
10
+ static VALUE HtmlParser = Qnil;
11
+ static RE2 *WHITESPACE_RE;
12
+ static RE2 *START_TAG_RE;
13
+ static RE2 *ATTRIBUTES_RE;
14
+ static RE2 *END_TAG_RE;
15
+ static StringPiece COMMENT_TAG;
16
+ static StringPiece COMMENT_TAG_END;
17
+ static StringPiece SELF_CLOSER;
18
+
19
+ static void parse_attributes(VALUE attrs, StringPiece data) {
20
+ long offset = 0;
21
+ long length = data.size();
22
+ int ret, n_groups = 0, pos = 0;
23
+ StringPiece *groups = NULL;
24
+ StringPiece value;
25
+
26
+ while (offset < length) {
27
+ // Match Whitespace
28
+ n_groups = WHITESPACE_RE->NumberOfCapturingGroups() + 1;
29
+ groups = new(nothrow) StringPiece[n_groups];
30
+ ret = WHITESPACE_RE->Match(data, offset, length, RE2::ANCHOR_START, groups, n_groups);
31
+ if (ret == 1) {
32
+ offset += groups[0].size();
33
+ }
34
+ delete[] groups;
35
+
36
+ n_groups = ATTRIBUTES_RE->NumberOfCapturingGroups() + 1;
37
+ groups = new(nothrow) StringPiece[n_groups];
38
+ ret = ATTRIBUTES_RE->Match(
39
+ data,
40
+ offset,
41
+ length,
42
+ RE2::ANCHOR_START,
43
+ groups,
44
+ n_groups
45
+ );
46
+ if (ret == 1) {
47
+ if (groups[2].size()) {
48
+ value = groups[2];
49
+ } else if (groups[3].size()) {
50
+ value = groups[3];
51
+ } else if (groups[4].size()) {
52
+ value = groups[4];
53
+ } else {
54
+ value = StringPiece("");
55
+ }
56
+ rb_hash_aset(
57
+ attrs,
58
+ rb_str_new(groups[1].data(), groups[1].size()),
59
+ rb_str_new(value.data(), value.size())
60
+ );
61
+ offset += groups[0].size();
62
+ delete[] groups;
63
+ } else {
64
+ delete[] groups;
65
+ return;
66
+ }
67
+ }
68
+ }
69
+
70
+ extern "C" VALUE
71
+ parse_html(VALUE self, VALUE string, VALUE handler)
72
+ {
73
+ long offset = 0;
74
+ long length = RSTRING_LEN(string);
75
+ int tag_match = 0;
76
+ int ret, n_groups = 0, pos = 0;
77
+ StringPiece html = StringPiece(RSTRING_PTR(string), length);
78
+ StringPiece *groups = NULL;
79
+
80
+ ID m_text = rb_intern("text");
81
+ ID m_whitespace = rb_intern("whitespace");
82
+ ID m_start_tag = rb_intern("start_tag");
83
+ ID m_end_tag = rb_intern("end_tag");
84
+ ID m_done = rb_intern("done");
85
+
86
+ while (offset < length) {
87
+ tag_match = 0;
88
+ ret = 0;
89
+ n_groups = WHITESPACE_RE->NumberOfCapturingGroups() + 1;
90
+ groups = new(nothrow) StringPiece[n_groups];
91
+
92
+ // Match Whitespace
93
+ ret = WHITESPACE_RE->Match(html, offset, length, RE2::ANCHOR_START, groups, n_groups);
94
+ if (ret == 1) {
95
+ tag_match = 1;
96
+ rb_funcall(handler, m_whitespace, 1, rb_str_new(groups[0].data(), groups[0].size()));
97
+ offset += groups[0].size();
98
+ }
99
+ delete[] groups;
100
+
101
+ // Match Comment
102
+ if (html.substr(offset, COMMENT_TAG.size()) == COMMENT_TAG) {
103
+ tag_match = 1;
104
+ pos = html.find(COMMENT_TAG_END, offset + COMMENT_TAG.size());
105
+ rb_funcall(
106
+ handler,
107
+ m_text,
108
+ 1,
109
+ rb_str_new(html.data()+offset, pos == -1 ? length-offset : (pos - offset) + COMMENT_TAG_END.size())
110
+ );
111
+ offset = pos == -1 ? length : pos + COMMENT_TAG_END.size();
112
+ }
113
+
114
+ // Match Start tag
115
+ if (tag_match == 0) {
116
+ n_groups = START_TAG_RE->NumberOfCapturingGroups() + 1;
117
+ groups = new(nothrow) StringPiece[n_groups];
118
+ ret = START_TAG_RE->Match(html, offset, length, RE2::ANCHOR_START, groups, n_groups);
119
+ if (ret == 1) {
120
+ tag_match = 1;
121
+ VALUE attrs = rb_hash_new();
122
+
123
+ if (groups[2].size() > 0) {
124
+ parse_attributes(attrs, groups[2]);
125
+ }
126
+
127
+ rb_funcall(
128
+ handler,
129
+ m_start_tag,
130
+ 3,
131
+ rb_str_new(groups[1].data(), groups[1].size()),
132
+ attrs,
133
+ rb_str_new(groups[0].data(), groups[0].size())
134
+ );
135
+ offset += groups[0].size();
136
+ if (groups[0].ends_with(SELF_CLOSER)) {
137
+ rb_funcall(
138
+ handler,
139
+ m_end_tag,
140
+ 2,
141
+ rb_str_new(groups[1].data(), groups[1].size()),
142
+ rb_str_new("", 0)
143
+ );
144
+ }
145
+ }
146
+ delete[] groups;
147
+ }
148
+
149
+ // Match end tag
150
+ if (tag_match == 0) {
151
+ n_groups = END_TAG_RE->NumberOfCapturingGroups() + 1;
152
+ groups = new(nothrow) StringPiece[n_groups];
153
+ ret = END_TAG_RE->Match(html, offset, length, RE2::ANCHOR_START, groups, n_groups);
154
+ if (ret == 1) {
155
+ tag_match = 1;
156
+ rb_funcall(
157
+ handler,
158
+ m_end_tag,
159
+ 2,
160
+ rb_str_new(groups[1].data(), groups[1].size()),
161
+ rb_str_new(groups[0].data(), groups[0].size())
162
+ );
163
+ offset += groups[0].size();
164
+ }
165
+ delete[] groups;
166
+ }
167
+
168
+ // Match text
169
+ if (tag_match == 0) {
170
+ pos = html.find('<', offset + 1);
171
+ rb_funcall(
172
+ handler,
173
+ m_text,
174
+ 1,
175
+ rb_str_new(html.data()+offset, pos == -1 ? length-offset : pos-offset)
176
+ );
177
+ offset = pos == -1 ? length : pos;
178
+ }
179
+ }
180
+
181
+ rb_funcall(handler, m_done, 0);
182
+
183
+ return handler;
184
+ }
185
+
186
+ extern "C" void
187
+ Init_html_parser()
188
+ {
189
+ WHITESPACE_RE = new(nothrow) RE2("\\s+");
190
+ START_TAG_RE = new(nothrow) RE2("<(\\w+[a-zA-Z0-9_\\.-]*)((?:\\s+[\\w-]+(?:\\s*=\\s*(?:(?:\"[^\"]*\")|(?:'[^']*')|(?:[^'\">\\s]*)))?)*)\\s*\\/?>");
191
+ END_TAG_RE = new(nothrow) RE2("<\\/(\\w+[a-zA-Z0-9_:\\.-]*)\\s*>");
192
+ ATTRIBUTES_RE = new(nothrow) RE2("([\\w_-]+)(?:\\s*=\\s*(?:(?:\"((?:\\.|[^\"])*)\")|(?:'((?:\\.|[^'])*)')|([^'\">\\s]*)))?");
193
+ COMMENT_TAG = StringPiece("<!--");
194
+ COMMENT_TAG_END = StringPiece("-->");
195
+ SELF_CLOSER = StringPiece("/>");
196
+
197
+ HtmlParser = rb_const_get(rb_cObject, rb_intern("HtmlCruncher"));
198
+ rb_define_method(HtmlParser, "parse_html", (VALUE (*)(...))parse_html, 2);
199
+ }
@@ -0,0 +1,26 @@
# coding: utf-8
# Put this checkout's lib/ on the load path so the version file can be
# required before the gem is installed.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "html_cruncher/version"

Gem::Specification.new do |gem|
  gem.name        = "html_cruncher"
  gem.version     = HtmlCruncher::VERSION
  gem.authors     = ["Mathias Biilmann Christensen"]
  gem.email       = ["info@mathias-biilmann.net"]
  gem.summary     = "Will crunch through HTML and trigger callbacks on tags and text"
  gem.description = "HTML Cruncher is a streaming parser that'll gladly crunch through anything mildly resembling HTML."
  gem.homepage    = ""
  gem.license     = "MIT"

  # Package exactly the files tracked by git (NUL-separated listing).
  tracked = `git ls-files -z`.split("\x0")
  gem.files         = tracked
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib", "ext"]

  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rake-compiler", "~> 0.9"

  # Native extension, built at install time.
  gem.extensions = ["ext/html_parser/extconf.rb"]
end
@@ -0,0 +1,3 @@
class HtmlCruncher
  # Gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = "1.0.0".freeze
end
@@ -0,0 +1,15 @@
1
+ require "html_cruncher/version"
2
+ require "html_cruncher/html_parser"
3
+
# Ruby front-end for the native HTML tokenizer.
#
# The handler passed to the constructor receives the callbacks issued by the
# native parse_html method: whitespace(text), text(text),
# start_tag(name, attributes, raw), end_tag(name, raw) and done.
class HtmlCruncher
  # handler - object that receives the parser callbacks.
  def initialize(handler)
    @handler = handler
  end

  # Tokenizes source and reports every token to the handler.
  #
  # source - the HTML as a String (or an object that converts via to_str).
  #
  # The parser works on a frozen copy, so the caller's string is neither
  # modified nor exposed to mutation while the native code reads it.
  #
  # Returns the handler.
  # Raises TypeError if source is not string-like; the native code reads the
  # argument as raw string memory and must never be handed anything else.
  def crunch(source)
    html = String.try_convert(source)
    raise TypeError, "expected a String, got #{source.class}" if html.nil?

    parse_html(html.dup.freeze, @handler)
  end
  alias :parse :crunch

  # parse_html is defined by the native extension; keep it internal.
  private :parse_html
end
@@ -0,0 +1,98 @@
1
+ require 'minitest/autorun'
2
+ require 'html_cruncher'
3
+
4
+
# End-to-end tests for HtmlCruncher: every test feeds a snippet to the parser
# and inspects the tokens recorded by the Handler below.
#
# The base class is picked at load time: Minitest::Test where it exists, and
# the older MiniTest::Unit::TestCase otherwise, because current minitest
# releases no longer define the old name by default.
class TestHtmlCruncher < (defined?(Minitest::Test) ? Minitest::Test : MiniTest::Unit::TestCase)
  # Records every parser callback, in order, as a small value object.
  class Handler
    # Value objects for the recorded tokens. They are assigned rather than
    # subclassed so no anonymous intermediate Struct class is created.
    Whitespace = Struct.new(:content)
    Text       = Struct.new(:content)
    StartTag   = Struct.new(:content, :tag, :attributes)
    EndTag     = Struct.new(:content, :tag)

    attr_reader :stack

    def initialize
      @stack = []
    end

    def whitespace(w)
      stack << Whitespace.new(w)
    end

    def text(t)
      stack << Text.new(t)
    end

    # t - tag name, a - attribute Hash, r - raw tag text.
    def start_tag(t, a, r)
      stack << StartTag.new(r, t, a)
    end

    # t - tag name, r - raw tag text (empty for self-closing tags).
    def end_tag(t, r)
      stack << EndTag.new(r, t)
    end

    def done
      self
    end
  end

  # A fresh parser with its own handler; parse returns that handler.
  def parser
    HtmlCruncher.new(Handler.new)
  end

  def test_html_with_whitespace
    html = %[ Hello]
    result = parser.parse(html)

    assert_equal 2, result.stack.length
    assert_equal Handler::Whitespace, result.stack.first.class
    assert_equal Handler::Text, result.stack.last.class
    assert_equal "Hello", result.stack.last.content
  end

  def test_html_with_simple_tag
    html = %[<title>Hello</title>]
    result = parser.parse(html)
    assert_equal [Handler::StartTag, Handler::Text, Handler::EndTag], result.stack.map(&:class)
    assert_equal "<title>", result.stack.first.content
  end

  def test_html_with_selfclosing_tag
    html = %[<img src="img.png"/>]
    result = parser.parse(html)
    assert_equal Handler::StartTag, result.stack.first.class
    assert_equal Handler::EndTag, result.stack.last.class
    # The synthetic end tag has empty content, so the join is still the input.
    assert_equal html, result.stack.map(&:content).join
    assert_equal({"src" => "img.png"}, result.stack.first.attributes)
  end

  def test_a_comment_tag
    html = %[<!-- Hello --> hello]
    result = parser.parse(html)
    assert_equal [Handler::Text, Handler::Whitespace, Handler::Text], result.stack.map(&:class)
    assert_equal ["<!-- Hello -->", " ", "hello"], result.stack.map(&:content)
  end

  # The recorded tokens must reproduce the input exactly.
  def test_html_with_doctype
    html = %[<!doctype html>
      <html>
        <head><title>Test</title></head>
        <body><h1>This is the body</h1></body>
      </html>]
    result = parser.parse(html)

    assert_equal html, result.stack.map(&:content).join
  end

  # An unclosed comment swallows the rest of the document but still
  # round-trips.
  def test_unclosed_comment_tag
    html = %[<!doctype html>
      <html>
        <!-- Comment!
        <head><title>Test</title></head>
        <body><h1>This is the body</h1></body>
      </html>]
    result = parser.parse(html)

    assert_equal html, result.stack.map(&:content).join
  end
end
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_cruncher
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Mathias Biilmann Christensen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-10-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.9'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.9'
55
+ description: HTML Cruncher is a streaming parser that'll gladly crunch through anything
56
+ mildly resembling HTML.
57
+ email:
58
+ - info@mathias-biilmann.net
59
+ executables: []
60
+ extensions:
61
+ - ext/html_parser/extconf.rb
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ".gitignore"
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - README.md
68
+ - Rakefile
69
+ - ext/html_parser/extconf.rb
70
+ - ext/html_parser/html_parser.cc
71
+ - html_parser.gemspec
72
+ - lib/html_cruncher.rb
73
+ - lib/html_cruncher/version.rb
74
+ - test/html_cruncher_test.rb
75
+ homepage: ''
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ - ext
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ requirements: []
95
+ rubyforge_project:
96
+ rubygems_version: 2.2.2
97
+ signing_key:
98
+ specification_version: 4
99
+ summary: Will crunch through HTML and trigger callbacks on tags and text
100
+ test_files:
101
+ - test/html_cruncher_test.rb
102
+ has_rdoc: