minihtml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,191 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdint.h>
4
+ #include <string.h>
5
+
6
+ #define DEFINE_REUSABLE_SYMBOL(name) static ID id_type_##name; static VALUE sym_##name;
7
+ #define INITIALIZE_REUSABLE_SYMBOL(name) id_type_##name = rb_intern(#name); sym_##name = ID2SYM(id_type_##name);
8
+
9
+ DEFINE_REUSABLE_SYMBOL(whitespace);
10
+ DEFINE_REUSABLE_SYMBOL(eof);
11
+ DEFINE_REUSABLE_SYMBOL(kind);
12
+
13
+ typedef struct {
14
+ VALUE tokens;
15
+ int tokens_idx;
16
+ long tokens_len;
17
+ VALUE look[2];
18
+ int marks[128];
19
+ int marks_idx;
20
+ } stream_t;
21
+
22
/* dfree callback for stream_type: releases the stream_t with xfree. */
static void stream_free(void *ptr) {
    xfree(ptr);
}
25
+
26
+ static size_t stream_memsize(const void *ptr) {
27
+ return sizeof(stream_t);
28
+ }
29
+
30
+ static void stream_mark(void *ptr) {
31
+ const stream_t *s = ptr;
32
+ if (s->tokens) rb_gc_mark(s->tokens);
33
+ }
34
+
35
+ static const rb_data_type_t stream_type = {
36
+ "MiniHTML::CSS::TokenStream",
37
+ {stream_mark, stream_free, stream_memsize,},
38
+ 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
39
+ };
40
+
41
+ static VALUE stream_alloc(const VALUE klass) {
42
+ stream_t *s = ALLOC(stream_t);
43
+ memset(s, 0, sizeof(stream_t));
44
+ return TypedData_Wrap_Struct(klass, &stream_type, s);
45
+ }
46
+
47
+ static VALUE stream_initialize(const VALUE self, VALUE tokens) {
48
+ Check_Type(tokens, T_ARRAY);
49
+ stream_t *s;
50
+ TypedData_Get_Struct(self, stream_t, &stream_type, s);
51
+ s->tokens = tokens;
52
+ s->tokens_idx = 0;
53
+ s->marks_idx = 0;
54
+ s->tokens_len = RARRAY_LEN(tokens);
55
+
56
+ // prime lookahead
57
+ s->look[0] = s->tokens_len >= 1 ? rb_ary_entry(tokens, 0) : Qnil;
58
+ s->look[1] = s->tokens_len >= 2 ? rb_ary_entry(tokens, 1) : Qnil;
59
+ return self;
60
+ }
61
+
62
+ static void rotate(stream_t *s) {
63
+ s->look[0] = s->look[1];
64
+ s->look[1] = s->tokens_idx + 1 < s->tokens_len ? rb_ary_entry(s->tokens, s->tokens_idx + 1) : Qnil;
65
+ }
66
+
67
/*
 * Declares a local `stream_t *s` and binds it to the struct wrapped by the
 * enclosing method's `self` (type-checked against stream_type).
 */
#define UNWRAP_STREAM \
    stream_t *s;      \
    TypedData_Get_Struct(self, stream_t, &stream_type, s);
68
+
69
+ static VALUE stream_peek(const VALUE self) {
70
+ UNWRAP_STREAM;
71
+ return s->look[0];
72
+ }
73
+
74
+ static VALUE stream_peek1(const VALUE self) {
75
+ UNWRAP_STREAM;
76
+ return s->look[1];
77
+ }
78
+
79
+ static void stream_consume_c(stream_t *s) {
80
+ if (s->tokens_idx < s->tokens_len && s->tokens_idx + 1 < s->tokens_len) {
81
+ s->tokens_idx++;
82
+ }
83
+ rotate(s);
84
+ }
85
+
86
+ static VALUE stream_consume(const VALUE self) {
87
+ UNWRAP_STREAM;
88
+ const VALUE peek = s->look[0];
89
+ stream_consume_c(s);
90
+ return peek;
91
+ }
92
+
93
+ static VALUE stream_discard(const VALUE self) {
94
+ UNWRAP_STREAM;
95
+ stream_consume_c(s);
96
+ return Qnil;
97
+ }
98
+
99
+ static VALUE stream_create_mark(const VALUE self) {
100
+ UNWRAP_STREAM;
101
+ if (s->marks_idx >= 127) {
102
+ rb_raise(rb_eRuntimeError, "Too many marks in stream");
103
+ }
104
+
105
+ s->marks[s->marks_idx++] = s->tokens_idx;
106
+ return Qnil;
107
+ }
108
+
109
+ static VALUE stream_mark_restore(const VALUE self) {
110
+ UNWRAP_STREAM;
111
+ if (s->marks_idx == 0) {
112
+ rb_raise(rb_eRuntimeError, "BUG: No mark to restore");
113
+ }
114
+ const int mark = s->marks[s->marks_idx - 1];
115
+ s->tokens_idx = mark;
116
+ s->look[0] = s->tokens_len > mark ? rb_ary_entry(s->tokens, mark) : Qnil;
117
+ s->look[1] = s->tokens_len > mark + 1 ? rb_ary_entry(s->tokens, mark + 1) : Qnil;
118
+ s->marks_idx--;
119
+ return Qnil;
120
+ }
121
+
122
+ static VALUE stream_mark_pop(const VALUE self) {
123
+ UNWRAP_STREAM;
124
+ if (s->marks_idx == 0) {
125
+ rb_raise(rb_eRuntimeError, "BUG: No mark to pop");
126
+ }
127
+ s->marks_idx--;
128
+ return Qnil;
129
+ }
130
+
131
+ static inline bool is_eof(const VALUE v) { return v == Qnil; }
132
+
133
+ static VALUE stream_is_empty(const VALUE self) {
134
+ UNWRAP_STREAM;
135
+ if (s->tokens_idx >= s->tokens_len || is_eof(s->look[0]))
136
+ return Qtrue;
137
+ return Qfalse;
138
+ }
139
+
140
+ static VALUE stream_peek_kind(const VALUE self) {
141
+ UNWRAP_STREAM;
142
+ if (s->look[0] == Qnil) return Qnil;
143
+ const VALUE k = rb_hash_aref(s->look[0], sym_kind);
144
+ Check_Type(k, T_SYMBOL);
145
+ return k;
146
+ }
147
+
148
+ static VALUE stream_peek_kind1(const VALUE self) {
149
+ UNWRAP_STREAM;
150
+ if (s->look[1] == Qnil) return Qnil;
151
+ const VALUE k = rb_hash_aref(s->look[1], sym_kind);
152
+ Check_Type(k, T_SYMBOL);
153
+ return k;
154
+ }
155
+
156
+ static VALUE stream_status(const VALUE self) {
157
+ UNWRAP_STREAM;
158
+ const VALUE h = rb_hash_new();
159
+ rb_hash_aset(h, ID2SYM(rb_intern("eof")), Qnil);
160
+ rb_hash_aset(h, ID2SYM(rb_intern("tokens")), s->tokens);
161
+ rb_hash_aset(h, ID2SYM(rb_intern("tokens_idx")), INT2NUM(s->tokens_idx));
162
+ rb_hash_aset(h, ID2SYM(rb_intern("tokens_len")), INT2NUM((int)s->tokens_len));
163
+ rb_hash_aset(h, ID2SYM(rb_intern("look0")), s->look[0]);
164
+ rb_hash_aset(h, ID2SYM(rb_intern("look1")), s->look[1]);
165
+ rb_hash_aset(h, ID2SYM(rb_intern("marks_idx")), INT2NUM(s->marks_idx));
166
+ return h;
167
+ }
168
+
169
+ RUBY_FUNC_EXPORTED void Init_minihtml_token_stream(void) {
170
+ const VALUE mMiniHTML = rb_define_module("MiniHTML");
171
+ const VALUE cStream = rb_define_class_under(mMiniHTML, "TokenStream", rb_cObject);
172
+
173
+ INITIALIZE_REUSABLE_SYMBOL(whitespace);
174
+ INITIALIZE_REUSABLE_SYMBOL(eof);
175
+ INITIALIZE_REUSABLE_SYMBOL(kind);
176
+
177
+ rb_define_alloc_func(cStream, stream_alloc);
178
+ rb_define_method(cStream, "initialize", stream_initialize, 1);
179
+ rb_define_method(cStream, "peek", stream_peek, 0);
180
+ rb_define_method(cStream, "peek1", stream_peek1, 0);
181
+ rb_define_method(cStream, "peek_kind", stream_peek_kind, 0);
182
+ rb_define_method(cStream, "peek_kind1", stream_peek_kind1, 0);
183
+ rb_define_method(cStream, "consume", stream_consume, 0);
184
+ rb_define_method(cStream, "discard", stream_discard, 0);
185
+
186
+ rb_define_method(cStream, "mark", stream_create_mark, 0);
187
+ rb_define_method(cStream, "restore", stream_mark_restore, 0);
188
+ rb_define_method(cStream, "pop", stream_mark_pop, 0);
189
+ rb_define_method(cStream, "empty?", stream_is_empty, 0);
190
+ rb_define_method(cStream, "status", stream_status, 0);
191
+ }
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # An attribute node. The name comes from the token's :literal; the value
    # starts out as nil and is attached later through #value=.
    class Attr < Base
      attr_accessor :name
      attr_reader :value

      # @param token [Hash] token providing the attribute name and positions
      def initialize(token)
        super(token)
        @value = nil
        @name = token[:literal]
      end

      # Attaches the value node and extends this attribute's end position to
      # the value's end position.
      #
      # @param token [#position_end] the node used as the attribute value
      def value=(token)
        @position_end = token.position_end
        @value = token
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # Common superclass of AST nodes: keeps the originating token and the
    # start/end positions read from it.
    class Base
      attr_reader :position_start, :position_end, :original_token

      # @param token [Hash] token carrying :start_line, :start_column,
      #   :start_offset, :end_line, :end_column and :end_offset
      def initialize(token)
        @original_token = token
        @position_start = Position.new(
          line: token[:start_line],
          column: token[:start_column],
          offset: token[:start_offset]
        )
        @position_end = Position.new(
          line: token[:end_line],
          column: token[:end_column],
          offset: token[:end_offset]
        )
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # A comment node; #literal holds the token's :literal text.
    class Comment < Base
      attr_accessor :literal

      # @param token [Hash] token providing :literal and positions
      def initialize(token)
        super(token)
        @literal = token[:literal]
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # An executable node; #source holds the token's :literal text.
    class Executable < Base
      attr_accessor :source

      # @param token [Hash] token providing :literal and positions
      def initialize(token)
        super(token)
        @source = token[:literal]
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # An interpolated string: a list of parts in #values, seeded with an
    # AST::String built from the opening token.
    class Interpolation < Base
      attr_accessor :values

      # @param token [Hash] the first token of the interpolated string
      def initialize(token)
        super(token)
        first_part = AST::String.new(token)
        @values = [first_part]
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # A literal node; #value holds the token's :literal text.
    class Literal < Base
      attr_accessor :value

      # @param token [Hash] token providing :literal and positions
      def initialize(token)
        super(token)
        @value = token[:literal]
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # A plain-text node; #literal holds the token's :literal text.
    class PlainText < Base
      attr_accessor :literal

      # @param token [Hash] token providing :literal and positions
      def initialize(token)
        super(token)
        @literal = token[:literal]
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # A source location described by line, column and offset, stored exactly
    # as given.
    class Position
      attr_reader :line, :column, :offset

      # @param line [Object] line value
      # @param column [Object] column value
      # @param offset [Object] offset value
      def initialize(line:, column:, offset:)
        @line, @column, @offset = line, column, offset
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # A quoted string node. #quote is the token's :quote_char and #literal is
    # the token's :literal with backslash-escaped quote characters unescaped.
    class String < Base
      attr_accessor :quote, :literal

      # @param token [Hash] token providing :literal, :quote_char and positions
      def initialize(token)
        super(token)
        quote_char = token[:quote_char]
        # A plain-string pattern matches the backslash and quote literally, so
        # the quote character is never interpreted as Regexp syntax.
        @literal = token[:literal].gsub("\\#{quote_char}", quote_char)
        @quote = quote_char
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  module AST
    # A tag node. The name is the token's :literal without its leading "<"
    # (or "</"). A tag built from a literal starting with "</" is flagged as
    # a bad tag.
    class Tag < Base
      attr_accessor :name, :self_closing, :attributes, :children, :bad_tag
      alias self_closing? self_closing
      alias bad_tag? bad_tag

      # @param token [Hash] token providing :literal and positions
      def initialize(token)
        super(token)
        literal = token[:literal]
        closing = literal.start_with?("</")

        # @bad_tag is only assigned for closing literals; otherwise it stays unset.
        @bad_tag = true if closing
        @name = closing ? literal[2...] : literal[1...]

        @self_closing = false
        @attributes = []
        @children = []
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "ast/position"
4
+ require_relative "ast/base"
5
+ require_relative "ast/attr"
6
+ require_relative "ast/tag"
7
+ require_relative "ast/plain_text"
8
+ require_relative "ast/literal"
9
+ require_relative "ast/string"
10
+ require_relative "ast/executable"
11
+ require_relative "ast/interpolation"
12
+ require_relative "ast/comment"
13
+
14
module MiniHTML
  # Namespace for the node classes loaded by the requires above.
  module AST
  end
end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  # Turns the scanner's tokens into a list of AST nodes.
  #
  # The source is tokenized in the constructor; #parse then reads the token
  # stream and dispatches on each token's kind.
  class Parser
    attr_reader :stream

    # @param source [String] text handed to MiniHTML::Scanner
    # @raise [ParseError] when the scanner reports any errors
    def initialize(source)
      scanner = MiniHTML::Scanner.new(source)
      tokens = scanner.tokenize
      raise ParseError.new(*scanner.errors) unless scanner.errors.empty?

      @stream = MiniHTML::TokenStream.new(tokens)
      @tokens = []
    end

    # Parses until the stream is empty.
    #
    # @return [Array] the nodes collected so far
    def parse
      @tokens << parse_one until stream.empty?
      @tokens
    end

    # Parses a single node, chosen by the kind of the current token.
    #
    # @raise [RuntimeError] when the current token kind is not handled here
    def parse_one
      case stream.peek_kind
      when :literal
        AST::PlainText.new(stream.consume)
      when :tag_begin
        if stream.peek[:literal] == "<!--"
          parse_comment
        else
          parse_tag
        end
      when :attr_value_literal
        AST::Literal.new(stream.consume)
      when :string
        AST::String.new(stream.consume)
      when :executable
        AST::Executable.new(stream.consume)
      when :string_interpolation
        parse_string_interpolation
      when :tag_closing_start
        # A closing tag with no open tag to match: kept as a Tag, which flags
        # itself as bad because its literal starts with "</".
        tag = AST::Tag.new(stream.consume)
        discard_until_tag_end
        tag
      else
        raise "Unexpected token type #{stream.peek_kind} on #parse_one"
      end
    end

    # Consumes the comment opener and wraps the following token in a Comment.
    #
    # @return [AST::Comment, nil] nil when the stream ends right after the opener
    def parse_comment
      stream.consume
      AST::Comment.new(stream.consume) unless stream.empty?
    end

    # Collects the parts of an interpolated string. Returns once a :string
    # token has been added, or when the stream runs out.
    #
    # @return [AST::Interpolation]
    # @raise [RuntimeError] when a token kind is not valid inside an interpolation
    def parse_string_interpolation
      interp = AST::Interpolation.new(stream.consume)
      until stream.empty?
        case stream.peek_kind
        when :executable
          interp.values << parse_one
        when :string_interpolation
          interp.values << AST::String.new(stream.consume)
        when :string
          interp.values << AST::String.new(stream.consume)
          return interp
        when :interpolated_executable
          interp.values << AST::Executable.new(stream.consume)
        else
          raise "Unexpected token type #{stream.peek_kind} on #parse_string_interpolation"
        end
      end
      interp
    end

    # Drops tokens up to and including the next :tag_end or :tag_closing_end.
    def discard_until_tag_end
      stream.consume until stream.empty? || %i[tag_end tag_closing_end].include?(stream.peek_kind)
      stream.consume # Consume tag_end or tag_closing_end
    end

    # Parses a tag: its attributes, then either a self-closing end or children
    # followed by a closing tag.
    #
    # A closing tag whose name does not match this tag is left in the stream
    # and the tag is returned as it stands, so the caller (an enclosing tag or
    # the top-level loop) deals with the closer. Staying in the loop without
    # consuming it would never terminate. The tag is also returned when the
    # stream ends before it is closed, so callers never receive nil.
    #
    # @return [AST::Tag]
    # @raise [RuntimeError] when a token kind is not valid inside a tag
    def parse_tag
      tag = AST::Tag.new(stream.consume)
      until stream.empty?
        case stream.peek_kind
        when :right_angled
          stream.consume
          # This tag has children...
          tag.children << parse_one until stream.peek_kind == :tag_closing_start || stream.empty?
        when :tag_closing_start
          # Consume everything until a closing_end, but only for our own closer.
          discard_until_tag_end if stream.peek[:literal][2...] == tag.name
          return tag
        when :tag_end
          stream.consume
          tag.self_closing = true
          return tag
        when :attr_key
          tag.attributes << parse_attr
        else
          raise "Unexpected token type #{stream.peek_kind} on #parse_tag"
        end
      end
      tag
    end

    # Parses an attribute key and, when an :equal token follows, its value.
    #
    # @return [AST::Attr]
    def parse_attr
      att = AST::Attr.new(stream.consume)
      return att unless stream.peek_kind == :equal

      stream.consume # equal
      att.value = parse_one
      att
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module MiniHTML
  # Version string of the gem.
  VERSION = "0.1.0"
end
data/lib/minihtml.rb ADDED
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "minihtml/version"
4
+ require_relative "minihtml/minihtml_scanner"
5
+ require_relative "minihtml/minihtml_token_stream"
6
+
7
+ require_relative "minihtml/ast"
8
+ require_relative "minihtml/parser"
9
+
10
module MiniHTML
  # Base class for errors raised by this library.
  class Error < StandardError; end

  # Raised when parsing fails; carries the individual errors.
  class ParseError < Error
    # @return [Array] the errors passed to the constructor
    attr_reader :errors

    # Builds a message listing the errors, joined with ", ".
    #
    # @param errors [Array] the individual errors
    def initialize(*errors)
      @errors = errors
      # Only a count of exactly one is singular, so zero reads "0 errors".
      suffix = errors.length == 1 ? "" : "s"
      super("Parsing failed with #{errors.length} error#{suffix}: #{errors.join(", ")}")
    end
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: minihtml
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Vito Sartori
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: MiniHTML is a small HTML parser intended for React-like HTML-JSX syntax
13
+ email:
14
+ - hey@vito.io
15
+ executables: []
16
+ extensions:
17
+ - ext/minihtml_scanner/extconf.rb
18
+ - ext/minihtml_token_stream/extconf.rb
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".editorconfig"
22
+ - ".rspec"
23
+ - ".rubocop.yml"
24
+ - ".ruby-version"
25
+ - CODE_OF_CONDUCT.md
26
+ - LICENSE
27
+ - README.md
28
+ - Rakefile
29
+ - ext/minihtml_scanner/extconf.rb
30
+ - ext/minihtml_scanner/minihtml_scanner.c
31
+ - ext/minihtml_scanner/minihtml_scanner.h
32
+ - ext/minihtml_token_stream/extconf.rb
33
+ - ext/minihtml_token_stream/minihtml_token_stream.c
34
+ - lib/minihtml.rb
35
+ - lib/minihtml/ast.rb
36
+ - lib/minihtml/ast/attr.rb
37
+ - lib/minihtml/ast/base.rb
38
+ - lib/minihtml/ast/comment.rb
39
+ - lib/minihtml/ast/executable.rb
40
+ - lib/minihtml/ast/interpolation.rb
41
+ - lib/minihtml/ast/literal.rb
42
+ - lib/minihtml/ast/plain_text.rb
43
+ - lib/minihtml/ast/position.rb
44
+ - lib/minihtml/ast/string.rb
45
+ - lib/minihtml/ast/tag.rb
46
+ - lib/minihtml/parser.rb
47
+ - lib/minihtml/version.rb
48
+ homepage: https://github.com/heyvito/minihtml
49
+ licenses:
50
+ - MIT
51
+ metadata:
52
+ allowed_push_host: https://rubygems.org
53
+ homepage_uri: https://github.com/heyvito/minihtml
54
+ source_code_uri: https://github.com/heyvito/minihtml
55
+ changelog_uri: https://github.com/heyvito/minihtml
56
+ rubygems_mfa_required: 'true'
57
+ rdoc_options: []
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: '3.4'
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ requirements: []
71
+ rubygems_version: 3.6.9
72
+ specification_version: 4
73
+ summary: MiniHTML is a small HTML parser intended for React-like HTML-JSX syntax
74
+ test_files: []