rLexer 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/rLexer/tags.rb +12 -0
- data/lib/rLexer/tokenizer.rb +150 -0
- data/lib/rLexer.rb +1 -0
- metadata +46 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 9e539be93197746639346f23cbe8f1ef3ebf919be3936970071c0514cacbf311
|
|
4
|
+
data.tar.gz: abeab5b45dafbca62bacc24b02b6a4b262d84d02cbd4965c8f8bc93b0ce4b281
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 0152746babc3149ebc2f6efee3ec20f9753c0567a04e695b63d2326ab0318f7d32e0cb3a77f0bb341a7540210fcb57c5537434e96f7f054a7bd504eac393dc03
|
|
7
|
+
data.tar.gz: f5912634d0b97eafe9924bb5f50a0c1f72587f0e159999ceb0def40ef5a5488acf71373b8f866bb3febf68422e89ced67eab9e4daaeca2e85986fb673a75eddb
|
data/lib/rLexer/tokenizer.rb
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
require_relative 'tags'
|
|
2
|
+
|
|
3
|
+
# A small HTML lexer: walks the markup one character at a time and records
# a flat list of [type, payload] tokens, where type is one of :OPEN, :CLOSE,
# :COMMENT, :DATA or :ATTRIBUTES.
#
# Every delimiter is read from the Tags constants (OPEN_TAG, CLOSE_TAG,
# CLOSING_TAG, START_COMMENT, END_COMMENT), which are defined outside this
# class.
# NOTE(review): the comments below assume these are '<', '>', '</', '<!--'
# and '-->' respectively -- confirm against tags.rb.
class Tokenizer
  attr_accessor :html, :type, :Tags
  attr_reader :tokens

  # @param html [String] markup to tokenize; every double quote is replaced
  #   by a single quote, which is what the attribute splitting relies on.
  def initialize(html)
    @html = html.tr('"', "'")
    @type = :EOF
    @tokens = []
  end

  # Scans the whole input and fills #tokens.
  #
  # State is reset first, so calling this more than once re-lexes the input
  # instead of appending to (and then choking on) the previous result.
  #
  # @return [Array] whatever #consume_attributes returns; read the result
  #   through #tokens instead.
  def tokenize
    @type = :EOF
    @tokens = []

    @html.each_char.with_index do |ch, idx|
      comment_end(idx)
      # Everything between the comment markers is skipped here; the comment
      # text itself was already captured when the comment was opened.
      next if @type == :COMMENT

      process(idx) if open_tag?(ch) || close_tag?(ch)
    end

    consume_attributes
  end

  # Classifies the construct starting at +idx+, then captures its token.
  def process(idx)
    set_type(idx)
    consume(idx)
  end

  # Dispatches to the consumer matching the current type. :DOCTYPE (and any
  # other type) has no consumer and is ignored.
  def consume(idx)
    case @type
    when :COMMENT      then consume_comment(idx)
    when :OPEN, :CLOSE then consume_tag(idx)
    when :DATA         then consume_data(idx)
    end
  end

  # Updates @type from the text at +idx+. The order of the checks matters:
  # the multi-character markers are tested before the single-character ones
  # they start with. @type is left unchanged when nothing matches.
  def set_type(idx)
    ch = current_char(idx)

    if comment_start?(idx)
      @type = :COMMENT
    elsif end_tag?(idx)
      @type = :CLOSE
    elsif doctype?(idx)
      @type = :DOCTYPE
    elsif close_tag?(ch) || comment_end?(idx)
      @type = :DATA
    elsif open_tag?(ch)
      @type = :OPEN
    end
  end

  # Appends +slice+ to the token list, tagged with the current type.
  def set_token(slice)
    @tokens.push([@type, slice])
  end

  # Records the text between the comment start marker at +idx+ and the next
  # comment end marker (or the end of input when the comment is unterminated).
  def consume_comment(idx)
    rest = @html[idx..-1]
    set_token(rest[Tags::START_COMMENT.length..end_comment_index(rest)])
  end

  # Records the tag text (name plus raw attributes) that starts at +idx+,
  # without its delimiters. An unterminated tag runs to the end of input
  # rather than raising.
  def consume_tag(idx)
    rest = @html[idx..-1]
    close_at = rest.index(Tags::CLOSE_TAG)
    last = close_at ? close_at - 1 : -1
    set_token(rest[tag_index(rest)..last])
  end

  # Post-processing pass: reduces every tag token to its first word and
  # inserts an [:ATTRIBUTES, [...]] token right after each :OPEN token that
  # carried attributes.
  #
  # Attributes are separated on "' " (closing quote followed by a space), so
  # every attribute except the last one loses its closing quote, e.g.
  # ["class='a", "id='b'"]. That output shape is kept as-is for existing
  # callers.
  #
  # @return [Array<Array>] the [open_token_index, attributes] pairs that were
  #   inserted (indices refer to positions before insertion).
  def consume_attributes
    pending = []

    @tokens.each_with_index do |(kind, payload), i|
      # Only tag tokens are split; :ATTRIBUTES payloads are Arrays and must
      # not be split again if this pass is run more than once.
      next if %i[COMMENT DATA ATTRIBUTES].include?(kind)

      name, *attrs = payload.split(' ')
      pending << [i, attrs.join(' ').split("' ")] if kind == :OPEN && !attrs.empty?
      # An empty tag ("<>" or "</>") has no first word; keep a String payload.
      @tokens[i][1] = name || ''
    end

    # Each earlier insertion shifts the following tokens right by one.
    pending.each_with_index do |(i, attrs), shift|
      @tokens.insert(i + shift + 1, [:ATTRIBUTES, attrs])
    end

    pending
  end

  # Records the text between the tag terminator at +idx+ and the next tag
  # opener as a :DATA token, unless it is blank. Trailing text with no
  # following tag runs to the end of input rather than raising.
  def consume_data(idx)
    return if next_char?(idx)

    rest = @html[idx..-1]
    open_at = rest.index(Tags::OPEN_TAG)
    last = open_at ? open_at - 1 : -1
    text = rest[Tags::CLOSE_TAG.length..last].strip

    set_token(text) unless text.empty?
  end

  # @return [String, nil] the character at +idx+.
  def current_char(idx)
    @html[idx]
  end

  # @param html [String] text starting at a comment opener.
  # @return [Integer] inclusive end index of the comment text within +html+,
  #   or -1 (end of string) when no end marker is present.
  def end_comment_index(html)
    idx = html.index(Tags::END_COMMENT)
    # NOTE(review): this equals idx - 1 (the character just before the
    # marker) only when END_COMMENT is three characters long -- confirm.
    idx.nil? ? -1 : (idx + 2) - Tags::END_COMMENT.length
  end

  # @return [Integer] number of delimiter characters to skip at the start of
  #   a tag, depending on whether the current type is :OPEN or not. The
  #   +html+ argument is unused.
  def tag_index(html)
    @type == :OPEN ? Tags::OPEN_TAG.length : Tags::CLOSING_TAG.length
  end

  # While inside a comment, re-classifies at +idx+ once the end marker is
  # reached, which takes the lexer out of the :COMMENT state.
  def comment_end(idx)
    return unless @type == :COMMENT

    set_type(idx) if comment_end?(idx)
  end

  def comment_end?(idx)
    suitable?(idx, Tags::END_COMMENT)
  end

  # @return [Boolean] true when the character after +idx+ is a tag opener or
  #   the input ends there, i.e. there is no text to capture.
  def next_char?(idx)
    following = @html[idx + 1]
    following.nil? || following == Tags::OPEN_TAG
  end

  def end_tag?(idx)
    suitable?(idx, Tags::CLOSING_TAG)
  end

  # Doctype detection is not implemented; always false.
  def doctype?(idx)
    false
  end

  def comment_start?(idx)
    suitable?(idx, Tags::START_COMMENT)
  end

  # @return [Boolean] true when +tag+ occurs in the input at character
  #   position +idx+. Character indexing is used (not byte offsets) because
  #   +idx+ comes from each_char, so multi-byte text does not shift matches.
  def suitable?(idx, tag)
    tag == @html[idx, tag.length]
  end

  def open_tag?(char)
    char == Tags::OPEN_TAG
  end

  def close_tag?(char)
    char == Tags::CLOSE_TAG
  end
end
|
data/lib/rLexer.rb
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require_relative 'rLexer/tokenizer'
|
metadata
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: rLexer
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.12
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Robert Holland
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2020-09-07 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description:
|
|
14
|
+
email:
|
|
15
|
+
- rlexerdevelopment@gmail.com
|
|
16
|
+
executables: []
|
|
17
|
+
extensions: []
|
|
18
|
+
extra_rdoc_files: []
|
|
19
|
+
files:
|
|
20
|
+
- lib/rLexer.rb
|
|
21
|
+
- lib/rLexer/tags.rb
|
|
22
|
+
- lib/rLexer/tokenizer.rb
|
|
23
|
+
homepage: https://github.com/whollandr94/rLexer
|
|
24
|
+
licenses:
|
|
25
|
+
- MIT
|
|
26
|
+
metadata: {}
|
|
27
|
+
post_install_message:
|
|
28
|
+
rdoc_options: []
|
|
29
|
+
require_paths:
|
|
30
|
+
- lib
|
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
32
|
+
requirements:
|
|
33
|
+
- - ">="
|
|
34
|
+
- !ruby/object:Gem::Version
|
|
35
|
+
version: '0'
|
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
41
|
+
requirements: []
|
|
42
|
+
rubygems_version: 3.1.2
|
|
43
|
+
signing_key:
|
|
44
|
+
specification_version: 4
|
|
45
|
+
summary: A simple HTML lexer/tokenizer written in Ruby.
|
|
46
|
+
test_files: []
|