html_tokenizer 0.0.6 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ci.yml +19 -0
- data/.gitignore +2 -0
- data/ext/html_tokenizer_ext/extconf.rb +2 -2
- data/ext/html_tokenizer_ext/parser.c +7 -4
- data/ext/html_tokenizer_ext/tokenizer.c +1 -4
- data/html_tokenizer.gemspec +13 -5
- data/lib/html_tokenizer/version.rb +5 -0
- data/lib/html_tokenizer.rb +2 -0
- metadata +18 -56
- data/Gemfile.lock +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2bd91c4080202f5c9d62d494f843f73a3f12c24a24d1408bc09495a161756d4e
|
4
|
+
data.tar.gz: 484eba0fbc765e4894d63d60cbcc335032278d8699f56888e00f1782c4bd1466
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b6a469d11e44df8898e1e30e18ae6048974691de52de34b3a9c781598145f42415d2a738337a587b4fc86853f06c6acf7f20e86e972ec96016dea37ae9291b2
|
7
|
+
data.tar.gz: 24622424f4abc8ee5ea2cc519024391ca8ef8c49d2c7f29d080256137e34284e64a7cf2b768fb6d773e987f40cf3cdf854e994dd812bf2719efda3f60414af9c
|
@@ -0,0 +1,19 @@
|
|
1
|
+
name: CI
|
2
|
+
on: [push]
|
3
|
+
|
4
|
+
jobs:
|
5
|
+
tests:
|
6
|
+
runs-on: ubuntu-latest
|
7
|
+
strategy:
|
8
|
+
matrix:
|
9
|
+
ruby: [ '2.5', '2.6', '2.7', '3.0', '3.1', '3.2', '3.3' ]
|
10
|
+
name: Ruby ${{ matrix.ruby }} Tests
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v4
|
13
|
+
- name: Set up Ruby
|
14
|
+
uses: ruby/setup-ruby@v1
|
15
|
+
with:
|
16
|
+
ruby-version: ${{ matrix.ruby }}
|
17
|
+
bundler-cache: true
|
18
|
+
- name: Run tests
|
19
|
+
run: bundle exec rake
|
data/.gitignore
CHANGED
@@ -455,8 +455,8 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
|
|
455
455
|
enc = rb_enc_from_index(parser->doc.enc_index);
|
456
456
|
mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
|
457
457
|
rb_yield_values(5, token_type_to_symbol(type),
|
458
|
-
|
459
|
-
|
458
|
+
ULONG2NUM(ref.mb_start), ULONG2NUM(ref.mb_start + mb_strlen),
|
459
|
+
ULONG2NUM(ref.line_number), ULONG2NUM(ref.column_number));
|
460
460
|
}
|
461
461
|
|
462
462
|
parser_adjust_line_number(parser, ref.start, ref.length);
|
@@ -495,7 +495,10 @@ static VALUE parser_initialize_method(VALUE self)
|
|
495
495
|
|
496
496
|
static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
|
497
497
|
{
|
498
|
+
#ifdef DEBUG
|
498
499
|
void *old = parser->doc.data;
|
500
|
+
#endif
|
501
|
+
|
499
502
|
unsigned long int mb_length;
|
500
503
|
char *buf;
|
501
504
|
rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
|
@@ -718,7 +721,7 @@ static VALUE parser_errors_count_method(VALUE self)
|
|
718
721
|
{
|
719
722
|
struct parser_t *parser = NULL;
|
720
723
|
Parser_Get_Struct(self, parser);
|
721
|
-
return
|
724
|
+
return ULONG2NUM(parser->errors_count);
|
722
725
|
}
|
723
726
|
|
724
727
|
static VALUE create_parser_error(struct parser_document_error_t *error)
|
@@ -734,7 +737,7 @@ static VALUE create_parser_error(struct parser_document_error_t *error)
|
|
734
737
|
return rb_class_new_instance(4, args, klass);
|
735
738
|
}
|
736
739
|
|
737
|
-
static VALUE parser_errors_method(VALUE self
|
740
|
+
static VALUE parser_errors_method(VALUE self)
|
738
741
|
{
|
739
742
|
struct parser_t *parser = NULL;
|
740
743
|
VALUE list;
|
@@ -135,7 +135,7 @@ static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, lo
|
|
135
135
|
{
|
136
136
|
long unsigned int mb_length = tokenizer_mblength(tk, length);
|
137
137
|
tk->last_token = type;
|
138
|
-
rb_yield_values(3, token_type_to_symbol(type),
|
138
|
+
rb_yield_values(3, token_type_to_symbol(type), ULONG2NUM(tk->scan.mb_cursor), ULONG2NUM(tk->scan.mb_cursor + mb_length));
|
139
139
|
}
|
140
140
|
|
141
141
|
static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
|
@@ -464,11 +464,9 @@ static int scan_tag_name(struct tokenizer_t *tk)
|
|
464
464
|
{
|
465
465
|
unsigned long int length = 0, tag_name_length = 0;
|
466
466
|
const char *tag_name = NULL;
|
467
|
-
void *old;
|
468
467
|
|
469
468
|
if(is_tag_name(&tk->scan, &tag_name, &tag_name_length)) {
|
470
469
|
length = (tk->current_tag ? strlen(tk->current_tag) : 0);
|
471
|
-
old = tk->current_tag;
|
472
470
|
REALLOC_N(tk->current_tag, char, length + tag_name_length + 1);
|
473
471
|
DBG_PRINT("tk=%p realloc(tk->current_tag) %p -> %p length=%lu", tk, old,
|
474
472
|
tk->current_tag, length + tag_name_length + 1);
|
@@ -664,7 +662,6 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
|
|
664
662
|
|
665
663
|
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
|
666
664
|
{
|
667
|
-
const char *old = tk->scan.string;
|
668
665
|
REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
|
669
666
|
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
670
667
|
tk->scan.string, length + 1);
|
data/html_tokenizer.gemspec
CHANGED
@@ -1,9 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/html_tokenizer/version"
|
4
|
+
|
1
5
|
Gem::Specification.new do |spec|
|
2
6
|
spec.name = "html_tokenizer"
|
3
|
-
spec.version = "0.0.6"
|
7
|
+
spec.version = HtmlTokenizer::VERSION
|
4
8
|
spec.summary = "HTML Tokenizer"
|
5
9
|
spec.author = "Francois Chagnon"
|
6
10
|
|
11
|
+
spec.homepage = "https://github.com/Shopify/html_tokenizer"
|
12
|
+
spec.license = "MIT"
|
13
|
+
|
14
|
+
spec.metadata["allowed_push_host"] = "https://rubygems.org/"
|
15
|
+
|
16
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
17
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
18
|
+
|
7
19
|
spec.files = Dir.glob("ext/**/*.{c,h,rb}") +
|
8
20
|
Dir.glob("lib/**/*.rb")
|
9
21
|
|
@@ -12,8 +24,4 @@ Gem::Specification.new do |spec|
|
|
12
24
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
13
25
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
14
26
|
spec.require_paths = ["lib", "ext"]
|
15
|
-
|
16
|
-
spec.add_development_dependency 'rake', '~> 0'
|
17
|
-
spec.add_development_dependency 'rake-compiler', '~> 0'
|
18
|
-
spec.add_development_dependency 'minitest', '~> 0'
|
19
27
|
end
|
data/lib/html_tokenizer.rb
CHANGED
metadata
CHANGED
@@ -1,59 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francois Chagnon
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
13
|
-
|
14
|
-
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: rake-compiler
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: minitest
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
description:
|
56
|
-
email:
|
11
|
+
date: 2024-03-20 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email:
|
57
15
|
executables:
|
58
16
|
- html_tokenizer
|
59
17
|
extensions:
|
@@ -61,9 +19,9 @@ extensions:
|
|
61
19
|
extra_rdoc_files: []
|
62
20
|
files:
|
63
21
|
- ".autotest"
|
22
|
+
- ".github/workflows/ci.yml"
|
64
23
|
- ".gitignore"
|
65
24
|
- Gemfile
|
66
|
-
- Gemfile.lock
|
67
25
|
- LICENSE
|
68
26
|
- Manifest.txt
|
69
27
|
- README.md
|
@@ -78,12 +36,17 @@ files:
|
|
78
36
|
- ext/html_tokenizer_ext/tokenizer.h
|
79
37
|
- html_tokenizer.gemspec
|
80
38
|
- lib/html_tokenizer.rb
|
39
|
+
- lib/html_tokenizer/version.rb
|
81
40
|
- test/unit/parser_test.rb
|
82
41
|
- test/unit/tokenizer_test.rb
|
83
|
-
homepage:
|
84
|
-
licenses:
|
85
|
-
|
86
|
-
|
42
|
+
homepage: https://github.com/Shopify/html_tokenizer
|
43
|
+
licenses:
|
44
|
+
- MIT
|
45
|
+
metadata:
|
46
|
+
allowed_push_host: https://rubygems.org/
|
47
|
+
homepage_uri: https://github.com/Shopify/html_tokenizer
|
48
|
+
source_code_uri: https://github.com/Shopify/html_tokenizer
|
49
|
+
post_install_message:
|
87
50
|
rdoc_options: []
|
88
51
|
require_paths:
|
89
52
|
- lib
|
@@ -99,9 +62,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
62
|
- !ruby/object:Gem::Version
|
100
63
|
version: '0'
|
101
64
|
requirements: []
|
102
|
-
|
103
|
-
|
104
|
-
signing_key:
|
65
|
+
rubygems_version: 3.5.5
|
66
|
+
signing_key:
|
105
67
|
specification_version: 4
|
106
68
|
summary: HTML Tokenizer
|
107
69
|
test_files:
|
data/Gemfile.lock
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
html_tokenizer (0.0.6)
|
5
|
-
|
6
|
-
GEM
|
7
|
-
remote: https://rubygems.org/
|
8
|
-
specs:
|
9
|
-
minitest (5.9.0)
|
10
|
-
rake (12.3.0)
|
11
|
-
rake-compiler (0.9.9)
|
12
|
-
rake
|
13
|
-
|
14
|
-
PLATFORMS
|
15
|
-
ruby
|
16
|
-
|
17
|
-
DEPENDENCIES
|
18
|
-
html_tokenizer!
|
19
|
-
minitest
|
20
|
-
rake
|
21
|
-
rake-compiler
|
22
|
-
|
23
|
-
BUNDLED WITH
|
24
|
-
1.16.0
|