html_tokenizer 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ci.yml +19 -0
- data/.gitignore +2 -0
- data/ext/html_tokenizer_ext/extconf.rb +2 -2
- data/ext/html_tokenizer_ext/parser.c +7 -4
- data/ext/html_tokenizer_ext/tokenizer.c +1 -4
- data/html_tokenizer.gemspec +13 -5
- data/lib/html_tokenizer/version.rb +5 -0
- data/lib/html_tokenizer.rb +2 -0
- metadata +18 -56
- data/Gemfile.lock +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2bd91c4080202f5c9d62d494f843f73a3f12c24a24d1408bc09495a161756d4e
|
4
|
+
data.tar.gz: 484eba0fbc765e4894d63d60cbcc335032278d8699f56888e00f1782c4bd1466
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b6a469d11e44df8898e1e30e18ae6048974691de52de34b3a9c781598145f42415d2a738337a587b4fc86853f06c6acf7f20e86e972ec96016dea37ae9291b2
|
7
|
+
data.tar.gz: 24622424f4abc8ee5ea2cc519024391ca8ef8c49d2c7f29d080256137e34284e64a7cf2b768fb6d773e987f40cf3cdf854e994dd812bf2719efda3f60414af9c
|
data/.github/workflows/ci.yml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
name: CI
|
2
|
+
on: [push]
|
3
|
+
|
4
|
+
jobs:
|
5
|
+
tests:
|
6
|
+
runs-on: ubuntu-latest
|
7
|
+
strategy:
|
8
|
+
matrix:
|
9
|
+
ruby: [ '2.5', '2.6', '2.7', '3.0', '3.1', '3.2', '3.3' ]
|
10
|
+
name: Ruby ${{ matrix.ruby }} Tests
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v4
|
13
|
+
- name: Set up Ruby
|
14
|
+
uses: ruby/setup-ruby@v1
|
15
|
+
with:
|
16
|
+
ruby-version: ${{ matrix.ruby }}
|
17
|
+
bundler-cache: true
|
18
|
+
- name: Run tests
|
19
|
+
run: bundle exec rake
|
data/.gitignore
CHANGED
data/ext/html_tokenizer_ext/parser.c
CHANGED
@@ -455,8 +455,8 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
|
|
455
455
|
enc = rb_enc_from_index(parser->doc.enc_index);
|
456
456
|
mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
|
457
457
|
rb_yield_values(5, token_type_to_symbol(type),
|
458
|
-
|
459
|
-
|
458
|
+
ULONG2NUM(ref.mb_start), ULONG2NUM(ref.mb_start + mb_strlen),
|
459
|
+
ULONG2NUM(ref.line_number), ULONG2NUM(ref.column_number));
|
460
460
|
}
|
461
461
|
|
462
462
|
parser_adjust_line_number(parser, ref.start, ref.length);
|
@@ -495,7 +495,10 @@ static VALUE parser_initialize_method(VALUE self)
|
|
495
495
|
|
496
496
|
static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
|
497
497
|
{
|
498
|
+
#ifdef DEBUG
|
498
499
|
void *old = parser->doc.data;
|
500
|
+
#endif
|
501
|
+
|
499
502
|
unsigned long int mb_length;
|
500
503
|
char *buf;
|
501
504
|
rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
|
@@ -718,7 +721,7 @@ static VALUE parser_errors_count_method(VALUE self)
|
|
718
721
|
{
|
719
722
|
struct parser_t *parser = NULL;
|
720
723
|
Parser_Get_Struct(self, parser);
|
721
|
-
return
|
724
|
+
return ULONG2NUM(parser->errors_count);
|
722
725
|
}
|
723
726
|
|
724
727
|
static VALUE create_parser_error(struct parser_document_error_t *error)
|
@@ -734,7 +737,7 @@ static VALUE create_parser_error(struct parser_document_error_t *error)
|
|
734
737
|
return rb_class_new_instance(4, args, klass);
|
735
738
|
}
|
736
739
|
|
737
|
-
static VALUE parser_errors_method(VALUE self
|
740
|
+
static VALUE parser_errors_method(VALUE self)
|
738
741
|
{
|
739
742
|
struct parser_t *parser = NULL;
|
740
743
|
VALUE list;
|
data/ext/html_tokenizer_ext/tokenizer.c
CHANGED
@@ -135,7 +135,7 @@ static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, lo
|
|
135
135
|
{
|
136
136
|
long unsigned int mb_length = tokenizer_mblength(tk, length);
|
137
137
|
tk->last_token = type;
|
138
|
-
rb_yield_values(3, token_type_to_symbol(type),
|
138
|
+
rb_yield_values(3, token_type_to_symbol(type), ULONG2NUM(tk->scan.mb_cursor), ULONG2NUM(tk->scan.mb_cursor + mb_length));
|
139
139
|
}
|
140
140
|
|
141
141
|
static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
|
@@ -464,11 +464,9 @@ static int scan_tag_name(struct tokenizer_t *tk)
|
|
464
464
|
{
|
465
465
|
unsigned long int length = 0, tag_name_length = 0;
|
466
466
|
const char *tag_name = NULL;
|
467
|
-
void *old;
|
468
467
|
|
469
468
|
if(is_tag_name(&tk->scan, &tag_name, &tag_name_length)) {
|
470
469
|
length = (tk->current_tag ? strlen(tk->current_tag) : 0);
|
471
|
-
old = tk->current_tag;
|
472
470
|
REALLOC_N(tk->current_tag, char, length + tag_name_length + 1);
|
473
471
|
DBG_PRINT("tk=%p realloc(tk->current_tag) %p -> %p length=%lu", tk, old,
|
474
472
|
tk->current_tag, length + tag_name_length + 1);
|
@@ -664,7 +662,6 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
|
|
664
662
|
|
665
663
|
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
|
666
664
|
{
|
667
|
-
const char *old = tk->scan.string;
|
668
665
|
REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
|
669
666
|
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
670
667
|
tk->scan.string, length + 1);
|
data/html_tokenizer.gemspec
CHANGED
@@ -1,9 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/html_tokenizer/version"
|
4
|
+
|
1
5
|
Gem::Specification.new do |spec|
|
2
6
|
spec.name = "html_tokenizer"
|
3
|
-
spec.version =
|
7
|
+
spec.version = HtmlTokenizer::VERSION
|
4
8
|
spec.summary = "HTML Tokenizer"
|
5
9
|
spec.author = "Francois Chagnon"
|
6
10
|
|
11
|
+
spec.homepage = "https://github.com/Shopify/html_tokenizer"
|
12
|
+
spec.license = "MIT"
|
13
|
+
|
14
|
+
spec.metadata["allowed_push_host"] = "https://rubygems.org/"
|
15
|
+
|
16
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
17
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
18
|
+
|
7
19
|
spec.files = Dir.glob("ext/**/*.{c,h,rb}") +
|
8
20
|
Dir.glob("lib/**/*.rb")
|
9
21
|
|
@@ -12,8 +24,4 @@ Gem::Specification.new do |spec|
|
|
12
24
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
13
25
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
14
26
|
spec.require_paths = ["lib", "ext"]
|
15
|
-
|
16
|
-
spec.add_development_dependency 'rake', '~> 0'
|
17
|
-
spec.add_development_dependency 'rake-compiler', '~> 0'
|
18
|
-
spec.add_development_dependency 'minitest', '~> 0'
|
19
27
|
end
|
data/lib/html_tokenizer.rb
CHANGED
metadata
CHANGED
@@ -1,59 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francois Chagnon
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
13
|
-
|
14
|
-
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: rake-compiler
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: minitest
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
description:
|
56
|
-
email:
|
11
|
+
date: 2024-03-20 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email:
|
57
15
|
executables:
|
58
16
|
- html_tokenizer
|
59
17
|
extensions:
|
@@ -61,9 +19,9 @@ extensions:
|
|
61
19
|
extra_rdoc_files: []
|
62
20
|
files:
|
63
21
|
- ".autotest"
|
22
|
+
- ".github/workflows/ci.yml"
|
64
23
|
- ".gitignore"
|
65
24
|
- Gemfile
|
66
|
-
- Gemfile.lock
|
67
25
|
- LICENSE
|
68
26
|
- Manifest.txt
|
69
27
|
- README.md
|
@@ -78,12 +36,17 @@ files:
|
|
78
36
|
- ext/html_tokenizer_ext/tokenizer.h
|
79
37
|
- html_tokenizer.gemspec
|
80
38
|
- lib/html_tokenizer.rb
|
39
|
+
- lib/html_tokenizer/version.rb
|
81
40
|
- test/unit/parser_test.rb
|
82
41
|
- test/unit/tokenizer_test.rb
|
83
|
-
homepage:
|
84
|
-
licenses:
|
85
|
-
|
86
|
-
|
42
|
+
homepage: https://github.com/Shopify/html_tokenizer
|
43
|
+
licenses:
|
44
|
+
- MIT
|
45
|
+
metadata:
|
46
|
+
allowed_push_host: https://rubygems.org/
|
47
|
+
homepage_uri: https://github.com/Shopify/html_tokenizer
|
48
|
+
source_code_uri: https://github.com/Shopify/html_tokenizer
|
49
|
+
post_install_message:
|
87
50
|
rdoc_options: []
|
88
51
|
require_paths:
|
89
52
|
- lib
|
@@ -99,9 +62,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
62
|
- !ruby/object:Gem::Version
|
100
63
|
version: '0'
|
101
64
|
requirements: []
|
102
|
-
|
103
|
-
|
104
|
-
signing_key:
|
65
|
+
rubygems_version: 3.5.5
|
66
|
+
signing_key:
|
105
67
|
specification_version: 4
|
106
68
|
summary: HTML Tokenizer
|
107
69
|
test_files:
|
data/Gemfile.lock
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
html_tokenizer (0.0.6)
|
5
|
-
|
6
|
-
GEM
|
7
|
-
remote: https://rubygems.org/
|
8
|
-
specs:
|
9
|
-
minitest (5.9.0)
|
10
|
-
rake (12.3.0)
|
11
|
-
rake-compiler (0.9.9)
|
12
|
-
rake
|
13
|
-
|
14
|
-
PLATFORMS
|
15
|
-
ruby
|
16
|
-
|
17
|
-
DEPENDENCIES
|
18
|
-
html_tokenizer!
|
19
|
-
minitest
|
20
|
-
rake
|
21
|
-
rake-compiler
|
22
|
-
|
23
|
-
BUNDLED WITH
|
24
|
-
1.16.0
|