srx 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 153660bff1d0fc6c00bebc7fae34b11d9f70b16c54628ebf5023f8815b105b45
4
- data.tar.gz: 1e01d70b8f8a3cb032e62137f7ab1224e3942bb2314a47c05af295ff6431ba35
3
+ metadata.gz: f678f4fe5e7f30edbe600f27a5dd00217528d84291248ebf76f8d5c4df62f333
4
+ data.tar.gz: 5d096e1688a5c57756bb86be9a31d504aa99cbb877de434737e5c4578808f4f1
5
5
  SHA512:
6
- metadata.gz: e3ac37ecbb4bba2c31d90d32f3b4c0ae4cd236a70e6ab25c340f35c52e12364d4aad14dff01e0b8381fd28811ea46f6f861d1ff23540c0f280575cbbd24d1262
7
- data.tar.gz: b89f432b242e1bd8e2b8e5917888d78f75ef9749b2add35e9991a4d33e38be8876c110e107b994f27bdaf214612f231c1cd2041a0090759f053d839f7bd3d741
6
+ metadata.gz: 78d28e9ebb51c4ac61f2b15dc27d527f802a7a9c40812741ca817a14b0e231e0dce310f0b9c38411fb8dfe2f02953ca7344a90ad85fea5290d2b234be5795130
7
+ data.tar.gz: 2521273e4f164b44be0f43b2dd54cc17f380edc130e600c37cf8ef2b0ae847e287e91a59c036e638461462c7b2fc83401a5c348501429fe107becacbdd906188
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.6.0] - 2021-04-15
4
+
5
+ - Improved HTML parsing accuracy
6
+
3
7
  ## [0.5.0] - 2021-02-25
4
8
 
5
9
  - When `nil` is supplied for the `language` parameter, it is now treated as the
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- srx (0.5.0)
4
+ srx (0.6.0)
5
5
  nokogiri (~> 1.11)
6
6
 
7
7
  GEM
@@ -14,29 +14,29 @@ GEM
14
14
  diff-lcs (1.4.4)
15
15
  e2mmap (0.1.0)
16
16
  jaro_winkler (1.5.4)
17
- kramdown (2.3.0)
17
+ kramdown (2.3.1)
18
18
  rexml
19
19
  kramdown-parser-gfm (1.1.0)
20
20
  kramdown (~> 2.0)
21
21
  memory_profiler (1.0.0)
22
- minitest (5.14.3)
23
- nokogiri (1.11.1-x86_64-darwin)
22
+ minitest (5.14.4)
23
+ nokogiri (1.11.3-x86_64-darwin)
24
24
  racc (~> 1.4)
25
25
  parallel (1.20.1)
26
- parser (3.0.0.0)
26
+ parser (3.0.1.0)
27
27
  ast (~> 2.4.1)
28
28
  racc (1.5.2)
29
29
  rainbow (3.0.0)
30
30
  rake (13.0.3)
31
- regexp_parser (2.0.3)
31
+ regexp_parser (2.1.1)
32
32
  reverse_markdown (2.0.0)
33
33
  nokogiri
34
- rexml (3.2.4)
34
+ rexml (3.2.5)
35
35
  rspec-expectations (3.10.1)
36
36
  diff-lcs (>= 1.2.0, < 2.0)
37
37
  rspec-support (~> 3.10.0)
38
38
  rspec-support (3.10.2)
39
- rubocop (1.10.0)
39
+ rubocop (1.12.1)
40
40
  parallel (~> 1.10)
41
41
  parser (>= 3.0.0.0)
42
42
  rainbow (>= 2.2.2, < 4.0)
@@ -48,7 +48,7 @@ GEM
48
48
  rubocop-ast (1.4.1)
49
49
  parser (>= 2.7.1.5)
50
50
  ruby-progressbar (1.11.0)
51
- solargraph (0.40.3)
51
+ solargraph (0.40.4)
52
52
  backport (~> 1.1)
53
53
  benchmark
54
54
  bundler (>= 1.17.2)
@@ -5,11 +5,23 @@ require 'English'
5
5
 
6
6
  module Srx
7
7
  module Format
8
- # Support for XML
8
+ # Support for HTML. Tag grammar based on XML.
9
9
  #
10
10
  # @see https://www.w3.org/TR/xml/
11
+ # @see https://html.spec.whatwg.org/multipage/syntax.html
11
12
  class Html < Xml
12
- START_TAG_CAPTURE = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{Xml::ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
13
+ # Differs from XML in supporting unquoted values
14
+ # @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
15
+ ATT_VALUE = /#{Xml::ATT_VALUE}|(?:[^<>&"'`=\u0020\u0009\u000D\u000A]|#{Xml::REFERENCE})+/.freeze
16
+
17
+ # Differs from XML in supporting empty attributes
18
+ # @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
19
+ ATTRIBUTE = /#{Xml::NAME}(?:#{Xml::EQUALS}#{ATT_VALUE})?/.freeze
20
+
21
+ START_TAG = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
22
+ EMPTY_ELEM_TAG = %r{<#{Xml::NAME}(?:#{Xml::SPACE}#{ATTRIBUTE})*#{Xml::SPACE}?/>}.freeze
23
+
24
+ TAG = /#{START_TAG}|#{Xml::END_TAG}|#{EMPTY_ELEM_TAG}/.freeze
13
25
 
14
26
  # A set of HTML tags that are "void elements", meaning they do not need a
15
27
  # paired closing tag.
@@ -23,16 +35,20 @@ module Srx
23
35
  'link', 'meta', 'menuitem', 'param', 'source', 'track', 'wbr'
24
36
  ].freeze
25
37
 
38
+ def extract_markups(str)
39
+ extract_markups_by_pattern(str, TAG)
40
+ end
41
+
26
42
  def start_formatting?(markup)
27
- START_TAG_CAPTURE.match(markup) do |m|
43
+ START_TAG.match(markup) do |m|
28
44
  !VOID_ELEMENTS.include?(m['name'])
29
45
  end
30
46
  end
31
47
 
32
48
  def isolated_formatting?(markup)
33
- return true if super(markup)
49
+ return true if EMPTY_ELEM_TAG.match?(markup)
34
50
 
35
- START_TAG_CAPTURE.match(markup) do |m|
51
+ START_TAG.match(markup) do |m|
36
52
  VOID_ELEMENTS.include?(m['name'])
37
53
  end
38
54
  end
@@ -27,14 +27,7 @@ module Srx
27
27
  TAG = /#{START_TAG}|#{END_TAG}|#{EMPTY_ELEM_TAG}/.freeze
28
28
 
29
29
  def extract_markups(str)
30
- markups = []
31
-
32
- plain_text = str.gsub(TAG) do |match|
33
- markups << [$LAST_MATCH_INFO.begin(0), match]
34
- ''
35
- end
36
-
37
- [plain_text, markups]
30
+ extract_markups_by_pattern(str, TAG)
38
31
  end
39
32
 
40
33
  def start_formatting?(markup)
@@ -48,6 +41,21 @@ module Srx
48
41
  def isolated_formatting?(markup)
49
42
  EMPTY_ELEM_TAG.match?(markup)
50
43
  end
44
+
45
+ protected
46
+
47
+ # @param str [String]
48
+ # @param pattern [Regexp]
49
+ def extract_markups_by_pattern(str, pattern)
50
+ markups = []
51
+
52
+ plain_text = str.gsub(pattern) do |match|
53
+ markups << [$LAST_MATCH_INFO.begin(0), match]
54
+ ''
55
+ end
56
+
57
+ [plain_text, markups]
58
+ end
51
59
  end
52
60
  end
53
61
  end
data/lib/srx/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Srx
4
- VERSION = '0.5.0'
4
+ VERSION = '0.6.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: srx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-02-25 00:00:00.000000000 Z
11
+ date: 2021-04-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri