srx 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 153660bff1d0fc6c00bebc7fae34b11d9f70b16c54628ebf5023f8815b105b45
4
- data.tar.gz: 1e01d70b8f8a3cb032e62137f7ab1224e3942bb2314a47c05af295ff6431ba35
3
+ metadata.gz: f678f4fe5e7f30edbe600f27a5dd00217528d84291248ebf76f8d5c4df62f333
4
+ data.tar.gz: 5d096e1688a5c57756bb86be9a31d504aa99cbb877de434737e5c4578808f4f1
5
5
  SHA512:
6
- metadata.gz: e3ac37ecbb4bba2c31d90d32f3b4c0ae4cd236a70e6ab25c340f35c52e12364d4aad14dff01e0b8381fd28811ea46f6f861d1ff23540c0f280575cbbd24d1262
7
- data.tar.gz: b89f432b242e1bd8e2b8e5917888d78f75ef9749b2add35e9991a4d33e38be8876c110e107b994f27bdaf214612f231c1cd2041a0090759f053d839f7bd3d741
6
+ metadata.gz: 78d28e9ebb51c4ac61f2b15dc27d527f802a7a9c40812741ca817a14b0e231e0dce310f0b9c38411fb8dfe2f02953ca7344a90ad85fea5290d2b234be5795130
7
+ data.tar.gz: 2521273e4f164b44be0f43b2dd54cc17f380edc130e600c37cf8ef2b0ae847e287e91a59c036e638461462c7b2fc83401a5c348501429fe107becacbdd906188
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.6.0] - 2021-04-15
4
+
5
+ - Improved HTML parsing accuracy
6
+
3
7
  ## [0.5.0] - 2021-02-25
4
8
 
5
9
  - When `nil` is supplied for the `language` parameter, it is now treated as the
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- srx (0.5.0)
4
+ srx (0.6.0)
5
5
  nokogiri (~> 1.11)
6
6
 
7
7
  GEM
@@ -14,29 +14,29 @@ GEM
14
14
  diff-lcs (1.4.4)
15
15
  e2mmap (0.1.0)
16
16
  jaro_winkler (1.5.4)
17
- kramdown (2.3.0)
17
+ kramdown (2.3.1)
18
18
  rexml
19
19
  kramdown-parser-gfm (1.1.0)
20
20
  kramdown (~> 2.0)
21
21
  memory_profiler (1.0.0)
22
- minitest (5.14.3)
23
- nokogiri (1.11.1-x86_64-darwin)
22
+ minitest (5.14.4)
23
+ nokogiri (1.11.3-x86_64-darwin)
24
24
  racc (~> 1.4)
25
25
  parallel (1.20.1)
26
- parser (3.0.0.0)
26
+ parser (3.0.1.0)
27
27
  ast (~> 2.4.1)
28
28
  racc (1.5.2)
29
29
  rainbow (3.0.0)
30
30
  rake (13.0.3)
31
- regexp_parser (2.0.3)
31
+ regexp_parser (2.1.1)
32
32
  reverse_markdown (2.0.0)
33
33
  nokogiri
34
- rexml (3.2.4)
34
+ rexml (3.2.5)
35
35
  rspec-expectations (3.10.1)
36
36
  diff-lcs (>= 1.2.0, < 2.0)
37
37
  rspec-support (~> 3.10.0)
38
38
  rspec-support (3.10.2)
39
- rubocop (1.10.0)
39
+ rubocop (1.12.1)
40
40
  parallel (~> 1.10)
41
41
  parser (>= 3.0.0.0)
42
42
  rainbow (>= 2.2.2, < 4.0)
@@ -48,7 +48,7 @@ GEM
48
48
  rubocop-ast (1.4.1)
49
49
  parser (>= 2.7.1.5)
50
50
  ruby-progressbar (1.11.0)
51
- solargraph (0.40.3)
51
+ solargraph (0.40.4)
52
52
  backport (~> 1.1)
53
53
  benchmark
54
54
  bundler (>= 1.17.2)
@@ -5,11 +5,23 @@ require 'English'
5
5
 
6
6
  module Srx
7
7
  module Format
8
- # Support for XML
8
+ # Support for HTML. Tag grammar based on XML.
9
9
  #
10
10
  # @see https://www.w3.org/TR/xml/
11
+ # @see https://html.spec.whatwg.org/multipage/syntax.html
11
12
  class Html < Xml
12
- START_TAG_CAPTURE = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{Xml::ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
13
+ # Differs from XML in supporting unquoted values
14
+ # @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
15
+ ATT_VALUE = /#{Xml::ATT_VALUE}|(?:[^<>&"'`=\u0020\u0009\u000D\u000A]|#{Xml::REFERENCE})+/.freeze
16
+
17
+ # Differs from XML in supporting empty attributes
18
+ # @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
19
+ ATTRIBUTE = /#{Xml::NAME}(?:#{Xml::EQUALS}#{ATT_VALUE})?/.freeze
20
+
21
+ START_TAG = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
22
+ EMPTY_ELEM_TAG = %r{<#{Xml::NAME}(?:#{Xml::SPACE}#{ATTRIBUTE})*#{Xml::SPACE}?/>}.freeze
23
+
24
+ TAG = /#{START_TAG}|#{Xml::END_TAG}|#{EMPTY_ELEM_TAG}/.freeze
13
25
 
14
26
  # A set of HTML tags that are "void elements", meaning they do not need a
15
27
  # paired closing tag.
@@ -23,16 +35,20 @@ module Srx
23
35
  'link', 'meta', 'menuitem', 'param', 'source', 'track', 'wbr'
24
36
  ].freeze
25
37
 
38
+ def extract_markups(str)
39
+ extract_markups_by_pattern(str, TAG)
40
+ end
41
+
26
42
  def start_formatting?(markup)
27
- START_TAG_CAPTURE.match(markup) do |m|
43
+ START_TAG.match(markup) do |m|
28
44
  !VOID_ELEMENTS.include?(m['name'])
29
45
  end
30
46
  end
31
47
 
32
48
  def isolated_formatting?(markup)
33
- return true if super(markup)
49
+ return true if EMPTY_ELEM_TAG.match?(markup)
34
50
 
35
- START_TAG_CAPTURE.match(markup) do |m|
51
+ START_TAG.match(markup) do |m|
36
52
  VOID_ELEMENTS.include?(m['name'])
37
53
  end
38
54
  end
@@ -27,14 +27,7 @@ module Srx
27
27
  TAG = /#{START_TAG}|#{END_TAG}|#{EMPTY_ELEM_TAG}/.freeze
28
28
 
29
29
  def extract_markups(str)
30
- markups = []
31
-
32
- plain_text = str.gsub(TAG) do |match|
33
- markups << [$LAST_MATCH_INFO.begin(0), match]
34
- ''
35
- end
36
-
37
- [plain_text, markups]
30
+ extract_markups_by_pattern(str, TAG)
38
31
  end
39
32
 
40
33
  def start_formatting?(markup)
@@ -48,6 +41,21 @@ module Srx
48
41
  def isolated_formatting?(markup)
49
42
  EMPTY_ELEM_TAG.match?(markup)
50
43
  end
44
+
45
+ protected
46
+
47
+ # @param str [String]
48
+ # @param pattern [Regexp]
49
+ def extract_markups_by_pattern(str, pattern)
50
+ markups = []
51
+
52
+ plain_text = str.gsub(pattern) do |match|
53
+ markups << [$LAST_MATCH_INFO.begin(0), match]
54
+ ''
55
+ end
56
+
57
+ [plain_text, markups]
58
+ end
51
59
  end
52
60
  end
53
61
  end
data/lib/srx/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Srx
4
- VERSION = '0.5.0'
4
+ VERSION = '0.6.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: srx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-02-25 00:00:00.000000000 Z
11
+ date: 2021-04-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri