srx 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 18c2fc1da5f4a792393a9dd4955c79d73bc9d00943000f38689a5763d8c8ff4c
4
- data.tar.gz: 2385b9f65e61ad291e1419e217b737653bbeea0d4f62dc7fbb4ae35e6948984c
3
+ metadata.gz: 248fc2b9cd220023564c26a792536b1593b2bdefe77b71e133805658bd169d13
4
+ data.tar.gz: 7aa58a5e7fa2255219aaf83f7df1b09ebde3711a59c84d0cce7b3811af6f869e
5
5
  SHA512:
6
- metadata.gz: 915b837e6a239f7de688d51ca3fca2bbdcf898234f825cbd49894223e8c31f71167d6a22d9097a36ab75ea3fc8f383ae790d3df8e0e3fce2fb8210022412fad2
7
- data.tar.gz: 326d3de2c328aa49be07c7df62229fe40c8cf3b2c4101f090b9c805b9e3a2fcd1c8eec155b6b3af9c556ff5c548d5263425975bdc9b0368b334342ce47dd71d9
6
+ metadata.gz: 0af9311397924bf58131fd753e879f63f32c3706998f5cbb9c3c06ab51cb626a2417d31cd52dba2d905b571d5489efc1ad088a334f7b868fdd66e8e82a64f547
7
+ data.tar.gz: 969da7b926a664411a887948f17ff011d8e4040ef970e71d2ecde92ff05e39039aeb2a51238326be83418fda7bdf279241145c024cf41bccc8d4d515027cc039
@@ -13,7 +13,7 @@ jobs:
13
13
  ruby-version: 2.7.2
14
14
  - name: Install
15
15
  run: |
16
- gem install bundler -v 2.2.7
16
+ gem install bundler -v 2.2.9
17
17
  bundle install
18
18
  - name: Type check
19
19
  run: bundle exec solargraph typecheck --level typed
data/.rubocop.yml CHANGED
@@ -10,4 +10,4 @@ AllCops:
10
10
  Layout/LineLength:
11
11
  Max: 120
12
12
  Exclude:
13
- - 'test/srx/golden_rules_test.rb'
13
+ - 'test/golden_rules_test.rb'
data/CHANGELOG.md ADDED
@@ -0,0 +1,9 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.2.0] - 2021-02-13
4
+
5
+ - Handle HTML void elements correctly
6
+
7
+ ## [0.1.0] - 2021-02-13
8
+
9
+ - Initial release
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- srx (0.1.0)
4
+ srx (0.2.0)
5
5
  nokogiri (~> 1.11)
6
6
 
7
7
  GEM
@@ -81,4 +81,4 @@ DEPENDENCIES
81
81
  srx!
82
82
 
83
83
  BUNDLED WITH
84
- 2.2.7
84
+ 2.2.9
data/README.md CHANGED
@@ -13,7 +13,10 @@ This gem provides facilities for reading SRX files and an engine for performing
13
13
  segmentation.
14
14
 
15
15
  Only a minimal rule set is supplied by default; for actual usage you are
16
- encouraged to supply your own SRX rules.
16
+ encouraged to supply your own SRX rules. One such set of rules is that from
17
+ [LanguageTool](https://languagetool.org/); this is conveniently packaged into a
18
+ companion gem:
19
+ [srx-languagetool-ruby](https://github.com/amake/srx-languagetool-ruby).
17
20
 
18
21
  ## What's different about this gem?
19
22
 
@@ -43,7 +46,7 @@ Some disadvantages:
43
46
  test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
44
47
  scoring 47% (English) and 48% (others) with the default rules. However, you can
45
48
  improve on that with better rules such as
46
- [LanguageTool's](https://github.com/languagetool-org/languagetool/blob/05707300df14668e97d064811931e0668f2b695b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx).
49
+ [LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
47
50
 
48
51
  ## Installation
49
52
 
data/lib/srx/format.rb CHANGED
@@ -3,6 +3,7 @@
3
3
  require_relative 'format/base_format'
4
4
  require_relative 'format/text'
5
5
  require_relative 'format/xml'
6
+ require_relative 'format/html'
6
7
 
7
8
  module Srx
8
9
  # Format-specific data and logic
@@ -10,7 +11,7 @@ module Srx
10
11
  FORMATS = {
11
12
  text: Text.new,
12
13
  xml: Xml.new,
13
- html: Xml.new # TODO: specialize for HTML
14
+ html: Html.new
14
15
  }.freeze
15
16
 
16
17
  class << self
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set'
4
+ require 'English'
5
+
6
module Srx
  module Format
    # Support for HTML.
    #
    # Builds on the Xml format and additionally recognizes HTML "void
    # elements": start tags that never take a paired closing tag.
    #
    # @see https://html.spec.whatwg.org/
    class Html < Xml
      # Matches a start tag (name, optional attributes, optional trailing
      # space) and captures the element name as +name+. The pattern is not
      # anchored, so it finds the first start tag anywhere in the markup.
      START_TAG_CAPTURE = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{Xml::ATTRIBUTE})*#{Xml::SPACE}?>/.freeze

      # A set of HTML tags that are "void elements", meaning they do not need a
      # paired closing tag. Names are stored in lowercase.
      #
      # The obsolete elements +command+, +keygen+ and +menuitem+ are included
      # alongside the current WHATWG list.
      #
      # @see https://html.spec.whatwg.org/#void-elements
      # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command
      # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/keygen
      # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/menuitem
      VOID_ELEMENTS = Set[
        'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'meta', 'menuitem', 'param', 'source', 'track', 'wbr'
      ].freeze

      # Whether +markup+ is a start tag that opens a paired element.
      #
      # @param markup [String]
      # @return [Boolean, nil] +true+ for a non-void start tag, +false+ for a
      #   void element's start tag, +nil+ when no start tag is found
      def start_formatting?(markup)
        START_TAG_CAPTURE.match(markup) { |tag| !void_element?(tag['name']) }
      end

      # Whether +markup+ stands alone, with no matching closing tag expected.
      #
      # Anything the parent format already reports as isolated stays isolated;
      # in addition, the start tag of a void element counts as isolated.
      #
      # @param markup [String]
      # @return [Boolean, nil] +nil+ when the parent does not consider the
      #   markup isolated and no start tag is found
      def isolated_formatting?(markup)
        return true if super(markup)

        START_TAG_CAPTURE.match(markup) { |tag| void_element?(tag['name']) }
      end

      private

      # HTML tag names are ASCII case-insensitive, so normalize before lookup
      # (+<BR>+ is as void as +<br>+).
      #
      # @param name [String] element name as written in the markup
      # @return [Boolean]
      def void_element?(name)
        VOID_ELEMENTS.include?(name.downcase)
      end
    end
  end
end
data/lib/srx/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Srx
4
- VERSION = '0.1.0'
4
+ VERSION = '0.2.0'
5
5
  end
data/srx.gemspec CHANGED
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
15
15
 
16
16
  spec.metadata['homepage_uri'] = spec.homepage
17
17
  spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'
18
+ spec.metadata['changelog_uri'] = 'https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md'
18
19
 
19
20
  # Specify which files should be added to the gem when it is released.
20
21
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: srx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
@@ -135,6 +135,7 @@ files:
135
135
  - ".rubocop.yml"
136
136
  - ".rubocop_todo.yml"
137
137
  - ".solargraph.yml"
138
+ - CHANGELOG.md
138
139
  - Gemfile
139
140
  - Gemfile.lock
140
141
  - LICENSE.txt
@@ -150,6 +151,7 @@ files:
150
151
  - lib/srx/engine.rb
151
152
  - lib/srx/format.rb
152
153
  - lib/srx/format/base_format.rb
154
+ - lib/srx/format/html.rb
153
155
  - lib/srx/format/text.rb
154
156
  - lib/srx/format/xml.rb
155
157
  - lib/srx/icu_regex.rb
@@ -163,6 +165,7 @@ licenses:
163
165
  metadata:
164
166
  homepage_uri: https://github.com/amake/srx-ruby
165
167
  source_code_uri: https://github.com/amake/srx-ruby.git
168
+ changelog_uri: https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md
166
169
  post_install_message:
167
170
  rdoc_options: []
168
171
  require_paths: