srx 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 18c2fc1da5f4a792393a9dd4955c79d73bc9d00943000f38689a5763d8c8ff4c
4
- data.tar.gz: 2385b9f65e61ad291e1419e217b737653bbeea0d4f62dc7fbb4ae35e6948984c
3
+ metadata.gz: 248fc2b9cd220023564c26a792536b1593b2bdefe77b71e133805658bd169d13
4
+ data.tar.gz: 7aa58a5e7fa2255219aaf83f7df1b09ebde3711a59c84d0cce7b3811af6f869e
5
5
  SHA512:
6
- metadata.gz: 915b837e6a239f7de688d51ca3fca2bbdcf898234f825cbd49894223e8c31f71167d6a22d9097a36ab75ea3fc8f383ae790d3df8e0e3fce2fb8210022412fad2
7
- data.tar.gz: 326d3de2c328aa49be07c7df62229fe40c8cf3b2c4101f090b9c805b9e3a2fcd1c8eec155b6b3af9c556ff5c548d5263425975bdc9b0368b334342ce47dd71d9
6
+ metadata.gz: 0af9311397924bf58131fd753e879f63f32c3706998f5cbb9c3c06ab51cb626a2417d31cd52dba2d905b571d5489efc1ad088a334f7b868fdd66e8e82a64f547
7
+ data.tar.gz: 969da7b926a664411a887948f17ff011d8e4040ef970e71d2ecde92ff05e39039aeb2a51238326be83418fda7bdf279241145c024cf41bccc8d4d515027cc039
@@ -13,7 +13,7 @@ jobs:
13
13
  ruby-version: 2.7.2
14
14
  - name: Install
15
15
  run: |
16
- gem install bundler -v 2.2.7
16
+ gem install bundler -v 2.2.9
17
17
  bundle install
18
18
  - name: Type check
19
19
  run: bundle exec solargraph typecheck --level typed
data/.rubocop.yml CHANGED
@@ -10,4 +10,4 @@ AllCops:
10
10
  Layout/LineLength:
11
11
  Max: 120
12
12
  Exclude:
13
- - 'test/srx/golden_rules_test.rb'
13
+ - 'test/golden_rules_test.rb'
data/CHANGELOG.md ADDED
@@ -0,0 +1,9 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.2.0] - 2021-02-13
4
+
5
+ - Handle HTML void elements correctly
6
+
7
+ ## [0.1.0] - 2021-02-13
8
+
9
+ - Initial release
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- srx (0.1.0)
4
+ srx (0.2.0)
5
5
  nokogiri (~> 1.11)
6
6
 
7
7
  GEM
@@ -81,4 +81,4 @@ DEPENDENCIES
81
81
  srx!
82
82
 
83
83
  BUNDLED WITH
84
- 2.2.7
84
+ 2.2.9
data/README.md CHANGED
@@ -13,7 +13,10 @@ This gem provides facilities for reading SRX files and an engine for performing
13
13
  segmentation.
14
14
 
15
15
  Only a minimal rule set is supplied by default; for actual usage you are
16
- encouraged to supply your own SRX rules.
16
+ encouraged to supply your own SRX rules. One such set of rules is that from
17
+ [LanguageTool](https://languagetool.org/); this is conveniently packaged into a
18
+ companion gem:
19
+ [srx-languagetool-ruby](https://github.com/amake/srx-languagetool-ruby).
17
20
 
18
21
  ## What's different about this gem?
19
22
 
@@ -43,7 +46,7 @@ Some disadvantages:
43
46
  test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
44
47
  scoring 47% (English) and 48% (others) with the default rules. However you can
45
48
  improve on that with better rules such as
46
- [LanguageTool's](https://github.com/languagetool-org/languagetool/blob/05707300df14668e97d064811931e0668f2b695b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx).
49
+ [LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
47
50
 
48
51
  ## Installation
49
52
 
data/lib/srx/format.rb CHANGED
@@ -3,6 +3,7 @@
3
3
  require_relative 'format/base_format'
4
4
  require_relative 'format/text'
5
5
  require_relative 'format/xml'
6
+ require_relative 'format/html'
6
7
 
7
8
  module Srx
8
9
  # Format-specific data and logic
@@ -10,7 +11,7 @@ module Srx
10
11
  FORMATS = {
11
12
  text: Text.new,
12
13
  xml: Xml.new,
13
- html: Xml.new # TODO: specialize for HTML
14
+ html: Html.new
14
15
  }.freeze
15
16
 
16
17
  class << self
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set'
4
+ require 'English'
5
+
6
+ module Srx
7
+ module Format
8
+ # Support for HTML
9
+ #
10
+ # @see https://html.spec.whatwg.org/
11
+ class Html < Xml
12
+ START_TAG_CAPTURE = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{Xml::ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
13
+
14
+ # A set of HTML tags that are "void elements", meaning they do not need a
15
+ # paired closing tag.
16
+ #
17
+ # @see https://html.spec.whatwg.org/#void-elements
18
+ # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command
19
+ # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/keygen
20
+ # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/menuitem
21
+ VOID_ELEMENTS = Set[
22
+ 'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
23
+ 'link', 'meta', 'menuitem', 'param', 'source', 'track', 'wbr'
24
+ ].freeze
25
+
26
+ def start_formatting?(markup)
27
+ START_TAG_CAPTURE.match(markup) do |m|
28
+ !VOID_ELEMENTS.include?(m['name'])
29
+ end
30
+ end
31
+
32
+ def isolated_formatting?(markup)
33
+ return true if super(markup)
34
+
35
+ START_TAG_CAPTURE.match(markup) do |m|
36
+ VOID_ELEMENTS.include?(m['name'])
37
+ end
38
+ end
39
+ end
40
+ end
41
+ end
data/lib/srx/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Srx
4
- VERSION = '0.1.0'
4
+ VERSION = '0.2.0'
5
5
  end
data/srx.gemspec CHANGED
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
15
15
 
16
16
  spec.metadata['homepage_uri'] = spec.homepage
17
17
  spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'
18
+ spec.metadata['changelog_uri'] = 'https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md'
18
19
 
19
20
  # Specify which files should be added to the gem when it is released.
20
21
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: srx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
@@ -135,6 +135,7 @@ files:
135
135
  - ".rubocop.yml"
136
136
  - ".rubocop_todo.yml"
137
137
  - ".solargraph.yml"
138
+ - CHANGELOG.md
138
139
  - Gemfile
139
140
  - Gemfile.lock
140
141
  - LICENSE.txt
@@ -150,6 +151,7 @@ files:
150
151
  - lib/srx/engine.rb
151
152
  - lib/srx/format.rb
152
153
  - lib/srx/format/base_format.rb
154
+ - lib/srx/format/html.rb
153
155
  - lib/srx/format/text.rb
154
156
  - lib/srx/format/xml.rb
155
157
  - lib/srx/icu_regex.rb
@@ -163,6 +165,7 @@ licenses:
163
165
  metadata:
164
166
  homepage_uri: https://github.com/amake/srx-ruby
165
167
  source_code_uri: https://github.com/amake/srx-ruby.git
168
+ changelog_uri: https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md
166
169
  post_install_message:
167
170
  rdoc_options: []
168
171
  require_paths: