srx 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +1 -1
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +2 -2
- data/README.md +5 -2
- data/lib/srx/format.rb +2 -1
- data/lib/srx/format/html.rb +41 -0
- data/lib/srx/version.rb +1 -1
- data/srx.gemspec +1 -0
- metadata +4 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 248fc2b9cd220023564c26a792536b1593b2bdefe77b71e133805658bd169d13
|
4
|
+
data.tar.gz: 7aa58a5e7fa2255219aaf83f7df1b09ebde3711a59c84d0cce7b3811af6f869e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0af9311397924bf58131fd753e879f63f32c3706998f5cbb9c3c06ab51cb626a2417d31cd52dba2d905b571d5489efc1ad088a334f7b868fdd66e8e82a64f547
|
7
|
+
data.tar.gz: 969da7b926a664411a887948f17ff011d8e4040ef970e71d2ecde92ff05e39039aeb2a51238326be83418fda7bdf279241145c024cf41bccc8d4d515027cc039
|
data/.github/workflows/main.yml
CHANGED
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
ADDED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -13,7 +13,10 @@ This gem provides facilities for reading SRX files and an engine for performing
|
|
13
13
|
segmentation.
|
14
14
|
|
15
15
|
Only a minimal rule set is supplied by default; for actual usage you are
|
16
|
-
encouraged to supply your own SRX rules.
|
16
|
+
encouraged to supply your own SRX rules. One such set of rules is that from
|
17
|
+
[LanguageTool](https://languagetool.org/); this is conveniently packaged into a
|
18
|
+
companion gem:
|
19
|
+
[srx-languagetool-ruby](https://github.com/amake/srx-languagetool-ruby).
|
17
20
|
|
18
21
|
## What's different about this gem?
|
19
22
|
|
@@ -43,7 +46,7 @@ Some disadvantages:
|
|
43
46
|
test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
|
44
47
|
scoring 47% (English) and 48% (others) with the default rules. However you can
|
45
48
|
improve on that with better rules such as
|
46
|
-
[LanguageTool's](https://github.com/
|
49
|
+
[LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
|
47
50
|
|
48
51
|
## Installation
|
49
52
|
|
data/lib/srx/format.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
require_relative 'format/base_format'
|
4
4
|
require_relative 'format/text'
|
5
5
|
require_relative 'format/xml'
|
6
|
+
require_relative 'format/html'
|
6
7
|
|
7
8
|
module Srx
|
8
9
|
# Format-specific data and logic
|
@@ -10,7 +11,7 @@ module Srx
|
|
10
11
|
FORMATS = {
|
11
12
|
text: Text.new,
|
12
13
|
xml: Xml.new,
|
13
|
-
html:
|
14
|
+
html: Html.new
|
14
15
|
}.freeze
|
15
16
|
|
16
17
|
class << self
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
require 'English'
|
5
|
+
|
6
|
+
module Srx
|
7
|
+
module Format
|
8
|
+
# Support for HTML
|
9
|
+
#
|
10
|
+
# @see https://www.w3.org/TR/xml/
|
11
|
+
class Html < Xml
|
12
|
+
START_TAG_CAPTURE = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{Xml::ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
|
13
|
+
|
14
|
+
# A set of HTML tags that are "void elements", meaning they do not need a
|
15
|
+
# paired closing tag.
|
16
|
+
#
|
17
|
+
# @see https://html.spec.whatwg.org/#void-elements
|
18
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command
|
19
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/keygen
|
20
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/menuitem
|
21
|
+
VOID_ELEMENTS = Set[
|
22
|
+
'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
|
23
|
+
'link', 'meta', 'menuitem', 'param', 'source', 'track', 'wbr'
|
24
|
+
].freeze
|
25
|
+
|
26
|
+
def start_formatting?(markup)
|
27
|
+
START_TAG_CAPTURE.match(markup) do |m|
|
28
|
+
!VOID_ELEMENTS.include?(m['name'])
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def isolated_formatting?(markup)
|
33
|
+
return true if super(markup)
|
34
|
+
|
35
|
+
START_TAG_CAPTURE.match(markup) do |m|
|
36
|
+
VOID_ELEMENTS.include?(m['name'])
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/lib/srx/version.rb
CHANGED
data/srx.gemspec
CHANGED
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
|
|
15
15
|
|
16
16
|
spec.metadata['homepage_uri'] = spec.homepage
|
17
17
|
spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'
|
18
|
+
spec.metadata['changelog_uri'] = 'https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md'
|
18
19
|
|
19
20
|
# Specify which files should be added to the gem when it is released.
|
20
21
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: srx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
@@ -135,6 +135,7 @@ files:
|
|
135
135
|
- ".rubocop.yml"
|
136
136
|
- ".rubocop_todo.yml"
|
137
137
|
- ".solargraph.yml"
|
138
|
+
- CHANGELOG.md
|
138
139
|
- Gemfile
|
139
140
|
- Gemfile.lock
|
140
141
|
- LICENSE.txt
|
@@ -150,6 +151,7 @@ files:
|
|
150
151
|
- lib/srx/engine.rb
|
151
152
|
- lib/srx/format.rb
|
152
153
|
- lib/srx/format/base_format.rb
|
154
|
+
- lib/srx/format/html.rb
|
153
155
|
- lib/srx/format/text.rb
|
154
156
|
- lib/srx/format/xml.rb
|
155
157
|
- lib/srx/icu_regex.rb
|
@@ -163,6 +165,7 @@ licenses:
|
|
163
165
|
metadata:
|
164
166
|
homepage_uri: https://github.com/amake/srx-ruby
|
165
167
|
source_code_uri: https://github.com/amake/srx-ruby.git
|
168
|
+
changelog_uri: https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md
|
166
169
|
post_install_message:
|
167
170
|
rdoc_options: []
|
168
171
|
require_paths:
|