srx 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +1 -1
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +2 -2
- data/README.md +5 -2
- data/lib/srx/format.rb +2 -1
- data/lib/srx/format/html.rb +41 -0
- data/lib/srx/version.rb +1 -1
- data/srx.gemspec +1 -0
- metadata +4 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 248fc2b9cd220023564c26a792536b1593b2bdefe77b71e133805658bd169d13
|
4
|
+
data.tar.gz: 7aa58a5e7fa2255219aaf83f7df1b09ebde3711a59c84d0cce7b3811af6f869e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0af9311397924bf58131fd753e879f63f32c3706998f5cbb9c3c06ab51cb626a2417d31cd52dba2d905b571d5489efc1ad088a334f7b868fdd66e8e82a64f547
|
7
|
+
data.tar.gz: 969da7b926a664411a887948f17ff011d8e4040ef970e71d2ecde92ff05e39039aeb2a51238326be83418fda7bdf279241145c024cf41bccc8d4d515027cc039
|
data/.github/workflows/main.yml
CHANGED
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
ADDED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -13,7 +13,10 @@ This gem provides facilities for reading SRX files and an engine for performing
|
|
13
13
|
segmentation.
|
14
14
|
|
15
15
|
Only a minimal rule set is supplied by default; for actual usage you are
|
16
|
-
encouraged to supply your own SRX rules.
|
16
|
+
encouraged to supply your own SRX rules. One such set of rules is that from
|
17
|
+
[LanguageTool](https://languagetool.org/); this is conveniently packaged into a
|
18
|
+
companion gem:
|
19
|
+
[srx-languagetool-ruby](https://github.com/amake/srx-languagetool-ruby).
|
17
20
|
|
18
21
|
## What's different about this gem?
|
19
22
|
|
@@ -43,7 +46,7 @@ Some disadvantages:
|
|
43
46
|
test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
|
44
47
|
scoring 47% (English) and 48% (others) with the default rules. However you can
|
45
48
|
improve on that with better rules such as
|
46
|
-
[LanguageTool's](https://github.com/
|
49
|
+
[LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
|
47
50
|
|
48
51
|
## Installation
|
49
52
|
|
data/lib/srx/format.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
require_relative 'format/base_format'
|
4
4
|
require_relative 'format/text'
|
5
5
|
require_relative 'format/xml'
|
6
|
+
require_relative 'format/html'
|
6
7
|
|
7
8
|
module Srx
|
8
9
|
# Format-specific data and logic
|
@@ -10,7 +11,7 @@ module Srx
|
|
10
11
|
FORMATS = {
|
11
12
|
text: Text.new,
|
12
13
|
xml: Xml.new,
|
13
|
-
html:
|
14
|
+
html: Html.new
|
14
15
|
}.freeze
|
15
16
|
|
16
17
|
class << self
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
require 'English'
|
5
|
+
|
6
|
+
module Srx
|
7
|
+
module Format
|
8
|
+
# Support for HTML
|
9
|
+
#
|
10
|
+
# @see https://html.spec.whatwg.org/
|
11
|
+
class Html < Xml
|
12
|
+
START_TAG_CAPTURE = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{Xml::ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
|
13
|
+
|
14
|
+
# A set of HTML tags that are "void elements", meaning they do not need a
|
15
|
+
# paired closing tag.
|
16
|
+
#
|
17
|
+
# @see https://html.spec.whatwg.org/#void-elements
|
18
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command
|
19
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/keygen
|
20
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/menuitem
|
21
|
+
VOID_ELEMENTS = Set[
|
22
|
+
'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
|
23
|
+
'keygen', 'link', 'meta', 'menuitem', 'param', 'source', 'track', 'wbr'
|
24
|
+
].freeze
|
25
|
+
|
26
|
+
def start_formatting?(markup)
|
27
|
+
START_TAG_CAPTURE.match(markup) do |m|
|
28
|
+
!VOID_ELEMENTS.include?(m['name'])
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def isolated_formatting?(markup)
|
33
|
+
return true if super(markup)
|
34
|
+
|
35
|
+
START_TAG_CAPTURE.match(markup) do |m|
|
36
|
+
VOID_ELEMENTS.include?(m['name'])
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/lib/srx/version.rb
CHANGED
data/srx.gemspec
CHANGED
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
|
|
15
15
|
|
16
16
|
spec.metadata['homepage_uri'] = spec.homepage
|
17
17
|
spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'
|
18
|
+
spec.metadata['changelog_uri'] = 'https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md'
|
18
19
|
|
19
20
|
# Specify which files should be added to the gem when it is released.
|
20
21
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: srx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
@@ -135,6 +135,7 @@ files:
|
|
135
135
|
- ".rubocop.yml"
|
136
136
|
- ".rubocop_todo.yml"
|
137
137
|
- ".solargraph.yml"
|
138
|
+
- CHANGELOG.md
|
138
139
|
- Gemfile
|
139
140
|
- Gemfile.lock
|
140
141
|
- LICENSE.txt
|
@@ -150,6 +151,7 @@ files:
|
|
150
151
|
- lib/srx/engine.rb
|
151
152
|
- lib/srx/format.rb
|
152
153
|
- lib/srx/format/base_format.rb
|
154
|
+
- lib/srx/format/html.rb
|
153
155
|
- lib/srx/format/text.rb
|
154
156
|
- lib/srx/format/xml.rb
|
155
157
|
- lib/srx/icu_regex.rb
|
@@ -163,6 +165,7 @@ licenses:
|
|
163
165
|
metadata:
|
164
166
|
homepage_uri: https://github.com/amake/srx-ruby
|
165
167
|
source_code_uri: https://github.com/amake/srx-ruby.git
|
168
|
+
changelog_uri: https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md
|
166
169
|
post_install_message:
|
167
170
|
rdoc_options: []
|
168
171
|
require_paths:
|