srx 0.1.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +1 -1
- data/.rubocop.yml +5 -1
- data/.rubocop_todo.yml +5 -5
- data/CHANGELOG.md +29 -0
- data/Gemfile.lock +10 -10
- data/README.md +20 -3
- data/lib/srx/data.rb +9 -9
- data/lib/srx/engine.rb +17 -14
- data/lib/srx/format.rb +3 -2
- data/lib/srx/format/html.rb +57 -0
- data/lib/srx/format/xml.rb +16 -8
- data/lib/srx/icu_regex.rb +9 -2
- data/lib/srx/srx-20-sample.srx +1 -1
- data/lib/srx/version.rb +1 -1
- data/srx.gemspec +1 -0
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f678f4fe5e7f30edbe600f27a5dd00217528d84291248ebf76f8d5c4df62f333
|
|
4
|
+
data.tar.gz: 5d096e1688a5c57756bb86be9a31d504aa99cbb877de434737e5c4578808f4f1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 78d28e9ebb51c4ac61f2b15dc27d527f802a7a9c40812741ca817a14b0e231e0dce310f0b9c38411fb8dfe2f02953ca7344a90ad85fea5290d2b234be5795130
|
|
7
|
+
data.tar.gz: 2521273e4f164b44be0f43b2dd54cc17f380edc130e600c37cf8ef2b0ae847e287e91a59c036e638461462c7b2fc83401a5c348501429fe107becacbdd906188
|
data/.github/workflows/main.yml
CHANGED
data/.rubocop.yml
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
inherit_from: .rubocop_todo.yml
|
|
2
2
|
|
|
3
|
+
inherit_mode:
|
|
4
|
+
merge:
|
|
5
|
+
- Exclude
|
|
6
|
+
|
|
3
7
|
AllCops:
|
|
4
8
|
TargetRubyVersion: 2.4
|
|
5
9
|
SuggestExtensions: false
|
|
@@ -10,4 +14,4 @@ AllCops:
|
|
|
10
14
|
Layout/LineLength:
|
|
11
15
|
Max: 120
|
|
12
16
|
Exclude:
|
|
13
|
-
- 'test/
|
|
17
|
+
- 'test/golden_rules_test.rb'
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2021-02-
|
|
3
|
+
# on 2021-02-25 04:43:35 UTC using RuboCop version 1.10.0.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
|
8
8
|
|
|
9
|
-
# Offense count:
|
|
9
|
+
# Offense count: 5
|
|
10
10
|
# Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
|
|
11
11
|
Metrics/AbcSize:
|
|
12
12
|
Max: 24
|
|
@@ -17,15 +17,15 @@ Metrics/AbcSize:
|
|
|
17
17
|
Metrics/BlockLength:
|
|
18
18
|
Max: 269
|
|
19
19
|
|
|
20
|
-
# Offense count:
|
|
20
|
+
# Offense count: 2
|
|
21
21
|
# Configuration parameters: IgnoredMethods.
|
|
22
22
|
Metrics/CyclomaticComplexity:
|
|
23
23
|
Max: 9
|
|
24
24
|
|
|
25
|
-
# Offense count:
|
|
25
|
+
# Offense count: 9
|
|
26
26
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
|
27
27
|
Metrics/MethodLength:
|
|
28
|
-
Max:
|
|
28
|
+
Max: 26
|
|
29
29
|
|
|
30
30
|
# Offense count: 1
|
|
31
31
|
# Configuration parameters: IgnoredMethods.
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
## [Unreleased]
|
|
2
|
+
|
|
3
|
+
## [0.6.0] - 2021-04-15
|
|
4
|
+
|
|
5
|
+
- Improved HTML parsing accuracy
|
|
6
|
+
|
|
7
|
+
## [0.5.0] - 2021-02-25
|
|
8
|
+
|
|
9
|
+
- When `nil` is supplied for the `language` parameter, it is now treated as the
|
|
10
|
+
empty string for rule-matching purposes (previously it would match no rules)
|
|
11
|
+
|
|
12
|
+
## [0.4.0] - 2021-02-18
|
|
13
|
+
|
|
14
|
+
- Optimize memory usage
|
|
15
|
+
|
|
16
|
+
## [0.3.0] - 2021-02-16
|
|
17
|
+
|
|
18
|
+
- All `Srx::Engine` methods except `#segment` are now private
|
|
19
|
+
- ICU regex syntax `\xhhhh` is no longer converted to Ruby regex, as this
|
|
20
|
+
syntax was not correct; it now must be `\x{hhhh}`
|
|
21
|
+
- ICU regex syntax `\0ooo` is now supported
|
|
22
|
+
|
|
23
|
+
## [0.2.0] - 2021-02-13
|
|
24
|
+
|
|
25
|
+
- Handle HTML void elements correctly
|
|
26
|
+
|
|
27
|
+
## [0.1.0] - 2021-02-13
|
|
28
|
+
|
|
29
|
+
- Initial release
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
srx (0.
|
|
4
|
+
srx (0.6.0)
|
|
5
5
|
nokogiri (~> 1.11)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -14,29 +14,29 @@ GEM
|
|
|
14
14
|
diff-lcs (1.4.4)
|
|
15
15
|
e2mmap (0.1.0)
|
|
16
16
|
jaro_winkler (1.5.4)
|
|
17
|
-
kramdown (2.3.
|
|
17
|
+
kramdown (2.3.1)
|
|
18
18
|
rexml
|
|
19
19
|
kramdown-parser-gfm (1.1.0)
|
|
20
20
|
kramdown (~> 2.0)
|
|
21
21
|
memory_profiler (1.0.0)
|
|
22
|
-
minitest (5.14.
|
|
23
|
-
nokogiri (1.11.
|
|
22
|
+
minitest (5.14.4)
|
|
23
|
+
nokogiri (1.11.3-x86_64-darwin)
|
|
24
24
|
racc (~> 1.4)
|
|
25
25
|
parallel (1.20.1)
|
|
26
|
-
parser (3.0.
|
|
26
|
+
parser (3.0.1.0)
|
|
27
27
|
ast (~> 2.4.1)
|
|
28
28
|
racc (1.5.2)
|
|
29
29
|
rainbow (3.0.0)
|
|
30
30
|
rake (13.0.3)
|
|
31
|
-
regexp_parser (2.
|
|
31
|
+
regexp_parser (2.1.1)
|
|
32
32
|
reverse_markdown (2.0.0)
|
|
33
33
|
nokogiri
|
|
34
|
-
rexml (3.2.
|
|
34
|
+
rexml (3.2.5)
|
|
35
35
|
rspec-expectations (3.10.1)
|
|
36
36
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
37
37
|
rspec-support (~> 3.10.0)
|
|
38
38
|
rspec-support (3.10.2)
|
|
39
|
-
rubocop (1.
|
|
39
|
+
rubocop (1.12.1)
|
|
40
40
|
parallel (~> 1.10)
|
|
41
41
|
parser (>= 3.0.0.0)
|
|
42
42
|
rainbow (>= 2.2.2, < 4.0)
|
|
@@ -48,7 +48,7 @@ GEM
|
|
|
48
48
|
rubocop-ast (1.4.1)
|
|
49
49
|
parser (>= 2.7.1.5)
|
|
50
50
|
ruby-progressbar (1.11.0)
|
|
51
|
-
solargraph (0.40.
|
|
51
|
+
solargraph (0.40.4)
|
|
52
52
|
backport (~> 1.1)
|
|
53
53
|
benchmark
|
|
54
54
|
bundler (>= 1.17.2)
|
|
@@ -81,4 +81,4 @@ DEPENDENCIES
|
|
|
81
81
|
srx!
|
|
82
82
|
|
|
83
83
|
BUNDLED WITH
|
|
84
|
-
2.2.
|
|
84
|
+
2.2.9
|
data/README.md
CHANGED
|
@@ -13,7 +13,10 @@ This gem provides facilities for reading SRX files and an engine for performing
|
|
|
13
13
|
segmentation.
|
|
14
14
|
|
|
15
15
|
Only a minimal rule set is supplied by default; for actual usage you are
|
|
16
|
-
encouraged to supply your own SRX rules.
|
|
16
|
+
encouraged to supply your own SRX rules. One such set of rules is that from
|
|
17
|
+
[LanguageTool](https://languagetool.org/); this is conveniently packaged into a
|
|
18
|
+
companion gem:
|
|
19
|
+
[srx-languagetool-ruby](https://github.com/amake/srx-languagetool-ruby).
|
|
17
20
|
|
|
18
21
|
## What's different about this gem?
|
|
19
22
|
|
|
@@ -43,7 +46,21 @@ Some disadvantages:
|
|
|
43
46
|
test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
|
|
44
47
|
scoring 47% (English) and 48% (others) with the default rules. However, you can
|
|
45
48
|
improve on that with better rules such as
|
|
46
|
-
[LanguageTool's](https://github.com/
|
|
49
|
+
[LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
|
|
50
|
+
|
|
51
|
+
## Caveats
|
|
52
|
+
|
|
53
|
+
The SRX spec calls for [ICU regular
|
|
54
|
+
expressions](https://unicode-org.github.io/icu/userguide/strings/regexp.html),
|
|
55
|
+
but this library uses standard [Ruby
|
|
56
|
+
regexp](https://ruby-doc.org/core-2.7.0/Regexp.html). Please note:
|
|
57
|
+
|
|
58
|
+
- Not all ICU syntax is supported
|
|
59
|
+
- For supported syntax, in some cases the meaning of a regex may differ when
|
|
60
|
+
interpreted as Ruby regexp
|
|
61
|
+
- The following ICU syntax is supported through translation to Ruby syntax:
|
|
62
|
+
- `\x{hhhh}` → `\u{hhhh}`
|
|
63
|
+
- `\0ooo` → `\u{hhhh}`
|
|
47
64
|
|
|
48
65
|
## Installation
|
|
49
66
|
|
|
@@ -93,7 +110,7 @@ input = 'foo <bar baz="a. b."> bazinga'
|
|
|
93
110
|
Srx::Engine.new(Data.default).segment(input, language: 'en')
|
|
94
111
|
#=> ["foo <bar baz=\"a.", " b.\"> bazinga"]
|
|
95
112
|
|
|
96
|
-
Srx::Engine.new(
|
|
113
|
+
Srx::Engine.new(Data.default, format: :xml).segment(input, language: 'en')
|
|
97
114
|
#=> ["foo <bar baz=\"a. b.\"> bazinga"]
|
|
98
115
|
```
|
|
99
116
|
|
data/lib/srx/data.rb
CHANGED
|
@@ -43,23 +43,23 @@ module Srx
|
|
|
43
43
|
end
|
|
44
44
|
|
|
45
45
|
def segment_subflows?
|
|
46
|
-
header['segmentsubflows'] == 'yes'
|
|
46
|
+
@segment_subflows ||= header['segmentsubflows'] == 'yes'
|
|
47
47
|
end
|
|
48
48
|
|
|
49
49
|
def cascade?
|
|
50
|
-
header['cascade'] == 'yes'
|
|
50
|
+
@cascade ||= header['cascade'] == 'yes'
|
|
51
51
|
end
|
|
52
52
|
|
|
53
53
|
def include_start_formatting?
|
|
54
|
-
include_formatting?(:start)
|
|
54
|
+
@include_start_formatting ||= include_formatting?(:start)
|
|
55
55
|
end
|
|
56
56
|
|
|
57
57
|
def include_end_formatting?
|
|
58
|
-
include_formatting?(:end)
|
|
58
|
+
@include_end_formatting ||= include_formatting?(:end)
|
|
59
59
|
end
|
|
60
60
|
|
|
61
61
|
def include_isolated_formatting?
|
|
62
|
-
include_formatting?(:isolated)
|
|
62
|
+
@include_isolated_formatting ||= include_formatting?(:isolated)
|
|
63
63
|
end
|
|
64
64
|
|
|
65
65
|
# @return [Array<LanguageRule>]
|
|
@@ -108,7 +108,7 @@ module Srx
|
|
|
108
108
|
class LanguageRule < XmlWrapper
|
|
109
109
|
# @return [String]
|
|
110
110
|
def name
|
|
111
|
-
@xml['languagerulename']
|
|
111
|
+
@name ||= @xml['languagerulename']
|
|
112
112
|
end
|
|
113
113
|
|
|
114
114
|
# @return [Array<Rule>]
|
|
@@ -130,13 +130,13 @@ module Srx
|
|
|
130
130
|
# Eagerly load everything for this class because before_break and
|
|
131
131
|
# after_break can be legitimately nil, so lazy loading gets ugly.
|
|
132
132
|
|
|
133
|
-
@break = @xml['break'].nil? ||
|
|
133
|
+
@break = @xml['break'].then { |brk| brk.nil? || brk == 'yes' }
|
|
134
134
|
|
|
135
135
|
@before_break = xpath(:beforebreak).first&.text.then do |pattern|
|
|
136
136
|
IcuRegex.compile(pattern) if pattern
|
|
137
137
|
end
|
|
138
138
|
|
|
139
|
-
@after_break
|
|
139
|
+
@after_break = xpath(:afterbreak).first&.text.then do |pattern|
|
|
140
140
|
IcuRegex.compile(pattern) if pattern
|
|
141
141
|
end
|
|
142
142
|
end
|
|
@@ -155,7 +155,7 @@ module Srx
|
|
|
155
155
|
class LanguageMap < XmlWrapper
|
|
156
156
|
# @return [String]
|
|
157
157
|
def language_rule_name
|
|
158
|
-
@xml['languagerulename']
|
|
158
|
+
@language_rule_name ||= @xml['languagerulename']
|
|
159
159
|
end
|
|
160
160
|
|
|
161
161
|
# @return [Regexp]
|
data/lib/srx/engine.rb
CHANGED
|
@@ -7,7 +7,7 @@ module Srx
|
|
|
7
7
|
attr_reader :data
|
|
8
8
|
|
|
9
9
|
# @param data [Data]
|
|
10
|
-
# @param
|
|
10
|
+
# @param format [Symbol] see {Format#get}
|
|
11
11
|
def initialize(data, format: :text)
|
|
12
12
|
@data = data
|
|
13
13
|
@format = Format.get(format)
|
|
@@ -31,6 +31,8 @@ module Srx
|
|
|
31
31
|
results
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
+
private
|
|
35
|
+
|
|
34
36
|
# @param language [String]
|
|
35
37
|
# @return [Array<Data::Rule>]
|
|
36
38
|
def rules(language)
|
|
@@ -43,9 +45,11 @@ module Srx
|
|
|
43
45
|
names.flat_map { |name| rule_map[name].rules }
|
|
44
46
|
end
|
|
45
47
|
|
|
46
|
-
# @param language [String]
|
|
48
|
+
# @param language [String] nil treated as empty string
|
|
47
49
|
# @return [Array<String>]
|
|
48
50
|
def rule_names(language)
|
|
51
|
+
language ||= ''
|
|
52
|
+
|
|
49
53
|
@data.map_rules.map do |lang_map|
|
|
50
54
|
next unless lang_map.language_pattern.match?(language)
|
|
51
55
|
|
|
@@ -56,20 +60,19 @@ module Srx
|
|
|
56
60
|
end
|
|
57
61
|
|
|
58
62
|
# @param str [String]
|
|
59
|
-
# @param pos [Integer] the position to start searching from
|
|
60
63
|
# @param rules [Array<Data::LanguageRule::Rule>]
|
|
61
|
-
# @return [Array(Integer,Data::LanguageRule::Rule)] an array of
|
|
62
|
-
# position of a break, and 2) the rule that matched at that
|
|
63
|
-
# that the final break will always be at the end of the
|
|
64
|
-
# have an associated rule.
|
|
64
|
+
# @return [Array<Array(Integer,Data::LanguageRule::Rule)>] an array of pairs
|
|
65
|
+
# of 1) the position of a break, and 2) the rule that matched at that
|
|
66
|
+
# position. Note that the final break will always be at the end of the
|
|
67
|
+
# string and may not have an associated rule.
|
|
65
68
|
def breaks_by_pos(str, rules)
|
|
66
|
-
rules
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
69
|
+
grouped = rules.flat_map { |rule| all_matches(str, rule) }
|
|
70
|
+
.group_by(&:first)
|
|
71
|
+
grouped.transform_values! { |pairs| pairs.first.last }
|
|
72
|
+
grouped.select! { |_pos, rule| rule.break? }
|
|
73
|
+
result = grouped.sort_by(&:first)
|
|
74
|
+
result << [str.length] unless result.last&.first == str.length
|
|
75
|
+
result
|
|
73
76
|
end
|
|
74
77
|
|
|
75
78
|
# @param str [String]
|
data/lib/srx/format.rb
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require_relative 'format/base_format'
|
|
4
4
|
require_relative 'format/text'
|
|
5
5
|
require_relative 'format/xml'
|
|
6
|
+
require_relative 'format/html'
|
|
6
7
|
|
|
7
8
|
module Srx
|
|
8
9
|
# Format-specific data and logic
|
|
@@ -10,11 +11,11 @@ module Srx
|
|
|
10
11
|
FORMATS = {
|
|
11
12
|
text: Text.new,
|
|
12
13
|
xml: Xml.new,
|
|
13
|
-
html:
|
|
14
|
+
html: Html.new
|
|
14
15
|
}.freeze
|
|
15
16
|
|
|
16
17
|
class << self
|
|
17
|
-
# @param format [Symbol]
|
|
18
|
+
# @param format [Symbol] see keys of {FORMATS} for accepted values
|
|
18
19
|
# @return [BaseFormat]
|
|
19
20
|
def get(format)
|
|
20
21
|
raise(ArgumentError, "Unknown format: #{format}") unless FORMATS.key?(format)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
require 'English'
|
|
5
|
+
|
|
6
|
+
module Srx
|
|
7
|
+
module Format
|
|
8
|
+
# Support for HTML. Tag grammar based on XML.
|
|
9
|
+
#
|
|
10
|
+
# @see https://www.w3.org/TR/xml/
|
|
11
|
+
# @see https://html.spec.whatwg.org/multipage/syntax.html
|
|
12
|
+
class Html < Xml
|
|
13
|
+
# Differs from XML in supporting unquoted values
|
|
14
|
+
# @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
|
|
15
|
+
ATT_VALUE = /#{Xml::ATT_VALUE}|(?:[^<>&"'`=\u0020\u0009\u000D\u000A]|#{Xml::REFERENCE})+/.freeze
|
|
16
|
+
|
|
17
|
+
# Differs from XML in supporting empty attributes
|
|
18
|
+
# @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
|
|
19
|
+
ATTRIBUTE = /#{Xml::NAME}(?:#{Xml::EQUALS}#{ATT_VALUE})?/.freeze
|
|
20
|
+
|
|
21
|
+
START_TAG = /<(?<name>#{Xml::NAME})(?:#{Xml::SPACE}#{ATTRIBUTE})*#{Xml::SPACE}?>/.freeze
|
|
22
|
+
EMPTY_ELEM_TAG = %r{<#{Xml::NAME}(?:#{Xml::SPACE}#{ATTRIBUTE})*#{Xml::SPACE}?/>}.freeze
|
|
23
|
+
|
|
24
|
+
TAG = /#{START_TAG}|#{Xml::END_TAG}|#{EMPTY_ELEM_TAG}/.freeze
|
|
25
|
+
|
|
26
|
+
# A set of HTML tags that are "void elements", meaning they do not need a
|
|
27
|
+
# paired closing tag.
|
|
28
|
+
#
|
|
29
|
+
# @see https://html.spec.whatwg.org/#void-elements
|
|
30
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command
|
|
31
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/keygen
|
|
32
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/menuitem
|
|
33
|
+
VOID_ELEMENTS = Set[
|
|
34
|
+
'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
|
|
35
|
+
'link', 'meta', 'menuitem', 'param', 'source', 'track', 'wbr'
|
|
36
|
+
].freeze
|
|
37
|
+
|
|
38
|
+
def extract_markups(str)
|
|
39
|
+
extract_markups_by_pattern(str, TAG)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def start_formatting?(markup)
|
|
43
|
+
START_TAG.match(markup) do |m|
|
|
44
|
+
!VOID_ELEMENTS.include?(m['name'])
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def isolated_formatting?(markup)
|
|
49
|
+
return true if EMPTY_ELEM_TAG.match?(markup)
|
|
50
|
+
|
|
51
|
+
START_TAG.match(markup) do |m|
|
|
52
|
+
VOID_ELEMENTS.include?(m['name'])
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
data/lib/srx/format/xml.rb
CHANGED
|
@@ -27,14 +27,7 @@ module Srx
|
|
|
27
27
|
TAG = /#{START_TAG}|#{END_TAG}|#{EMPTY_ELEM_TAG}/.freeze
|
|
28
28
|
|
|
29
29
|
def extract_markups(str)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
plain_text = str.gsub(TAG) do |match|
|
|
33
|
-
markups << [$LAST_MATCH_INFO.begin(0), match]
|
|
34
|
-
''
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
[plain_text, markups]
|
|
30
|
+
extract_markups_by_pattern(str, TAG)
|
|
38
31
|
end
|
|
39
32
|
|
|
40
33
|
def start_formatting?(markup)
|
|
@@ -48,6 +41,21 @@ module Srx
|
|
|
48
41
|
def isolated_formatting?(markup)
|
|
49
42
|
EMPTY_ELEM_TAG.match?(markup)
|
|
50
43
|
end
|
|
44
|
+
|
|
45
|
+
protected
|
|
46
|
+
|
|
47
|
+
# @param str [String]
|
|
48
|
+
# @param pattern [Regexp]
|
|
49
|
+
def extract_markups_by_pattern(str, pattern)
|
|
50
|
+
markups = []
|
|
51
|
+
|
|
52
|
+
plain_text = str.gsub(pattern) do |match|
|
|
53
|
+
markups << [$LAST_MATCH_INFO.begin(0), match]
|
|
54
|
+
''
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
[plain_text, markups]
|
|
58
|
+
end
|
|
51
59
|
end
|
|
52
60
|
end
|
|
53
61
|
end
|
data/lib/srx/icu_regex.rb
CHANGED
|
@@ -3,13 +3,20 @@
|
|
|
3
3
|
module Srx
|
|
4
4
|
# Utilities for handling SRX (ICU) regular expressions
|
|
5
5
|
module IcuRegex
|
|
6
|
-
HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex
|
|
6
|
+
HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex>\{[a-f0-9]{1,6}\})/i.freeze
|
|
7
|
+
OCTAL_PATTERN = /(?<!\\)(?:\\\\)*\\0(?<oct>[0-7]{1,3})/i.freeze
|
|
7
8
|
|
|
8
9
|
class << self
|
|
9
10
|
# @param icu_regex [String]
|
|
10
11
|
# @return [String]
|
|
11
12
|
def to_ruby(icu_regex)
|
|
12
|
-
icu_regex.
|
|
13
|
+
result = icu_regex.dup
|
|
14
|
+
result.gsub!(HEX_PATTERN, '\u\k<hex>')
|
|
15
|
+
result.gsub!(OCTAL_PATTERN) do |m|
|
|
16
|
+
$LAST_MATCH_INFO['oct'].to_i(8).then { |o| o <= 255 ? format(%q(\u{%x}), o) : m }
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
result
|
|
13
20
|
end
|
|
14
21
|
|
|
15
22
|
# @param icu_regex [String]
|
data/lib/srx/srx-20-sample.srx
CHANGED
|
@@ -68,7 +68,7 @@
|
|
|
68
68
|
\xff01: Fullwidth exclamation mark
|
|
69
69
|
-->
|
|
70
70
|
<rule break="yes">
|
|
71
|
-
<beforebreak>[\
|
|
71
|
+
<beforebreak>[\x{ff61}\x{3002}\x{ff0e}\x{ff1f}\x{ff01}]+</beforebreak>
|
|
72
72
|
<afterbreak></afterbreak>
|
|
73
73
|
</rule>
|
|
74
74
|
</languagerule>
|
data/lib/srx/version.rb
CHANGED
data/srx.gemspec
CHANGED
|
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
|
|
|
15
15
|
|
|
16
16
|
spec.metadata['homepage_uri'] = spec.homepage
|
|
17
17
|
spec.metadata['source_code_uri'] = 'https://github.com/amake/srx-ruby.git'
|
|
18
|
+
spec.metadata['changelog_uri'] = 'https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md'
|
|
18
19
|
|
|
19
20
|
# Specify which files should be added to the gem when it is released.
|
|
20
21
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: srx
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Aaron Madlon-Kay
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2021-
|
|
11
|
+
date: 2021-04-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: nokogiri
|
|
@@ -135,6 +135,7 @@ files:
|
|
|
135
135
|
- ".rubocop.yml"
|
|
136
136
|
- ".rubocop_todo.yml"
|
|
137
137
|
- ".solargraph.yml"
|
|
138
|
+
- CHANGELOG.md
|
|
138
139
|
- Gemfile
|
|
139
140
|
- Gemfile.lock
|
|
140
141
|
- LICENSE.txt
|
|
@@ -150,6 +151,7 @@ files:
|
|
|
150
151
|
- lib/srx/engine.rb
|
|
151
152
|
- lib/srx/format.rb
|
|
152
153
|
- lib/srx/format/base_format.rb
|
|
154
|
+
- lib/srx/format/html.rb
|
|
153
155
|
- lib/srx/format/text.rb
|
|
154
156
|
- lib/srx/format/xml.rb
|
|
155
157
|
- lib/srx/icu_regex.rb
|
|
@@ -163,6 +165,7 @@ licenses:
|
|
|
163
165
|
metadata:
|
|
164
166
|
homepage_uri: https://github.com/amake/srx-ruby
|
|
165
167
|
source_code_uri: https://github.com/amake/srx-ruby.git
|
|
168
|
+
changelog_uri: https://github.com/amake/srx-ruby/blob/master/CHANGELOG.md
|
|
166
169
|
post_install_message:
|
|
167
170
|
rdoc_options: []
|
|
168
171
|
require_paths:
|