srx 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +6 -6
- data/CHANGELOG.md +7 -0
- data/Gemfile.lock +2 -2
- data/README.md +15 -1
- data/lib/srx/engine.rb +7 -5
- data/lib/srx/format.rb +1 -1
- data/lib/srx/icu_regex.rb +9 -2
- data/lib/srx/srx-20-sample.srx +1 -1
- data/lib/srx/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c28a49fd42454ec9968d3a0c1066af93c14ec4ebcd68fc5dccb43b24c0cf13c7
|
4
|
+
data.tar.gz: 268e0beffe2d9a2a9b189b9a5ce649cc026ed066d3dc168767db9d99ccdc174d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c7edc8f3fc9895e434b20551fc1dbccb82731e6500302375db3a3ffce730fe96a78163d662da637c1455449cb459abf751f616ba38e77d4594258611831b503
|
7
|
+
data.tar.gz: 2e0a72a250e180ba909ac2aa4c31c0ec85bca10c4cf40979910c9fb4123e47154402b7e554dbf62a7095e623a4ac6c29188f89d6a1ee2dfce9d8a7a1c978fa12
|
data/.rubocop_todo.yml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2021-02-
|
3
|
+
# on 2021-02-15 14:53:27 UTC using RuboCop version 1.9.1.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
-
# Offense count:
|
9
|
+
# Offense count: 6
|
10
10
|
# Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
|
11
11
|
Metrics/AbcSize:
|
12
|
-
Max:
|
12
|
+
Max: 26
|
13
13
|
|
14
14
|
# Offense count: 6
|
15
15
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
@@ -17,17 +17,17 @@ Metrics/AbcSize:
|
|
17
17
|
Metrics/BlockLength:
|
18
18
|
Max: 269
|
19
19
|
|
20
|
-
# Offense count:
|
20
|
+
# Offense count: 2
|
21
21
|
# Configuration parameters: IgnoredMethods.
|
22
22
|
Metrics/CyclomaticComplexity:
|
23
23
|
Max: 9
|
24
24
|
|
25
|
-
# Offense count:
|
25
|
+
# Offense count: 9
|
26
26
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
27
27
|
Metrics/MethodLength:
|
28
28
|
Max: 25
|
29
29
|
|
30
|
-
# Offense count:
|
30
|
+
# Offense count: 2
|
31
31
|
# Configuration parameters: IgnoredMethods.
|
32
32
|
Metrics/PerceivedComplexity:
|
33
33
|
Max: 10
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [0.3.0] - 2021-02-16
|
4
|
+
|
5
|
+
- All `Srx::Engine` methods except `#segment` are now private
|
6
|
+
- ICU regex syntax `\xhhhh` is now no longer converted to Ruby regex, as this
|
7
|
+
syntax was not correct; it now must be `\x{hhhh}`
|
8
|
+
- ICU regex syntax `\0ooo` is now supported
|
9
|
+
|
3
10
|
## [0.2.0] - 2021-02-13
|
4
11
|
|
5
12
|
- Handle HTML void elements correctly
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
srx (0.2.0)
|
4
|
+
srx (0.3.0)
|
5
5
|
nokogiri (~> 1.11)
|
6
6
|
|
7
7
|
GEM
|
@@ -36,7 +36,7 @@ GEM
|
|
36
36
|
diff-lcs (>= 1.2.0, < 2.0)
|
37
37
|
rspec-support (~> 3.10.0)
|
38
38
|
rspec-support (3.10.2)
|
39
|
-
rubocop (1.
|
39
|
+
rubocop (1.10.0)
|
40
40
|
parallel (~> 1.10)
|
41
41
|
parser (>= 3.0.0.0)
|
42
42
|
rainbow (>= 2.2.2, < 4.0)
|
data/README.md
CHANGED
@@ -48,6 +48,20 @@ Some disadvantages:
|
|
48
48
|
improve on that with better rules such as
|
49
49
|
[LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
|
50
50
|
|
51
|
+
## Caveats
|
52
|
+
|
53
|
+
The SRX spec calls for [ICU regular
|
54
|
+
expressions](https://unicode-org.github.io/icu/userguide/strings/regexp.html),
|
55
|
+
but this library uses standard [Ruby
|
56
|
+
regexp](https://ruby-doc.org/core-2.7.0/Regexp.html). Please note:
|
57
|
+
|
58
|
+
- Not all ICU syntax is supported
|
59
|
+
- For supported syntax, in some cases the meaning of a regex may differ when
|
60
|
+
interpreted as Ruby regexp
|
61
|
+
- The following ICU syntax is supported through translation to Ruby syntax:
|
62
|
+
- `\x{hhhh}` → `\u{hhhh}`
|
63
|
+
- `\0ooo` → `\u{hhhh}`
|
64
|
+
|
51
65
|
## Installation
|
52
66
|
|
53
67
|
Add this line to your application's Gemfile:
|
@@ -96,7 +110,7 @@ input = 'foo <bar baz="a. b."> bazinga'
|
|
96
110
|
Srx::Engine.new(Data.default).segment(input, language: 'en')
|
97
111
|
#=> ["foo <bar baz=\"a.", " b.\"> bazinga"]
|
98
112
|
|
99
|
-
Srx::Engine.new(
|
113
|
+
Srx::Engine.new(Data.default, format: :xml).segment(input, language: 'en')
|
100
114
|
#=> ["foo <bar baz=\"a. b.\"> bazinga"]
|
101
115
|
```
|
102
116
|
|
data/lib/srx/engine.rb
CHANGED
@@ -7,7 +7,7 @@ module Srx
|
|
7
7
|
attr_reader :data
|
8
8
|
|
9
9
|
# @param data [Data]
|
10
|
-
# @param
|
10
|
+
# @param format [Symbol] see {Format#get}
|
11
11
|
def initialize(data, format: :text)
|
12
12
|
@data = data
|
13
13
|
@format = Format.get(format)
|
@@ -31,6 +31,8 @@ module Srx
|
|
31
31
|
results
|
32
32
|
end
|
33
33
|
|
34
|
+
private
|
35
|
+
|
34
36
|
# @param language [String]
|
35
37
|
# @return [Array<Data::Rule>]
|
36
38
|
def rules(language)
|
@@ -58,10 +60,10 @@ module Srx
|
|
58
60
|
# @param str [String]
|
59
61
|
# @param pos [Integer] the position to start searching from
|
60
62
|
# @param rules [Array<Data::LanguageRule::Rule>]
|
61
|
-
# @return [Array(Integer,Data::LanguageRule::Rule)] an array of
|
62
|
-
# position of a break, and 2) the rule that matched at that
|
63
|
-
# that the final break will always be at the end of the
|
64
|
-
# have an associated rule.
|
63
|
+
# @return [Array<Array(Integer,Data::LanguageRule::Rule)>] an array of pairs
|
64
|
+
# of 1) the position of a break, and 2) the rule that matched at that
|
65
|
+
# position. Note that the final break will always be at the end of the
|
66
|
+
# string and may not have an associated rule.
|
65
67
|
def breaks_by_pos(str, rules)
|
66
68
|
rules
|
67
69
|
.flat_map { |rule| all_matches(str, rule) }
|
data/lib/srx/format.rb
CHANGED
@@ -15,7 +15,7 @@ module Srx
|
|
15
15
|
}.freeze
|
16
16
|
|
17
17
|
class << self
|
18
|
-
# @param format [Symbol]
|
18
|
+
# @param format [Symbol] see keys of {FORMATS} for accepted values
|
19
19
|
# @return [BaseFormat]
|
20
20
|
def get(format)
|
21
21
|
raise(ArgumentError, "Unknown format: #{format}") unless FORMATS.key?(format)
|
data/lib/srx/icu_regex.rb
CHANGED
@@ -3,13 +3,20 @@
|
|
3
3
|
module Srx
|
4
4
|
# Utilities for handling SRX (ICU) regular expressions
|
5
5
|
module IcuRegex
|
6
|
-
HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex
|
6
|
+
HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex>\{[a-f0-9]{1,6}\})/i.freeze
|
7
|
+
OCTAL_PATTERN = /(?<!\\)(?:\\\\)*\\0(?<oct>[0-7]{1,3})/i.freeze
|
7
8
|
|
8
9
|
class << self
|
9
10
|
# @param icu_regex [String]
|
10
11
|
# @return [String]
|
11
12
|
def to_ruby(icu_regex)
|
12
|
-
icu_regex.
|
13
|
+
result = icu_regex.dup
|
14
|
+
result.gsub!(HEX_PATTERN, '\u\k<hex>')
|
15
|
+
result.gsub!(OCTAL_PATTERN) do |m|
|
16
|
+
$LAST_MATCH_INFO['oct'].to_i(8).then { |o| o <= 255 ? format(%q(\u{%x}), o) : m }
|
17
|
+
end
|
18
|
+
|
19
|
+
result
|
13
20
|
end
|
14
21
|
|
15
22
|
# @param icu_regex [String]
|
data/lib/srx/srx-20-sample.srx
CHANGED
@@ -68,7 +68,7 @@
|
|
68
68
|
\xff01: Fullwidth exclamation mark
|
69
69
|
-->
|
70
70
|
<rule break="yes">
|
71
|
-
<beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
|
71
|
+
<beforebreak>[\x{ff61}\x{3002}\x{ff0e}\x{ff1f}\x{ff01}]+</beforebreak>
|
72
72
|
<afterbreak></afterbreak>
|
73
73
|
</rule>
|
74
74
|
</languagerule>
|
data/lib/srx/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: srx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-
|
11
|
+
date: 2021-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|