srx 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +6 -6
- data/CHANGELOG.md +7 -0
- data/Gemfile.lock +2 -2
- data/README.md +15 -1
- data/lib/srx/engine.rb +7 -5
- data/lib/srx/format.rb +1 -1
- data/lib/srx/icu_regex.rb +9 -2
- data/lib/srx/srx-20-sample.srx +1 -1
- data/lib/srx/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c28a49fd42454ec9968d3a0c1066af93c14ec4ebcd68fc5dccb43b24c0cf13c7
|
4
|
+
data.tar.gz: 268e0beffe2d9a2a9b189b9a5ce649cc026ed066d3dc168767db9d99ccdc174d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c7edc8f3fc9895e434b20551fc1dbccb82731e6500302375db3a3ffce730fe96a78163d662da637c1455449cb459abf751f616ba38e77d4594258611831b503
|
7
|
+
data.tar.gz: 2e0a72a250e180ba909ac2aa4c31c0ec85bca10c4cf40979910c9fb4123e47154402b7e554dbf62a7095e623a4ac6c29188f89d6a1ee2dfce9d8a7a1c978fa12
|
data/.rubocop_todo.yml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2021-02-
|
3
|
+
# on 2021-02-15 14:53:27 UTC using RuboCop version 1.9.1.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
-
# Offense count:
|
9
|
+
# Offense count: 6
|
10
10
|
# Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
|
11
11
|
Metrics/AbcSize:
|
12
|
-
Max:
|
12
|
+
Max: 26
|
13
13
|
|
14
14
|
# Offense count: 6
|
15
15
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
@@ -17,17 +17,17 @@ Metrics/AbcSize:
|
|
17
17
|
Metrics/BlockLength:
|
18
18
|
Max: 269
|
19
19
|
|
20
|
-
# Offense count:
|
20
|
+
# Offense count: 2
|
21
21
|
# Configuration parameters: IgnoredMethods.
|
22
22
|
Metrics/CyclomaticComplexity:
|
23
23
|
Max: 9
|
24
24
|
|
25
|
-
# Offense count:
|
25
|
+
# Offense count: 9
|
26
26
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
27
27
|
Metrics/MethodLength:
|
28
28
|
Max: 25
|
29
29
|
|
30
|
-
# Offense count:
|
30
|
+
# Offense count: 2
|
31
31
|
# Configuration parameters: IgnoredMethods.
|
32
32
|
Metrics/PerceivedComplexity:
|
33
33
|
Max: 10
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [0.3.0] - 2021-02-16
|
4
|
+
|
5
|
+
- All `Srx::Engine` methods except `#segment` are now private
|
6
|
+
- ICU regex syntax `\xhhhh` is now no longer converted to Ruby regex, as this
|
7
|
+
syntax was not correct; it now must be `\x{hhhh}`
|
8
|
+
- ICU regex syntax `\0ooo` is now supported
|
9
|
+
|
3
10
|
## [0.2.0] - 2021-02-13
|
4
11
|
|
5
12
|
- Handle HTML void elements correctly
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
srx (0.2.0)
|
4
|
+
srx (0.3.0)
|
5
5
|
nokogiri (~> 1.11)
|
6
6
|
|
7
7
|
GEM
|
@@ -36,7 +36,7 @@ GEM
|
|
36
36
|
diff-lcs (>= 1.2.0, < 2.0)
|
37
37
|
rspec-support (~> 3.10.0)
|
38
38
|
rspec-support (3.10.2)
|
39
|
-
rubocop (1.
|
39
|
+
rubocop (1.10.0)
|
40
40
|
parallel (~> 1.10)
|
41
41
|
parser (>= 3.0.0.0)
|
42
42
|
rainbow (>= 2.2.2, < 4.0)
|
data/README.md
CHANGED
@@ -48,6 +48,20 @@ Some disadvantages:
|
|
48
48
|
improve on that with better rules such as
|
49
49
|
[LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
|
50
50
|
|
51
|
+
## Caveats
|
52
|
+
|
53
|
+
The SRX spec calls for [ICU regular
|
54
|
+
expressions](https://unicode-org.github.io/icu/userguide/strings/regexp.html),
|
55
|
+
but this library uses standard [Ruby
|
56
|
+
regexp](https://ruby-doc.org/core-2.7.0/Regexp.html). Please note:
|
57
|
+
|
58
|
+
- Not all ICU syntax is supported
|
59
|
+
- For supported syntax, in some cases the meaning of a regex may differ when
|
60
|
+
interpreted as Ruby regexp
|
61
|
+
- The following ICU syntax is supported through translation to Ruby syntax:
|
62
|
+
- `\x{hhhh}` → `\u{hhhh}`
|
63
|
+
- `\0ooo` → `\u{hhhh}`
|
64
|
+
|
51
65
|
## Installation
|
52
66
|
|
53
67
|
Add this line to your application's Gemfile:
|
@@ -96,7 +110,7 @@ input = 'foo <bar baz="a. b."> bazinga'
|
|
96
110
|
Srx::Engine.new(Data.default).segment(input, language: 'en')
|
97
111
|
#=> ["foo <bar baz=\"a.", " b.\"> bazinga"]
|
98
112
|
|
99
|
-
Srx::Engine.new(
|
113
|
+
Srx::Engine.new(Data.default, format: :xml).segment(input, language: 'en')
|
100
114
|
#=> ["foo <bar baz=\"a. b.\"> bazinga"]
|
101
115
|
```
|
102
116
|
|
data/lib/srx/engine.rb
CHANGED
@@ -7,7 +7,7 @@ module Srx
|
|
7
7
|
attr_reader :data
|
8
8
|
|
9
9
|
# @param data [Data]
|
10
|
-
# @param
|
10
|
+
# @param format [Symbol] see {Format#get}
|
11
11
|
def initialize(data, format: :text)
|
12
12
|
@data = data
|
13
13
|
@format = Format.get(format)
|
@@ -31,6 +31,8 @@ module Srx
|
|
31
31
|
results
|
32
32
|
end
|
33
33
|
|
34
|
+
private
|
35
|
+
|
34
36
|
# @param language [String]
|
35
37
|
# @return [Array<Data::Rule>]
|
36
38
|
def rules(language)
|
@@ -58,10 +60,10 @@ module Srx
|
|
58
60
|
# @param str [String]
|
59
61
|
# @param pos [Integer] the position to start searching from
|
60
62
|
# @param rules [Array<Data::LanguageRule::Rule>]
|
61
|
-
# @return [Array(Integer,Data::LanguageRule::Rule)] an array of 1) the
|
62
|
-
# position of a break, and 2) the rule that matched at that position. Note
|
63
|
-
# that the final break will always be at the end of the string and may not
|
64
|
-
# have an associated rule.
|
63
|
+
# @return [Array<Array(Integer,Data::LanguageRule::Rule)>] an array of pairs
|
64
|
+
# of 1) the position of a break, and 2) the rule that matched at that
|
65
|
+
# position. Note that the final break will always be at the end of the
|
66
|
+
# string and may not have an associated rule.
|
65
67
|
def breaks_by_pos(str, rules)
|
66
68
|
rules
|
67
69
|
.flat_map { |rule| all_matches(str, rule) }
|
data/lib/srx/format.rb
CHANGED
@@ -15,7 +15,7 @@ module Srx
|
|
15
15
|
}.freeze
|
16
16
|
|
17
17
|
class << self
|
18
|
-
# @param format [Symbol]
|
18
|
+
# @param format [Symbol] see keys of {FORMATS} for accepted values
|
19
19
|
# @return [BaseFormat]
|
20
20
|
def get(format)
|
21
21
|
raise(ArgumentError, "Unknown format: #{format}") unless FORMATS.key?(format)
|
data/lib/srx/icu_regex.rb
CHANGED
@@ -3,13 +3,20 @@
|
|
3
3
|
module Srx
|
4
4
|
# Utilities for handling SRX (ICU) regular expressions
|
5
5
|
module IcuRegex
|
6
|
-
HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex
|
6
|
+
HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex>\{[a-f0-9]{1,6}\})/i.freeze
|
7
|
+
OCTAL_PATTERN = /(?<!\\)(?:\\\\)*\\0(?<oct>[0-7]{1,3})/i.freeze
|
7
8
|
|
8
9
|
class << self
|
9
10
|
# @param icu_regex [String]
|
10
11
|
# @return [String]
|
11
12
|
def to_ruby(icu_regex)
|
12
|
-
icu_regex.
|
13
|
+
result = icu_regex.dup
|
14
|
+
result.gsub!(HEX_PATTERN, '\u\k<hex>')
|
15
|
+
result.gsub!(OCTAL_PATTERN) do |m|
|
16
|
+
$LAST_MATCH_INFO['oct'].to_i(8).then { |o| o <= 255 ? format(%q(\u{%x}), o) : m }
|
17
|
+
end
|
18
|
+
|
19
|
+
result
|
13
20
|
end
|
14
21
|
|
15
22
|
# @param icu_regex [String]
|
data/lib/srx/srx-20-sample.srx
CHANGED
@@ -68,7 +68,7 @@
|
|
68
68
|
\xff01: Fullwidth exclamation mark
|
69
69
|
-->
|
70
70
|
<rule break="yes">
|
71
|
-
<beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
|
71
|
+
<beforebreak>[\x{ff61}\x{3002}\x{ff0e}\x{ff1f}\x{ff01}]+</beforebreak>
|
72
72
|
<afterbreak></afterbreak>
|
73
73
|
</rule>
|
74
74
|
</languagerule>
|
data/lib/srx/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: srx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-13 00:00:00.000000000 Z
|
11
|
+
date: 2021-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|