srx 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 248fc2b9cd220023564c26a792536b1593b2bdefe77b71e133805658bd169d13
4
- data.tar.gz: 7aa58a5e7fa2255219aaf83f7df1b09ebde3711a59c84d0cce7b3811af6f869e
3
+ metadata.gz: c28a49fd42454ec9968d3a0c1066af93c14ec4ebcd68fc5dccb43b24c0cf13c7
4
+ data.tar.gz: 268e0beffe2d9a2a9b189b9a5ce649cc026ed066d3dc168767db9d99ccdc174d
5
5
  SHA512:
6
- metadata.gz: 0af9311397924bf58131fd753e879f63f32c3706998f5cbb9c3c06ab51cb626a2417d31cd52dba2d905b571d5489efc1ad088a334f7b868fdd66e8e82a64f547
7
- data.tar.gz: 969da7b926a664411a887948f17ff011d8e4040ef970e71d2ecde92ff05e39039aeb2a51238326be83418fda7bdf279241145c024cf41bccc8d4d515027cc039
6
+ metadata.gz: 6c7edc8f3fc9895e434b20551fc1dbccb82731e6500302375db3a3ffce730fe96a78163d662da637c1455449cb459abf751f616ba38e77d4594258611831b503
7
+ data.tar.gz: 2e0a72a250e180ba909ac2aa4c31c0ec85bca10c4cf40979910c9fb4123e47154402b7e554dbf62a7095e623a4ac6c29188f89d6a1ee2dfce9d8a7a1c978fa12
data/.rubocop_todo.yml CHANGED
@@ -1,15 +1,15 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2021-02-08 14:52:03 UTC using RuboCop version 1.9.1.
3
+ # on 2021-02-15 14:53:27 UTC using RuboCop version 1.9.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 3
9
+ # Offense count: 6
10
10
  # Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
11
11
  Metrics/AbcSize:
12
- Max: 24
12
+ Max: 26
13
13
 
14
14
  # Offense count: 6
15
15
  # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
@@ -17,17 +17,17 @@ Metrics/AbcSize:
17
17
  Metrics/BlockLength:
18
18
  Max: 269
19
19
 
20
- # Offense count: 1
20
+ # Offense count: 2
21
21
  # Configuration parameters: IgnoredMethods.
22
22
  Metrics/CyclomaticComplexity:
23
23
  Max: 9
24
24
 
25
- # Offense count: 3
25
+ # Offense count: 9
26
26
  # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
27
27
  Metrics/MethodLength:
28
28
  Max: 25
29
29
 
30
- # Offense count: 1
30
+ # Offense count: 2
31
31
  # Configuration parameters: IgnoredMethods.
32
32
  Metrics/PerceivedComplexity:
33
33
  Max: 10
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.3.0] - 2021-02-16
4
+
5
+ - All `Srx::Engine` methods except `#segment` are now private
6
+ - ICU regex syntax `\xhhhh` is no longer converted to Ruby regex, as this
7
+ syntax was not correct; it must now be written as `\x{hhhh}`
8
+ - ICU regex syntax `\0ooo` is now supported
9
+
3
10
  ## [0.2.0] - 2021-02-13
4
11
 
5
12
  - Handle HTML void elements correctly
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- srx (0.2.0)
4
+ srx (0.3.0)
5
5
  nokogiri (~> 1.11)
6
6
 
7
7
  GEM
@@ -36,7 +36,7 @@ GEM
36
36
  diff-lcs (>= 1.2.0, < 2.0)
37
37
  rspec-support (~> 3.10.0)
38
38
  rspec-support (3.10.2)
39
- rubocop (1.9.1)
39
+ rubocop (1.10.0)
40
40
  parallel (~> 1.10)
41
41
  parser (>= 3.0.0.0)
42
42
  rainbow (>= 2.2.2, < 4.0)
data/README.md CHANGED
@@ -48,6 +48,20 @@ Some disadvantages:
48
48
  improve on that with better rules such as
49
49
  [LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
50
50
 
51
+ ## Caveats
52
+
53
+ The SRX spec calls for [ICU regular
54
+ expressions](https://unicode-org.github.io/icu/userguide/strings/regexp.html),
55
+ but this library uses standard [Ruby
56
+ regexp](https://ruby-doc.org/core-2.7.0/Regexp.html). Please note:
57
+
58
+ - Not all ICU syntax is supported
59
+ - For supported syntax, in some cases the meaning of a regex may differ when
60
+ interpreted as Ruby regexp
61
+ - The following ICU syntax is supported through translation to Ruby syntax:
62
+ - `\x{hhhh}` → `\u{hhhh}`
63
+ - `\0ooo` → `\u{hhhh}` (octal values up to `\0377`; larger values are left unchanged)
64
+
51
65
  ## Installation
52
66
 
53
67
  Add this line to your application's Gemfile:
@@ -96,7 +110,7 @@ input = 'foo <bar baz="a. b."> bazinga'
96
110
  Srx::Engine.new(Data.default).segment(input, language: 'en')
97
111
  #=> ["foo <bar baz=\"a.", " b.\"> bazinga"]
98
112
 
99
- Srx::Engine.new(data, format: :xml).segment(input, language: 'en')
113
+ Srx::Engine.new(Data.default, format: :xml).segment(input, language: 'en')
100
114
  #=> ["foo <bar baz=\"a. b.\"> bazinga"]
101
115
  ```
102
116
 
data/lib/srx/engine.rb CHANGED
@@ -7,7 +7,7 @@ module Srx
7
7
  attr_reader :data
8
8
 
9
9
  # @param data [Data]
10
- # @param markup [Regexp]
10
+ # @param format [Symbol] see {Format#get}
11
11
  def initialize(data, format: :text)
12
12
  @data = data
13
13
  @format = Format.get(format)
@@ -31,6 +31,8 @@ module Srx
31
31
  results
32
32
  end
33
33
 
34
+ private
35
+
34
36
  # @param language [String]
35
37
  # @return [Array<Data::Rule>]
36
38
  def rules(language)
@@ -58,10 +60,10 @@ module Srx
58
60
  # @param str [String]
59
61
  # @param pos [Integer] the position to start searching from
60
62
  # @param rules [Array<Data::LanguageRule::Rule>]
61
- # @return [Array(Integer,Data::LanguageRule::Rule)] an array of 1) the
62
- # position of a break, and 2) the rule that matched at that position. Note
63
- # that the final break will always be at the end of the string and may not
64
- # have an associated rule.
63
+ # @return [Array<Array(Integer,Data::LanguageRule::Rule)>] an array of pairs
64
+ # of 1) the position of a break, and 2) the rule that matched at that
65
+ # position. Note that the final break will always be at the end of the
66
+ # string and may not have an associated rule.
65
67
  def breaks_by_pos(str, rules)
66
68
  rules
67
69
  .flat_map { |rule| all_matches(str, rule) }
data/lib/srx/format.rb CHANGED
@@ -15,7 +15,7 @@ module Srx
15
15
  }.freeze
16
16
 
17
17
  class << self
18
- # @param format [Symbol]
18
+ # @param format [Symbol] see keys of {FORMATS} for accepted values
19
19
  # @return [BaseFormat]
20
20
  def get(format)
21
21
  raise(ArgumentError, "Unknown format: #{format}") unless FORMATS.key?(format)
data/lib/srx/icu_regex.rb CHANGED
@@ -3,13 +3,20 @@
3
3
  module Srx
4
4
  # Utilities for handling SRX (ICU) regular expressions
5
5
  module IcuRegex
6
- HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex>[a-f0-9]{4}|\{[a-f0-9]{4}\})/i.freeze
6
+ HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex>\{[a-f0-9]{1,6}\})/i.freeze
7
+ OCTAL_PATTERN = /(?<!\\)(?:\\\\)*\\0(?<oct>[0-7]{1,3})/i.freeze
7
8
 
8
9
  class << self
9
10
  # @param icu_regex [String]
10
11
  # @return [String]
11
12
  def to_ruby(icu_regex)
12
- icu_regex.gsub(HEX_PATTERN, '\u\k<hex>')
13
+ result = icu_regex.dup
14
+ result.gsub!(HEX_PATTERN, '\u\k<hex>')
15
+ result.gsub!(OCTAL_PATTERN) do |m|
16
+ $LAST_MATCH_INFO['oct'].to_i(8).then { |o| o <= 255 ? format(%q(\u{%x}), o) : m }
17
+ end
18
+
19
+ result
13
20
  end
14
21
 
15
22
  # @param icu_regex [String]
@@ -68,7 +68,7 @@
68
68
  \xff01: Fullwidth exclamation mark
69
69
  -->
70
70
  <rule break="yes">
71
- <beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
71
+ <beforebreak>[\x{ff61}\x{3002}\x{ff0e}\x{ff1f}\x{ff01}]+</beforebreak>
72
72
  <afterbreak></afterbreak>
73
73
  </rule>
74
74
  </languagerule>
data/lib/srx/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Srx
4
- VERSION = '0.2.0'
4
+ VERSION = '0.3.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: srx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-02-13 00:00:00.000000000 Z
11
+ date: 2021-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri