srx 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 248fc2b9cd220023564c26a792536b1593b2bdefe77b71e133805658bd169d13
4
- data.tar.gz: 7aa58a5e7fa2255219aaf83f7df1b09ebde3711a59c84d0cce7b3811af6f869e
3
+ metadata.gz: c28a49fd42454ec9968d3a0c1066af93c14ec4ebcd68fc5dccb43b24c0cf13c7
4
+ data.tar.gz: 268e0beffe2d9a2a9b189b9a5ce649cc026ed066d3dc168767db9d99ccdc174d
5
5
  SHA512:
6
- metadata.gz: 0af9311397924bf58131fd753e879f63f32c3706998f5cbb9c3c06ab51cb626a2417d31cd52dba2d905b571d5489efc1ad088a334f7b868fdd66e8e82a64f547
7
- data.tar.gz: 969da7b926a664411a887948f17ff011d8e4040ef970e71d2ecde92ff05e39039aeb2a51238326be83418fda7bdf279241145c024cf41bccc8d4d515027cc039
6
+ metadata.gz: 6c7edc8f3fc9895e434b20551fc1dbccb82731e6500302375db3a3ffce730fe96a78163d662da637c1455449cb459abf751f616ba38e77d4594258611831b503
7
+ data.tar.gz: 2e0a72a250e180ba909ac2aa4c31c0ec85bca10c4cf40979910c9fb4123e47154402b7e554dbf62a7095e623a4ac6c29188f89d6a1ee2dfce9d8a7a1c978fa12
data/.rubocop_todo.yml CHANGED
@@ -1,15 +1,15 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2021-02-08 14:52:03 UTC using RuboCop version 1.9.1.
3
+ # on 2021-02-15 14:53:27 UTC using RuboCop version 1.9.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 3
9
+ # Offense count: 6
10
10
  # Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
11
11
  Metrics/AbcSize:
12
- Max: 24
12
+ Max: 26
13
13
 
14
14
  # Offense count: 6
15
15
  # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
@@ -17,17 +17,17 @@ Metrics/AbcSize:
17
17
  Metrics/BlockLength:
18
18
  Max: 269
19
19
 
20
- # Offense count: 1
20
+ # Offense count: 2
21
21
  # Configuration parameters: IgnoredMethods.
22
22
  Metrics/CyclomaticComplexity:
23
23
  Max: 9
24
24
 
25
- # Offense count: 3
25
+ # Offense count: 9
26
26
  # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
27
27
  Metrics/MethodLength:
28
28
  Max: 25
29
29
 
30
- # Offense count: 1
30
+ # Offense count: 2
31
31
  # Configuration parameters: IgnoredMethods.
32
32
  Metrics/PerceivedComplexity:
33
33
  Max: 10
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.3.0] - 2021-02-16
4
+
5
+ - All `Srx::Engine` methods except `#segment` are now private
6
+ - ICU regex syntax `\xhhhh` is no longer converted to Ruby regex, as this
7
+ syntax was not correct; it must now be written as `\x{hhhh}`
8
+ - ICU regex syntax `\0ooo` is now supported
9
+
3
10
  ## [0.2.0] - 2021-02-13
4
11
 
5
12
  - Handle HTML void elements correctly
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- srx (0.2.0)
4
+ srx (0.3.0)
5
5
  nokogiri (~> 1.11)
6
6
 
7
7
  GEM
@@ -36,7 +36,7 @@ GEM
36
36
  diff-lcs (>= 1.2.0, < 2.0)
37
37
  rspec-support (~> 3.10.0)
38
38
  rspec-support (3.10.2)
39
- rubocop (1.9.1)
39
+ rubocop (1.10.0)
40
40
  parallel (~> 1.10)
41
41
  parser (>= 3.0.0.0)
42
42
  rainbow (>= 2.2.2, < 4.0)
data/README.md CHANGED
@@ -48,6 +48,20 @@ Some disadvantages:
48
48
  improve on that with better rules such as
49
49
  [LanguageTool's](https://github.com/amake/srx-languagetool-ruby).
50
50
 
51
+ ## Caveats
52
+
53
+ The SRX spec calls for [ICU regular
54
+ expressions](https://unicode-org.github.io/icu/userguide/strings/regexp.html),
55
+ but this library uses standard [Ruby
56
+ regexp](https://ruby-doc.org/core-2.7.0/Regexp.html). Please note:
57
+
58
+ - Not all ICU syntax is supported
59
+ - For supported syntax, in some cases the meaning of a regex may differ when
60
+ interpreted as Ruby regexp
61
+ - The following ICU syntax is supported through translation to Ruby syntax:
62
+ - `\x{hhhh}` → `\u{hhhh}`
63
+ - `\0ooo` → `\u{hhhh}`
64
+
51
65
  ## Installation
52
66
 
53
67
  Add this line to your application's Gemfile:
@@ -96,7 +110,7 @@ input = 'foo <bar baz="a. b."> bazinga'
96
110
  Srx::Engine.new(Data.default).segment(input, language: 'en')
97
111
  #=> ["foo <bar baz=\"a.", " b.\"> bazinga"]
98
112
 
99
- Srx::Engine.new(data, format: :xml).segment(input, language: 'en')
113
+ Srx::Engine.new(Data.default, format: :xml).segment(input, language: 'en')
100
114
  #=> ["foo <bar baz=\"a. b.\"> bazinga"]
101
115
  ```
102
116
 
data/lib/srx/engine.rb CHANGED
@@ -7,7 +7,7 @@ module Srx
7
7
  attr_reader :data
8
8
 
9
9
  # @param data [Data]
10
- # @param markup [Regexp]
10
+ # @param format [Symbol] see {Format#get}
11
11
  def initialize(data, format: :text)
12
12
  @data = data
13
13
  @format = Format.get(format)
@@ -31,6 +31,8 @@ module Srx
31
31
  results
32
32
  end
33
33
 
34
+ private
35
+
34
36
  # @param language [String]
35
37
  # @return [Array<Data::Rule>]
36
38
  def rules(language)
@@ -58,10 +60,10 @@ module Srx
58
60
  # @param str [String]
59
61
  # @param pos [Integer] the position to start searching from
60
62
  # @param rules [Array<Data::LanguageRule::Rule>]
61
- # @return [Array(Integer,Data::LanguageRule::Rule)] an array of 1) the
62
- # position of a break, and 2) the rule that matched at that position. Note
63
- # that the final break will always be at the end of the string and may not
64
- # have an associated rule.
63
+ # @return [Array<Array(Integer,Data::LanguageRule::Rule)>] an array of pairs
64
+ # of 1) the position of a break, and 2) the rule that matched at that
65
+ # position. Note that the final break will always be at the end of the
66
+ # string and may not have an associated rule.
65
67
  def breaks_by_pos(str, rules)
66
68
  rules
67
69
  .flat_map { |rule| all_matches(str, rule) }
data/lib/srx/format.rb CHANGED
@@ -15,7 +15,7 @@ module Srx
15
15
  }.freeze
16
16
 
17
17
  class << self
18
- # @param format [Symbol]
18
+ # @param format [Symbol] see keys of {FORMATS} for accepted values
19
19
  # @return [BaseFormat]
20
20
  def get(format)
21
21
  raise(ArgumentError, "Unknown format: #{format}") unless FORMATS.key?(format)
data/lib/srx/icu_regex.rb CHANGED
@@ -3,13 +3,20 @@
3
3
  module Srx
4
4
  # Utilities for handling SRX (ICU) regular expressions
5
5
  module IcuRegex
6
- HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex>[a-f0-9]{4}|\{[a-f0-9]{4}\})/i.freeze
6
+ HEX_PATTERN = /(?<!\\)(?:\\\\)*\\x(?<hex>\{[a-f0-9]{1,6}\})/i.freeze
7
+ OCTAL_PATTERN = /(?<!\\)(?:\\\\)*\\0(?<oct>[0-7]{1,3})/i.freeze
7
8
 
8
9
  class << self
9
10
  # @param icu_regex [String]
10
11
  # @return [String]
11
12
  def to_ruby(icu_regex)
12
- icu_regex.gsub(HEX_PATTERN, '\u\k<hex>')
13
+ result = icu_regex.dup
14
+ result.gsub!(HEX_PATTERN, '\u\k<hex>')
15
+ result.gsub!(OCTAL_PATTERN) do |m|
16
+ $LAST_MATCH_INFO['oct'].to_i(8).then { |o| o <= 255 ? format(%q(\u{%x}), o) : m }
17
+ end
18
+
19
+ result
13
20
  end
14
21
 
15
22
  # @param icu_regex [String]
@@ -68,7 +68,7 @@
68
68
  \xff01: Fullwidth exclamation mark
69
69
  -->
70
70
  <rule break="yes">
71
- <beforebreak>[\xff61\x3002\xff0e\xff1f\xff01]+</beforebreak>
71
+ <beforebreak>[\x{ff61}\x{3002}\x{ff0e}\x{ff1f}\x{ff01}]+</beforebreak>
72
72
  <afterbreak></afterbreak>
73
73
  </rule>
74
74
  </languagerule>
data/lib/srx/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Srx
4
- VERSION = '0.2.0'
4
+ VERSION = '0.3.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: srx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-02-13 00:00:00.000000000 Z
11
+ date: 2021-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri