pragmatic_segmenter 0.3.23 → 0.3.24
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS +6 -1
- data/README.md +4 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +1 -1
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/lib/unicode.rb +5 -0
- data/pragmatic_segmenter.gemspec +0 -1
- metadata +4 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 51ae71a6650fcd15671ac767d26ebe1315a9ea655d8fbf6e29ef9e4fa668fc93
|
4
|
+
data.tar.gz: 786246dc9e80872b423013fed2d69e0cba48cc7a7d5a693a3165b4cdf61fe00d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a830c5787a3818bc274b69aabd82bf5f837ba76c43921970c26a59f229d69bb24b698ff27389056ed6c536216edefdf4fa12338affbe883929b492065554af4c
|
7
|
+
data.tar.gz: f86cd6a66eaeb1890b5ddb2316d5ede734061b78212a490f8092bd20845cdb4dd47fac374972244785170ae266af21c566cbf59dd1a5667151ccd651269b72d8
|
data/NEWS
CHANGED
data/README.md
CHANGED
@@ -890,6 +890,10 @@ To test the relative performance of different segmentation tools and libraries I
|
|
890
890
|
**Version 0.3.23**
|
891
891
|
* Refactor for Ruby 3.0 compatibility
|
892
892
|
|
893
|
+
**Version 0.3.24**
|
894
|
+
* Fix catastrophic backtracking in regular expression for numerical references
|
895
|
+
* Remove unicode dependency
|
896
|
+
|
893
897
|
## Contributing
|
894
898
|
|
895
899
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -47,7 +47,7 @@ module PragmaticSegmenter
|
|
47
47
|
# Rubular: http://rubular.com/r/mQ8Es9bxtk
|
48
48
|
CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
|
49
49
|
|
50
|
-
NUMBERED_REFERENCE_REGEX = /(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)
|
50
|
+
NUMBERED_REFERENCE_REGEX = /(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)?\b\d{1,3}\])+|((\d{1,3}\s?){0,3}\d{1,3}))(\s)(?=[A-Z])/
|
51
51
|
|
52
52
|
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
53
53
|
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
data/lib/unicode.rb
ADDED
data/pragmatic_segmenter.gemspec
CHANGED
@@ -18,7 +18,6 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_runtime_dependency "unicode"
|
22
21
|
spec.add_development_dependency "bundler", ">= 1.7"
|
23
22
|
spec.add_development_dependency "rake", ">= 12.3.3"
|
24
23
|
spec.add_development_dependency "rspec"
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.23
|
4
|
+
version: 0.3.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: unicode
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: bundler
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,6 +122,7 @@ files:
|
|
136
122
|
- lib/pragmatic_segmenter/segmenter.rb
|
137
123
|
- lib/pragmatic_segmenter/types.rb
|
138
124
|
- lib/pragmatic_segmenter/version.rb
|
125
|
+
- lib/unicode.rb
|
139
126
|
- pragmatic_segmenter.gemspec
|
140
127
|
- spec/performance_spec.rb
|
141
128
|
- spec/pragmatic_segmenter/languages/amharic_spec.rb
|
@@ -181,8 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
181
168
|
- !ruby/object:Gem::Version
|
182
169
|
version: '0'
|
183
170
|
requirements: []
|
184
|
-
|
185
|
-
rubygems_version: 2.7.6
|
171
|
+
rubygems_version: 3.3.26
|
186
172
|
signing_key:
|
187
173
|
specification_version: 4
|
188
174
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|