srx 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.dir-locals.el +4 -0
- data/.github/workflows/main.yml +21 -0
- data/.gitignore +11 -0
- data/.rubocop.yml +13 -0
- data/.rubocop_todo.yml +33 -0
- data/.solargraph.yml +17 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +84 -0
- data/LICENSE.txt +21 -0
- data/README.md +120 -0
- data/Rakefile +16 -0
- data/bin/benchmark +94 -0
- data/bin/console +15 -0
- data/bin/profile +33 -0
- data/bin/segment +28 -0
- data/bin/setup +8 -0
- data/lib/srx.rb +13 -0
- data/lib/srx/data.rb +169 -0
- data/lib/srx/engine.rb +136 -0
- data/lib/srx/format.rb +26 -0
- data/lib/srx/format/base_format.rb +38 -0
- data/lib/srx/format/text.rb +12 -0
- data/lib/srx/format/xml.rb +53 -0
- data/lib/srx/icu_regex.rb +22 -0
- data/lib/srx/srx-20-sample.srx +86 -0
- data/lib/srx/util.rb +16 -0
- data/lib/srx/version.rb +5 -0
- data/srx.gemspec +37 -0
- metadata +185 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 18c2fc1da5f4a792393a9dd4955c79d73bc9d00943000f38689a5763d8c8ff4c
|
4
|
+
data.tar.gz: 2385b9f65e61ad291e1419e217b737653bbeea0d4f62dc7fbb4ae35e6948984c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 915b837e6a239f7de688d51ca3fca2bbdcf898234f825cbd49894223e8c31f71167d6a22d9097a36ab75ea3fc8f383ae790d3df8e0e3fce2fb8210022412fad2
|
7
|
+
data.tar.gz: 326d3de2c328aa49be07c7df62229fe40c8cf3b2c4101f090b9c805b9e3a2fcd1c8eec155b6b3af9c556ff5c548d5263425975bdc9b0368b334342ce47dd71d9
|
data/.dir-locals.el
ADDED
data/.github/workflows/main.yml
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
name: Ruby
|
2
|
+
|
3
|
+
on: [push,pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
steps:
|
9
|
+
- uses: actions/checkout@v2
|
10
|
+
- name: Set up Ruby
|
11
|
+
uses: ruby/setup-ruby@v1
|
12
|
+
with:
|
13
|
+
ruby-version: 2.7.2
|
14
|
+
- name: Install
|
15
|
+
run: |
|
16
|
+
gem install bundler -v 2.2.7
|
17
|
+
bundle install
|
18
|
+
- name: Type check
|
19
|
+
run: bundle exec solargraph typecheck --level typed
|
20
|
+
- name: Unit tests
|
21
|
+
run: bundle exec rake
|
data/.gitignore
ADDED
data/.rubocop.yml
ADDED
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2021-02-08 14:52:03 UTC using RuboCop version 1.9.1.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 3
|
10
|
+
# Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
|
11
|
+
Metrics/AbcSize:
|
12
|
+
Max: 24
|
13
|
+
|
14
|
+
# Offense count: 6
|
15
|
+
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
16
|
+
# IgnoredMethods: refine
|
17
|
+
Metrics/BlockLength:
|
18
|
+
Max: 269
|
19
|
+
|
20
|
+
# Offense count: 1
|
21
|
+
# Configuration parameters: IgnoredMethods.
|
22
|
+
Metrics/CyclomaticComplexity:
|
23
|
+
Max: 9
|
24
|
+
|
25
|
+
# Offense count: 3
|
26
|
+
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
27
|
+
Metrics/MethodLength:
|
28
|
+
Max: 25
|
29
|
+
|
30
|
+
# Offense count: 1
|
31
|
+
# Configuration parameters: IgnoredMethods.
|
32
|
+
Metrics/PerceivedComplexity:
|
33
|
+
Max: 10
|
data/.solargraph.yml
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
---
|
2
|
+
include:
|
3
|
+
- "**/*.rb"
|
4
|
+
exclude:
|
5
|
+
- spec/**/*
|
6
|
+
- test/**/*
|
7
|
+
- vendor/**/*
|
8
|
+
- ".bundle/**/*"
|
9
|
+
require: []
|
10
|
+
domains: []
|
11
|
+
reporters:
|
12
|
+
- rubocop
|
13
|
+
- require_not_found
|
14
|
+
- typecheck:typed
|
15
|
+
require_paths: []
|
16
|
+
plugins: []
|
17
|
+
max_files: 5000
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
srx (0.1.0)
|
5
|
+
nokogiri (~> 1.11)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: https://rubygems.org/
|
9
|
+
specs:
|
10
|
+
ast (2.4.2)
|
11
|
+
backport (1.1.2)
|
12
|
+
benchmark (0.1.1)
|
13
|
+
byebug (11.1.3)
|
14
|
+
diff-lcs (1.4.4)
|
15
|
+
e2mmap (0.1.0)
|
16
|
+
jaro_winkler (1.5.4)
|
17
|
+
kramdown (2.3.0)
|
18
|
+
rexml
|
19
|
+
kramdown-parser-gfm (1.1.0)
|
20
|
+
kramdown (~> 2.0)
|
21
|
+
memory_profiler (1.0.0)
|
22
|
+
minitest (5.14.3)
|
23
|
+
nokogiri (1.11.1-x86_64-darwin)
|
24
|
+
racc (~> 1.4)
|
25
|
+
parallel (1.20.1)
|
26
|
+
parser (3.0.0.0)
|
27
|
+
ast (~> 2.4.1)
|
28
|
+
racc (1.5.2)
|
29
|
+
rainbow (3.0.0)
|
30
|
+
rake (13.0.3)
|
31
|
+
regexp_parser (2.0.3)
|
32
|
+
reverse_markdown (2.0.0)
|
33
|
+
nokogiri
|
34
|
+
rexml (3.2.4)
|
35
|
+
rspec-expectations (3.10.1)
|
36
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
37
|
+
rspec-support (~> 3.10.0)
|
38
|
+
rspec-support (3.10.2)
|
39
|
+
rubocop (1.9.1)
|
40
|
+
parallel (~> 1.10)
|
41
|
+
parser (>= 3.0.0.0)
|
42
|
+
rainbow (>= 2.2.2, < 4.0)
|
43
|
+
regexp_parser (>= 1.8, < 3.0)
|
44
|
+
rexml
|
45
|
+
rubocop-ast (>= 1.2.0, < 2.0)
|
46
|
+
ruby-progressbar (~> 1.7)
|
47
|
+
unicode-display_width (>= 1.4.0, < 3.0)
|
48
|
+
rubocop-ast (1.4.1)
|
49
|
+
parser (>= 2.7.1.5)
|
50
|
+
ruby-progressbar (1.11.0)
|
51
|
+
solargraph (0.40.3)
|
52
|
+
backport (~> 1.1)
|
53
|
+
benchmark
|
54
|
+
bundler (>= 1.17.2)
|
55
|
+
e2mmap
|
56
|
+
jaro_winkler (~> 1.5)
|
57
|
+
kramdown (~> 2.3)
|
58
|
+
kramdown-parser-gfm (~> 1.1)
|
59
|
+
parser (~> 3.0)
|
60
|
+
reverse_markdown (>= 1.0.5, < 3)
|
61
|
+
rubocop (>= 0.52)
|
62
|
+
thor (~> 1.0)
|
63
|
+
tilt (~> 2.0)
|
64
|
+
yard (~> 0.9, >= 0.9.24)
|
65
|
+
thor (1.1.0)
|
66
|
+
tilt (2.0.10)
|
67
|
+
unicode-display_width (2.0.0)
|
68
|
+
yard (0.9.26)
|
69
|
+
|
70
|
+
PLATFORMS
|
71
|
+
x86_64-darwin-20
|
72
|
+
|
73
|
+
DEPENDENCIES
|
74
|
+
byebug
|
75
|
+
memory_profiler
|
76
|
+
minitest
|
77
|
+
rake
|
78
|
+
rspec-expectations
|
79
|
+
rubocop
|
80
|
+
solargraph
|
81
|
+
srx!
|
82
|
+
|
83
|
+
BUNDLED WITH
|
84
|
+
2.2.7
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2021 Aaron Madlon-Kay
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# SRX for Ruby
|
2
|
+
|
3
|
+
SRX is a specification for segmenting text, i.e. splitting text into sentences.
|
4
|
+
More specifically it is
|
5
|
+
|
6
|
+
- An XML-based format for specifying segmentation rules, and
|
7
|
+
- An algorithm by which the rules are applied
|
8
|
+
|
9
|
+
See the [SRX 2.0 Specification](http://www.ttt.org/oscarStandards/srx/srx20.html)
|
10
|
+
for full details.
|
11
|
+
|
12
|
+
This gem provides facilities for reading SRX files and an engine for performing
|
13
|
+
segmentation.
|
14
|
+
|
15
|
+
Only a minimal rule set is supplied by default; for actual usage you are
|
16
|
+
encouraged to supply your own SRX rules.
|
17
|
+
|
18
|
+
## What's different about this gem?
|
19
|
+
|
20
|
+
There are lots of good segmentation gems out there such as
|
21
|
+
|
22
|
+
- [pragmatic_segmenter](https://github.com/diasks2/pragmatic_segmenter)
|
23
|
+
- [TactfulTokenizer](https://github.com/zencephalon/Tactful_Tokenizer)
|
24
|
+
- [Punkt](https://github.com/lfcipriani/punkt-segmenter)
|
25
|
+
|
26
|
+
What makes SRX different is:
|
27
|
+
|
28
|
+
- It allows easy customization and exchange of rules via SRX files
|
29
|
+
- It preserves whitespace surrounding break points
|
30
|
+
- It offers advanced XML/HTML tag handling: it won't be fooled by false breaks
|
31
|
+
in e.g. attribute values
|
32
|
+
|
33
|
+
Some other advantages that are not unique to SRX:
|
34
|
+
|
35
|
+
- It is offered under a very permissive license
|
36
|
+
- It is relatively lightweight as a dependency
|
37
|
+
- It is fast (though this depends somewhat on the ruleset you use)
|
38
|
+
|
39
|
+
Some disadvantages:
|
40
|
+
|
41
|
+
- It is inherently rule-based, with all of the weaknesses that implies
|
42
|
+
- It is not very accurate on the [Golden Rules
|
43
|
+
test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
|
44
|
+
scoring 47% (English) and 48% (others) with the default rules. However, you can
|
45
|
+
improve on that with better rules such as
|
46
|
+
[LanguageTool's](https://github.com/languagetool-org/languagetool/blob/05707300df14668e97d064811931e0668f2b695b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx).
|
47
|
+
|
48
|
+
## Installation
|
49
|
+
|
50
|
+
Add this line to your application's Gemfile:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
gem 'srx'
|
54
|
+
```
|
55
|
+
|
56
|
+
And then execute:
|
57
|
+
|
58
|
+
$ bundle install
|
59
|
+
|
60
|
+
Or install it yourself as:
|
61
|
+
|
62
|
+
$ gem install srx
|
63
|
+
|
64
|
+
## Usage
|
65
|
+
|
66
|
+
Use the default rules like so. Specify the language according to the `<maprules>`
|
67
|
+
of your SRX (usually two-letter [ISO 639-1
|
68
|
+
codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)).
|
69
|
+
|
70
|
+
```ruby
|
71
|
+
require 'srx'
|
72
|
+
|
73
|
+
data = Srx::Data.default
|
74
|
+
engine = Srx::Engine.new(data)
|
75
|
+
engine.segment('Hi. How are you?', language: 'en') #=> ["Hi.", " How are you?"]
|
76
|
+
```
|
77
|
+
|
78
|
+
Or bring your own rules:
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
data = Srx::Data.from_file(path: 'path/to/my/rules.srx')
|
82
|
+
engine = Srx::Engine.new(data)
|
83
|
+
```
|
84
|
+
|
85
|
+
Specify the format as `:xml` or `:html` to benefit from special handling of
|
86
|
+
tags:
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
# This should only be one segment, but handling as plain text incorrectly
|
90
|
+
# produces two segments.
|
91
|
+
input = 'foo <bar baz="a. b."> bazinga'
|
92
|
+
|
93
|
+
Srx::Engine.new(Srx::Data.default).segment(input, language: 'en')
|
94
|
+
#=> ["foo <bar baz=\"a.", " b.\"> bazinga"]
|
95
|
+
|
96
|
+
Srx::Engine.new(data, format: :xml).segment(input, language: 'en')
|
97
|
+
#=> ["foo <bar baz=\"a. b.\"> bazinga"]
|
98
|
+
```
|
99
|
+
|
100
|
+
## Development
|
101
|
+
|
102
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
103
|
+
`rake test` to run the tests. You can also run `bin/console` for an interactive
|
104
|
+
prompt that will allow you to experiment.
|
105
|
+
|
106
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To
|
107
|
+
release a new version, update the version number in `version.rb`, and then run
|
108
|
+
`bundle exec rake release`, which will create a git tag for the version, push
|
109
|
+
git commits and the created tag, and push the `.gem` file to
|
110
|
+
[rubygems.org](https://rubygems.org).
|
111
|
+
|
112
|
+
## Contributing
|
113
|
+
|
114
|
+
Bug reports and pull requests are welcome on GitHub at
|
115
|
+
https://github.com/amake/srx.
|
116
|
+
|
117
|
+
## License
|
118
|
+
|
119
|
+
The gem is available as open source under the terms of the [MIT
|
120
|
+
License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rake/testtask'
|
5
|
+
|
6
|
+
Rake::TestTask.new(:test) do |t|
|
7
|
+
t.libs << 'test'
|
8
|
+
t.libs << 'lib'
|
9
|
+
t.test_files = FileList['test/**/*_test.rb']
|
10
|
+
end
|
11
|
+
|
12
|
+
require 'rubocop/rake_task'
|
13
|
+
|
14
|
+
RuboCop::RakeTask.new
|
15
|
+
|
16
|
+
task default: %i[test rubocop]
|
data/bin/benchmark
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'srx'
|
6
|
+
require 'optparse'
|
7
|
+
require 'benchmark'
|
8
|
+
|
9
|
+
options = {}
|
10
|
+
OptionParser.new do |opts|
|
11
|
+
opts.banner = "Usage: #{$PROGRAM_NAME} [options]"
|
12
|
+
|
13
|
+
opts.on('-sFILE', '--srx FILE', 'SRX file (optional)')
|
14
|
+
opts.on('-fFORMAT', '--format FORMAT', 'Format of input text (default: text)')
|
15
|
+
end.parse!(into: options)
|
16
|
+
|
17
|
+
data = if options[:srx]
|
18
|
+
Srx::Data.from_file(path: options[:srx])
|
19
|
+
else
|
20
|
+
Srx::Data.default
|
21
|
+
end
|
22
|
+
format = options[:format]&.to_sym || :text
|
23
|
+
engine = Srx::Engine.new(data, format: format)
|
24
|
+
|
25
|
+
license_text = File.open(File.expand_path('../LICENSE.txt', __dir__), &:read).strip.then { |t| Srx::Util.unwrap(t) }
|
26
|
+
|
27
|
+
# Golden Rules speed test; see
|
28
|
+
# https://github.com/diasks2/pragmatic_segmenter#speed-performance-benchmarks
|
29
|
+
gold_text = Srx::Util.unwrap(<<~TXT)
|
30
|
+
Hello World. My name is Jonas.
|
31
|
+
What is your name? My name is Jonas.
|
32
|
+
There it is! I found it.
|
33
|
+
My name is Jonas E. Smith.
|
34
|
+
Please turn to p. 55.
|
35
|
+
Were Jane and co. at the party?
|
36
|
+
They closed the deal with Pitt, Briggs & Co. at noon.
|
37
|
+
Let's ask Jane and co. They should know.
|
38
|
+
They closed the deal with Pitt, Briggs & Co. It closed yesterday.
|
39
|
+
I can see Mt. Fuji from here.
|
40
|
+
St. Michael's Church is on 5th st. near the light.
|
41
|
+
That is JFK Jr.'s book.
|
42
|
+
I visited the U.S.A. last year.
|
43
|
+
I live in the E.U. How about you?
|
44
|
+
I live in the U.S. How about you?
|
45
|
+
I work for the U.S. Government in Virginia.
|
46
|
+
I have lived in the U.S. for 20 years.
|
47
|
+
At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.
|
48
|
+
She has $100.00 in her bag.
|
49
|
+
She has $100.00. It is in her bag.
|
50
|
+
He teaches science (He previously worked for 5 years as an engineer.) at the local University.
|
51
|
+
Her email is Jane.Doe@example.com. I sent her an email.
|
52
|
+
The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.
|
53
|
+
She turned to him, 'This is great.' she said.
|
54
|
+
She turned to him, "This is great." she said.
|
55
|
+
She turned to him, "This is great." She held the book out to show him.
|
56
|
+
Hello!! Long time no see.
|
57
|
+
Hello?? Who is there?
|
58
|
+
Hello!? Is that you?
|
59
|
+
Hello?! Is that you?
|
60
|
+
1.) The first item 2.) The second item
|
61
|
+
1.) The first item. 2.) The second item.
|
62
|
+
1) The first item 2) The second item
|
63
|
+
1) The first item. 2) The second item.
|
64
|
+
1. The first item 2. The second item
|
65
|
+
1. The first item. 2. The second item.
|
66
|
+
• 9. The first item • 10. The second item
|
67
|
+
⁃9. The first item ⁃10. The second item
|
68
|
+
a. The first item b. The second item c. The third list item
|
69
|
+
This is a sentence
|
70
|
+
cut off in the middle because pdf.
|
71
|
+
It was a cold#{' '}
|
72
|
+
night in the city.
|
73
|
+
features
|
74
|
+
contact manager
|
75
|
+
events, activities
|
76
|
+
|
77
|
+
You can find it at N°. 1026.253.553. That is where the treasure is.
|
78
|
+
She works at Yahoo! in the accounting department.
|
79
|
+
We make a good team, you and I. Did you see Albert I. Jones yesterday?
|
80
|
+
Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”
|
81
|
+
"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).
|
82
|
+
If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.
|
83
|
+
I never meant that.... She left the store.
|
84
|
+
I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.
|
85
|
+
One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .
|
86
|
+
TXT
|
87
|
+
|
88
|
+
n = 100
|
89
|
+
|
90
|
+
Benchmark.bm do |x|
|
91
|
+
x.report('LICENSE.txt (en)') { n.times { engine.segment(license_text, language: 'en') } }
|
92
|
+
x.report('LICENSE.txt (zz)') { n.times { engine.segment(license_text, language: 'zz') } }
|
93
|
+
x.report('Golden Rules (en)') { n.times { engine.segment(gold_text, language: 'en') } }
|
94
|
+
end
|