srx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 18c2fc1da5f4a792393a9dd4955c79d73bc9d00943000f38689a5763d8c8ff4c
4
+ data.tar.gz: 2385b9f65e61ad291e1419e217b737653bbeea0d4f62dc7fbb4ae35e6948984c
5
+ SHA512:
6
+ metadata.gz: 915b837e6a239f7de688d51ca3fca2bbdcf898234f825cbd49894223e8c31f71167d6a22d9097a36ab75ea3fc8f383ae790d3df8e0e3fce2fb8210022412fad2
7
+ data.tar.gz: 326d3de2c328aa49be07c7df62229fe40c8cf3b2c4101f090b9c805b9e3a2fcd1c8eec155b6b3af9c556ff5c548d5263425975bdc9b0368b334342ce47dd71d9
data/.dir-locals.el ADDED
@@ -0,0 +1,4 @@
1
+ ;;; Directory Local Variables
2
+ ;;; For more information see (info "(emacs) Directory Variables")
3
+
4
+ ((ruby-mode . ((lsp-solargraph-use-bundler . t))))
@@ -0,0 +1,21 @@
1
+ name: Ruby
2
+
3
+ on: [push,pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v2
10
+ - name: Set up Ruby
11
+ uses: ruby/setup-ruby@v1
12
+ with:
13
+ ruby-version: 2.7.2
14
+ - name: Install
15
+ run: |
16
+ gem install bundler -v 2.2.7
17
+ bundle install
18
+ - name: Type check
19
+ run: bundle exec solargraph typecheck --level typed
20
+ - name: Unit tests
21
+ run: bundle exec rake
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ /vendor
10
+
11
+ .byebug_history
data/.rubocop.yml ADDED
@@ -0,0 +1,13 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
3
+ AllCops:
4
+ TargetRubyVersion: 2.4
5
+ SuggestExtensions: false
6
+ NewCops: enable
7
+ Exclude:
8
+ - '**/*~'
9
+
10
+ Layout/LineLength:
11
+ Max: 120
12
+ Exclude:
13
+ - 'test/srx/golden_rules_test.rb'
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,33 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2021-02-08 14:52:03 UTC using RuboCop version 1.9.1.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 3
10
+ # Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
11
+ Metrics/AbcSize:
12
+ Max: 24
13
+
14
+ # Offense count: 6
15
+ # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
16
+ # IgnoredMethods: refine
17
+ Metrics/BlockLength:
18
+ Max: 269
19
+
20
+ # Offense count: 1
21
+ # Configuration parameters: IgnoredMethods.
22
+ Metrics/CyclomaticComplexity:
23
+ Max: 9
24
+
25
+ # Offense count: 3
26
+ # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
27
+ Metrics/MethodLength:
28
+ Max: 25
29
+
30
+ # Offense count: 1
31
+ # Configuration parameters: IgnoredMethods.
32
+ Metrics/PerceivedComplexity:
33
+ Max: 10
data/.solargraph.yml ADDED
@@ -0,0 +1,17 @@
1
+ ---
2
+ include:
3
+ - "**/*.rb"
4
+ exclude:
5
+ - spec/**/*
6
+ - test/**/*
7
+ - vendor/**/*
8
+ - ".bundle/**/*"
9
+ require: []
10
+ domains: []
11
+ reporters:
12
+ - rubocop
13
+ - require_not_found
14
+ - typecheck:typed
15
+ require_paths: []
16
+ plugins: []
17
+ max_files: 5000
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in srx.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,84 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ srx (0.1.0)
5
+ nokogiri (~> 1.11)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ ast (2.4.2)
11
+ backport (1.1.2)
12
+ benchmark (0.1.1)
13
+ byebug (11.1.3)
14
+ diff-lcs (1.4.4)
15
+ e2mmap (0.1.0)
16
+ jaro_winkler (1.5.4)
17
+ kramdown (2.3.0)
18
+ rexml
19
+ kramdown-parser-gfm (1.1.0)
20
+ kramdown (~> 2.0)
21
+ memory_profiler (1.0.0)
22
+ minitest (5.14.3)
23
+ nokogiri (1.11.1-x86_64-darwin)
24
+ racc (~> 1.4)
25
+ parallel (1.20.1)
26
+ parser (3.0.0.0)
27
+ ast (~> 2.4.1)
28
+ racc (1.5.2)
29
+ rainbow (3.0.0)
30
+ rake (13.0.3)
31
+ regexp_parser (2.0.3)
32
+ reverse_markdown (2.0.0)
33
+ nokogiri
34
+ rexml (3.2.4)
35
+ rspec-expectations (3.10.1)
36
+ diff-lcs (>= 1.2.0, < 2.0)
37
+ rspec-support (~> 3.10.0)
38
+ rspec-support (3.10.2)
39
+ rubocop (1.9.1)
40
+ parallel (~> 1.10)
41
+ parser (>= 3.0.0.0)
42
+ rainbow (>= 2.2.2, < 4.0)
43
+ regexp_parser (>= 1.8, < 3.0)
44
+ rexml
45
+ rubocop-ast (>= 1.2.0, < 2.0)
46
+ ruby-progressbar (~> 1.7)
47
+ unicode-display_width (>= 1.4.0, < 3.0)
48
+ rubocop-ast (1.4.1)
49
+ parser (>= 2.7.1.5)
50
+ ruby-progressbar (1.11.0)
51
+ solargraph (0.40.3)
52
+ backport (~> 1.1)
53
+ benchmark
54
+ bundler (>= 1.17.2)
55
+ e2mmap
56
+ jaro_winkler (~> 1.5)
57
+ kramdown (~> 2.3)
58
+ kramdown-parser-gfm (~> 1.1)
59
+ parser (~> 3.0)
60
+ reverse_markdown (>= 1.0.5, < 3)
61
+ rubocop (>= 0.52)
62
+ thor (~> 1.0)
63
+ tilt (~> 2.0)
64
+ yard (~> 0.9, >= 0.9.24)
65
+ thor (1.1.0)
66
+ tilt (2.0.10)
67
+ unicode-display_width (2.0.0)
68
+ yard (0.9.26)
69
+
70
+ PLATFORMS
71
+ x86_64-darwin-20
72
+
73
+ DEPENDENCIES
74
+ byebug
75
+ memory_profiler
76
+ minitest
77
+ rake
78
+ rspec-expectations
79
+ rubocop
80
+ solargraph
81
+ srx!
82
+
83
+ BUNDLED WITH
84
+ 2.2.7
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Aaron Madlon-Kay
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,120 @@
1
+ # SRX for Ruby
2
+
3
+ SRX is a specification for segmenting text, i.e. splitting text into sentences.
4
+ More specifically, it is:
5
+
6
+ - An XML-based format for specifying segmentation rules, and
7
+ - An algorithm by which the rules are applied
8
+
9
+ See the [SRX 2.0 Specification](http://www.ttt.org/oscarStandards/srx/srx20.html)
10
+ for full details.
11
+
12
+ This gem provides facilities for reading SRX files and an engine for performing
13
+ segmentation.
14
+
15
+ Only a minimal rule set is supplied by default; for actual usage you are
16
+ encouraged to supply your own SRX rules.
17
+
18
+ ## What's different about this gem?
19
+
20
+ There are lots of good segmentation gems out there such as
21
+
22
+ - [pragmatic_segmenter](https://github.com/diasks2/pragmatic_segmenter)
23
+ - [TactfulTokenizer](https://github.com/zencephalon/Tactful_Tokenizer)
24
+ - [Punkt](https://github.com/lfcipriani/punkt-segmenter)
25
+
26
+ What makes SRX different is:
27
+
28
+ - It allows easy customization and exchange of rules via SRX files
29
+ - It preserves whitespace surrounding break points
30
+ - It offers advanced XML/HTML tag handling: it won't be fooled by false breaks
31
+ in e.g. attribute values
32
+
33
+ Some other advantages that are not unique to SRX:
34
+
35
+ - It is offered under a very permissive license
36
+ - It is relatively lightweight as a dependency
37
+ - It is fast (though this depends somewhat on the ruleset you use)
38
+
39
+ Some disadvantages:
40
+
41
+ - It is inherently rule-based, with all of the weaknesses that implies
42
+ - It is not very accurate on the [Golden Rules
43
+ test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
44
+ scoring 47% (English) and 48% (others) with the default rules. However, you can
45
+ improve on that with better rules such as
46
+ [LanguageTool's](https://github.com/languagetool-org/languagetool/blob/05707300df14668e97d064811931e0668f2b695b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx).
47
+
48
+ ## Installation
49
+
50
+ Add this line to your application's Gemfile:
51
+
52
+ ```ruby
53
+ gem 'srx'
54
+ ```
55
+
56
+ And then execute:
57
+
58
+ $ bundle install
59
+
60
+ Or install it yourself as:
61
+
62
+ $ gem install srx
63
+
64
+ ## Usage
65
+
66
+ Use the default rules like so. Specify the language according to the `<maprules>`
67
+ of your SRX (usually two-letter [ISO 639-1
68
+ codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)).
69
+
70
+ ```ruby
71
+ require 'srx'
72
+
73
+ data = Srx::Data.default
74
+ engine = Srx::Engine.new(data)
75
+ engine.segment('Hi. How are you?', language: 'en') #=> ["Hi.", " How are you?"]
76
+ ```
77
+
78
+ Or bring your own rules:
79
+
80
+ ```ruby
81
+ data = Srx::Data.from_file(path: 'path/to/my/rules.srx')
82
+ engine = Srx::Engine.new(data)
83
+ ```
84
+
85
+ Specify the format as `:xml` or `:html` to benefit from special handling of
86
+ tags:
87
+
88
+ ```ruby
89
+ # This should only be one segment, but handling as plain text incorrectly
90
+ # produces two segments.
91
+ input = 'foo <bar baz="a. b."> bazinga'
92
+
93
+ Srx::Engine.new(Srx::Data.default).segment(input, language: 'en')
94
+ #=> ["foo <bar baz=\"a.", " b.\"> bazinga"]
95
+
96
+ Srx::Engine.new(data, format: :xml).segment(input, language: 'en')
97
+ #=> ["foo <bar baz=\"a. b.\"> bazinga"]
98
+ ```
99
+
100
+ ## Development
101
+
102
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run
103
+ `rake test` to run the tests. You can also run `bin/console` for an interactive
104
+ prompt that will allow you to experiment.
105
+
106
+ To install this gem onto your local machine, run `bundle exec rake install`. To
107
+ release a new version, update the version number in `version.rb`, and then run
108
+ `bundle exec rake release`, which will create a git tag for the version, push
109
+ git commits and the created tag, and push the `.gem` file to
110
+ [rubygems.org](https://rubygems.org).
111
+
112
+ ## Contributing
113
+
114
+ Bug reports and pull requests are welcome on GitHub at
115
+ https://github.com/amake/srx.
116
+
117
+ ## License
118
+
119
+ The gem is available as open source under the terms of the [MIT
120
+ License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rake/testtask'
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs << 'test'
8
+ t.libs << 'lib'
9
+ t.test_files = FileList['test/**/*_test.rb']
10
+ end
11
+
12
+ require 'rubocop/rake_task'
13
+
14
+ RuboCop::RakeTask.new
15
+
16
+ task default: %i[test rubocop]
data/bin/benchmark ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'srx'
6
+ require 'optparse'
7
+ require 'benchmark'
8
+
9
+ options = {}
10
+ OptionParser.new do |opts|
11
+ opts.banner = "Usage: #{$PROGRAM_NAME} [options]"
12
+
13
+ opts.on('-sFILE', '--srx FILE', 'SRX file (optional)')
14
+ opts.on('-fFORMAT', '--format FORMAT', 'Format of input text (default: text)')
15
+ end.parse!(into: options)
16
+
17
+ data = if options[:srx]
18
+ Srx::Data.from_file(path: options[:srx])
19
+ else
20
+ Srx::Data.default
21
+ end
22
+ format = options[:format]&.to_sym || :text
23
+ engine = Srx::Engine.new(data, format: format)
24
+
25
+ license_text = File.open(File.expand_path('../LICENSE.txt', __dir__), &:read).strip.then { |t| Srx::Util.unwrap(t) }
26
+
27
+ # Golden Rules speed test; see
28
+ # https://github.com/diasks2/pragmatic_segmenter#speed-performance-benchmarks
29
+ gold_text = Srx::Util.unwrap(<<~TXT)
30
+ Hello World. My name is Jonas.
31
+ What is your name? My name is Jonas.
32
+ There it is! I found it.
33
+ My name is Jonas E. Smith.
34
+ Please turn to p. 55.
35
+ Were Jane and co. at the party?
36
+ They closed the deal with Pitt, Briggs & Co. at noon.
37
+ Let's ask Jane and co. They should know.
38
+ They closed the deal with Pitt, Briggs & Co. It closed yesterday.
39
+ I can see Mt. Fuji from here.
40
+ St. Michael's Church is on 5th st. near the light.
41
+ That is JFK Jr.'s book.
42
+ I visited the U.S.A. last year.
43
+ I live in the E.U. How about you?
44
+ I live in the U.S. How about you?
45
+ I work for the U.S. Government in Virginia.
46
+ I have lived in the U.S. for 20 years.
47
+ At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.
48
+ She has $100.00 in her bag.
49
+ She has $100.00. It is in her bag.
50
+ He teaches science (He previously worked for 5 years as an engineer.) at the local University.
51
+ Her email is Jane.Doe@example.com. I sent her an email.
52
+ The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.
53
+ She turned to him, 'This is great.' she said.
54
+ She turned to him, "This is great." she said.
55
+ She turned to him, "This is great." She held the book out to show him.
56
+ Hello!! Long time no see.
57
+ Hello?? Who is there?
58
+ Hello!? Is that you?
59
+ Hello?! Is that you?
60
+ 1.) The first item 2.) The second item
61
+ 1.) The first item. 2.) The second item.
62
+ 1) The first item 2) The second item
63
+ 1) The first item. 2) The second item.
64
+ 1. The first item 2. The second item
65
+ 1. The first item. 2. The second item.
66
+ • 9. The first item • 10. The second item
67
+ ⁃9. The first item ⁃10. The second item
68
+ a. The first item b. The second item c. The third list item
69
+ This is a sentence
70
+ cut off in the middle because pdf.
71
+ It was a cold#{' '}
72
+ night in the city.
73
+ features
74
+ contact manager
75
+ events, activities
76
+
77
+ You can find it at N°. 1026.253.553. That is where the treasure is.
78
+ She works at Yahoo! in the accounting department.
79
+ We make a good team, you and I. Did you see Albert I. Jones yesterday?
80
+ Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”
81
+ "Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).
82
+ If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.
83
+ I never meant that.... She left the store.
84
+ I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.
85
+ One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .
86
+ TXT
87
+
88
+ n = 100
89
+
90
+ Benchmark.bm do |x|
91
+ x.report('LICENSE.txt (en)') { n.times { engine.segment(license_text, language: 'en') } }
92
+ x.report('LICENSE.txt (zz)') { n.times { engine.segment(license_text, language: 'zz') } }
93
+ x.report('Golden Rules (en)') { n.times { engine.segment(gold_text, language: 'en') } }
94
+ end