srx 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 18c2fc1da5f4a792393a9dd4955c79d73bc9d00943000f38689a5763d8c8ff4c
4
+ data.tar.gz: 2385b9f65e61ad291e1419e217b737653bbeea0d4f62dc7fbb4ae35e6948984c
5
+ SHA512:
6
+ metadata.gz: 915b837e6a239f7de688d51ca3fca2bbdcf898234f825cbd49894223e8c31f71167d6a22d9097a36ab75ea3fc8f383ae790d3df8e0e3fce2fb8210022412fad2
7
+ data.tar.gz: 326d3de2c328aa49be07c7df62229fe40c8cf3b2c4101f090b9c805b9e3a2fcd1c8eec155b6b3af9c556ff5c548d5263425975bdc9b0368b334342ce47dd71d9
data/.dir-locals.el ADDED
@@ -0,0 +1,4 @@
1
+ ;;; Directory Local Variables
2
+ ;;; For more information see (info "(emacs) Directory Variables")
3
+
4
+ ((ruby-mode . ((lsp-solargraph-use-bundler . t))))
@@ -0,0 +1,21 @@
1
+ name: Ruby
2
+
3
+ on: [push,pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v2
10
+ - name: Set up Ruby
11
+ uses: ruby/setup-ruby@v1
12
+ with:
13
+ ruby-version: 2.7.2
14
+ - name: Install
15
+ run: |
16
+ gem install bundler -v 2.2.7
17
+ bundle install
18
+ - name: Type check
19
+ run: bundle exec solargraph typecheck --level typed
20
+ - name: Unit tests
21
+ run: bundle exec rake
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ /vendor
10
+
11
+ .byebug_history
data/.rubocop.yml ADDED
@@ -0,0 +1,13 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
3
+ AllCops:
4
+ TargetRubyVersion: 2.4
5
+ SuggestExtensions: false
6
+ NewCops: enable
7
+ Exclude:
8
+ - '**/*~'
9
+
10
+ Layout/LineLength:
11
+ Max: 120
12
+ Exclude:
13
+ - 'test/srx/golden_rules_test.rb'
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,33 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2021-02-08 14:52:03 UTC using RuboCop version 1.9.1.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 3
10
+ # Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
11
+ Metrics/AbcSize:
12
+ Max: 24
13
+
14
+ # Offense count: 6
15
+ # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
16
+ # IgnoredMethods: refine
17
+ Metrics/BlockLength:
18
+ Max: 269
19
+
20
+ # Offense count: 1
21
+ # Configuration parameters: IgnoredMethods.
22
+ Metrics/CyclomaticComplexity:
23
+ Max: 9
24
+
25
+ # Offense count: 3
26
+ # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
27
+ Metrics/MethodLength:
28
+ Max: 25
29
+
30
+ # Offense count: 1
31
+ # Configuration parameters: IgnoredMethods.
32
+ Metrics/PerceivedComplexity:
33
+ Max: 10
data/.solargraph.yml ADDED
@@ -0,0 +1,17 @@
1
+ ---
2
+ include:
3
+ - "**/*.rb"
4
+ exclude:
5
+ - spec/**/*
6
+ - test/**/*
7
+ - vendor/**/*
8
+ - ".bundle/**/*"
9
+ require: []
10
+ domains: []
11
+ reporters:
12
+ - rubocop
13
+ - require_not_found
14
+ - typecheck:typed
15
+ require_paths: []
16
+ plugins: []
17
+ max_files: 5000
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in srx.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,84 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ srx (0.1.0)
5
+ nokogiri (~> 1.11)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ ast (2.4.2)
11
+ backport (1.1.2)
12
+ benchmark (0.1.1)
13
+ byebug (11.1.3)
14
+ diff-lcs (1.4.4)
15
+ e2mmap (0.1.0)
16
+ jaro_winkler (1.5.4)
17
+ kramdown (2.3.0)
18
+ rexml
19
+ kramdown-parser-gfm (1.1.0)
20
+ kramdown (~> 2.0)
21
+ memory_profiler (1.0.0)
22
+ minitest (5.14.3)
23
+ nokogiri (1.11.1-x86_64-darwin)
24
+ racc (~> 1.4)
25
+ parallel (1.20.1)
26
+ parser (3.0.0.0)
27
+ ast (~> 2.4.1)
28
+ racc (1.5.2)
29
+ rainbow (3.0.0)
30
+ rake (13.0.3)
31
+ regexp_parser (2.0.3)
32
+ reverse_markdown (2.0.0)
33
+ nokogiri
34
+ rexml (3.2.4)
35
+ rspec-expectations (3.10.1)
36
+ diff-lcs (>= 1.2.0, < 2.0)
37
+ rspec-support (~> 3.10.0)
38
+ rspec-support (3.10.2)
39
+ rubocop (1.9.1)
40
+ parallel (~> 1.10)
41
+ parser (>= 3.0.0.0)
42
+ rainbow (>= 2.2.2, < 4.0)
43
+ regexp_parser (>= 1.8, < 3.0)
44
+ rexml
45
+ rubocop-ast (>= 1.2.0, < 2.0)
46
+ ruby-progressbar (~> 1.7)
47
+ unicode-display_width (>= 1.4.0, < 3.0)
48
+ rubocop-ast (1.4.1)
49
+ parser (>= 2.7.1.5)
50
+ ruby-progressbar (1.11.0)
51
+ solargraph (0.40.3)
52
+ backport (~> 1.1)
53
+ benchmark
54
+ bundler (>= 1.17.2)
55
+ e2mmap
56
+ jaro_winkler (~> 1.5)
57
+ kramdown (~> 2.3)
58
+ kramdown-parser-gfm (~> 1.1)
59
+ parser (~> 3.0)
60
+ reverse_markdown (>= 1.0.5, < 3)
61
+ rubocop (>= 0.52)
62
+ thor (~> 1.0)
63
+ tilt (~> 2.0)
64
+ yard (~> 0.9, >= 0.9.24)
65
+ thor (1.1.0)
66
+ tilt (2.0.10)
67
+ unicode-display_width (2.0.0)
68
+ yard (0.9.26)
69
+
70
+ PLATFORMS
71
+ x86_64-darwin-20
72
+
73
+ DEPENDENCIES
74
+ byebug
75
+ memory_profiler
76
+ minitest
77
+ rake
78
+ rspec-expectations
79
+ rubocop
80
+ solargraph
81
+ srx!
82
+
83
+ BUNDLED WITH
84
+ 2.2.7
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Aaron Madlon-Kay
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,120 @@
1
+ # SRX for Ruby
2
+
3
+ SRX is a specification for segmenting text, i.e. splitting text into sentences.
4
+ More specifically, it is
5
+
6
+ - An XML-based format for specifying segmentation rules, and
7
+ - An algorithm by which the rules are applied
8
+
9
+ See the [SRX 2.0 Specification](http://www.ttt.org/oscarStandards/srx/srx20.html)
10
+ for full details.
11
+
12
+ This gem provides facilities for reading SRX files and an engine for performing
13
+ segmentation.
14
+
15
+ Only a minimal rule set is supplied by default; for actual usage you are
16
+ encouraged to supply your own SRX rules.
17
+
18
+ ## What's different about this gem?
19
+
20
+ There are lots of good segmentation gems out there such as
21
+
22
+ - [pragmatic_segmenter](https://github.com/diasks2/pragmatic_segmenter)
23
+ - [TactfulTokenizer](https://github.com/zencephalon/Tactful_Tokenizer)
24
+ - [Punkt](https://github.com/lfcipriani/punkt-segmenter)
25
+
26
+ What makes SRX different is:
27
+
28
+ - It allows easy customization and exchange of rules via SRX files
29
+ - It preserves whitespace surrounding break points
30
+ - It offers advanced XML/HTML tag handling: it won't be fooled by false breaks
31
+ in e.g. attribute values
32
+
33
+ Some other advantages that are not unique to SRX:
34
+
35
+ - It is offered under a very permissive license
36
+ - It is relatively lightweight as a dependency
37
+ - It is fast (though this depends somewhat on the ruleset you use)
38
+
39
+ Some disadvantages:
40
+
41
+ - It is inherently rule-based, with all of the weaknesses that implies
42
+ - It is not very accurate on the [Golden Rules
43
+ test](https://github.com/diasks2/pragmatic_segmenter#comparison-of-segmentation-tools-libraries-and-algorithms),
44
+ scoring 47% (English) and 48% (others) with the default rules. However, you can
45
+ improve on that with better rules such as
46
+ [LanguageTool's](https://github.com/languagetool-org/languagetool/blob/05707300df14668e97d064811931e0668f2b695b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx).
47
+
48
+ ## Installation
49
+
50
+ Add this line to your application's Gemfile:
51
+
52
+ ```ruby
53
+ gem 'srx'
54
+ ```
55
+
56
+ And then execute:
57
+
58
+ $ bundle install
59
+
60
+ Or install it yourself as:
61
+
62
+ $ gem install srx
63
+
64
+ ## Usage
65
+
66
+ Use the default rules like so. Specify the language according to the `<maprules>`
67
+ of your SRX (usually two-letter [ISO 639-1
68
+ codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)).
69
+
70
+ ```ruby
71
+ require 'srx'
72
+
73
+ data = Srx::Data.default
74
+ engine = Srx::Engine.new(data)
75
+ engine.segment('Hi. How are you?', language: 'en') #=> ["Hi.", " How are you?"]
76
+ ```
77
+
78
+ Or bring your own rules:
79
+
80
+ ```ruby
81
+ data = Srx::Data.from_file(path: 'path/to/my/rules.srx')
82
+ engine = Srx::Engine.new(data)
83
+ ```
84
+
85
+ Specify the format as `:xml` or `:html` to benefit from special handling of
86
+ tags:
87
+
88
+ ```ruby
89
+ # This should only be one segment, but handling as plain text incorrectly
90
+ # produces two segments.
91
+ input = 'foo <bar baz="a. b."> bazinga'
92
+
93
+ Srx::Engine.new(Srx::Data.default).segment(input, language: 'en')
94
+ #=> ["foo <bar baz=\"a.", " b.\"> bazinga"]
95
+
96
+ Srx::Engine.new(data, format: :xml).segment(input, language: 'en')
97
+ #=> ["foo <bar baz=\"a. b.\"> bazinga"]
98
+ ```
99
+
100
+ ## Development
101
+
102
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run
103
+ `rake test` to run the tests. You can also run `bin/console` for an interactive
104
+ prompt that will allow you to experiment.
105
+
106
+ To install this gem onto your local machine, run `bundle exec rake install`. To
107
+ release a new version, update the version number in `version.rb`, and then run
108
+ `bundle exec rake release`, which will create a git tag for the version, push
109
+ git commits and the created tag, and push the `.gem` file to
110
+ [rubygems.org](https://rubygems.org).
111
+
112
+ ## Contributing
113
+
114
+ Bug reports and pull requests are welcome on GitHub at
115
+ https://github.com/amake/srx.
116
+
117
+ ## License
118
+
119
+ The gem is available as open source under the terms of the [MIT
120
+ License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rake/testtask'
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs << 'test'
8
+ t.libs << 'lib'
9
+ t.test_files = FileList['test/**/*_test.rb']
10
+ end
11
+
12
+ require 'rubocop/rake_task'
13
+
14
+ RuboCop::RakeTask.new
15
+
16
+ task default: %i[test rubocop]
data/bin/benchmark ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'srx'
6
+ require 'optparse'
7
+ require 'benchmark'
8
+
9
+ options = {}
10
+ OptionParser.new do |opts|
11
+ opts.banner = "Usage: #{$PROGRAM_NAME} [options]"
12
+
13
+ opts.on('-sFILE', '--srx FILE', 'SRX file (optional)')
14
+ opts.on('-fFORMAT', '--format FORMAT', 'Format of input text (default: text)')
15
+ end.parse!(into: options)
16
+
17
+ data = if options[:srx]
18
+ Srx::Data.from_file(path: options[:srx])
19
+ else
20
+ Srx::Data.default
21
+ end
22
+ format = options[:format]&.to_sym || :text
23
+ engine = Srx::Engine.new(data, format: format)
24
+
25
+ license_text = File.open(File.expand_path('../LICENSE.txt', __dir__), &:read).strip.then { |t| Srx::Util.unwrap(t) }
26
+
27
+ # Golden Rules speed test; see
28
+ # https://github.com/diasks2/pragmatic_segmenter#speed-performance-benchmarks
29
+ gold_text = Srx::Util.unwrap(<<~TXT)
30
+ Hello World. My name is Jonas.
31
+ What is your name? My name is Jonas.
32
+ There it is! I found it.
33
+ My name is Jonas E. Smith.
34
+ Please turn to p. 55.
35
+ Were Jane and co. at the party?
36
+ They closed the deal with Pitt, Briggs & Co. at noon.
37
+ Let's ask Jane and co. They should know.
38
+ They closed the deal with Pitt, Briggs & Co. It closed yesterday.
39
+ I can see Mt. Fuji from here.
40
+ St. Michael's Church is on 5th st. near the light.
41
+ That is JFK Jr.'s book.
42
+ I visited the U.S.A. last year.
43
+ I live in the E.U. How about you?
44
+ I live in the U.S. How about you?
45
+ I work for the U.S. Government in Virginia.
46
+ I have lived in the U.S. for 20 years.
47
+ At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.
48
+ She has $100.00 in her bag.
49
+ She has $100.00. It is in her bag.
50
+ He teaches science (He previously worked for 5 years as an engineer.) at the local University.
51
+ Her email is Jane.Doe@example.com. I sent her an email.
52
+ The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.
53
+ She turned to him, 'This is great.' she said.
54
+ She turned to him, "This is great." she said.
55
+ She turned to him, "This is great." She held the book out to show him.
56
+ Hello!! Long time no see.
57
+ Hello?? Who is there?
58
+ Hello!? Is that you?
59
+ Hello?! Is that you?
60
+ 1.) The first item 2.) The second item
61
+ 1.) The first item. 2.) The second item.
62
+ 1) The first item 2) The second item
63
+ 1) The first item. 2) The second item.
64
+ 1. The first item 2. The second item
65
+ 1. The first item. 2. The second item.
66
+ • 9. The first item • 10. The second item
67
+ ⁃9. The first item ⁃10. The second item
68
+ a. The first item b. The second item c. The third list item
69
+ This is a sentence
70
+ cut off in the middle because pdf.
71
+ It was a cold#{' '}
72
+ night in the city.
73
+ features
74
+ contact manager
75
+ events, activities
76
+
77
+ You can find it at N°. 1026.253.553. That is where the treasure is.
78
+ She works at Yahoo! in the accounting department.
79
+ We make a good team, you and I. Did you see Albert I. Jones yesterday?
80
+ Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”
81
+ "Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).
82
+ If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.
83
+ I never meant that.... She left the store.
84
+ I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.
85
+ One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .
86
+ TXT
87
+
88
+ n = 100
89
+
90
+ Benchmark.bm do |x|
91
+ x.report('LICENSE.txt (en)') { n.times { engine.segment(license_text, language: 'en') } }
92
+ x.report('LICENSE.txt (zz)') { n.times { engine.segment(license_text, language: 'zz') } }
93
+ x.report('Golden Rules (en)') { n.times { engine.segment(gold_text, language: 'en') } }
94
+ end