splitta 4.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.codeclimate.yml +20 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +135 -0
- data/.ruby-version +1 -0
- data/.travis.yml +15 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE +21 -0
- data/README.md +47 -0
- data/Rakefile +55 -0
- data/data/feats +0 -0
- data/data/lower_words +0 -0
- data/data/non_abbrs +0 -0
- data/data/src/feats +0 -0
- data/data/src/lower_words +0 -0
- data/data/src/non_abbrs +0 -0
- data/lib/splitta.rb +21 -0
- data/lib/splitta/doc.rb +50 -0
- data/lib/splitta/frag.rb +101 -0
- data/lib/splitta/model.rb +70 -0
- data/lib/splitta/version.rb +6 -0
- data/lib/splitta/word_tokenizer.rb +71 -0
- data/splitta.gemspec +27 -0
- metadata +151 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6d8e46cae786dd0a359ae4ba07fc88991c8655957635a12e24acc73954005a9c
|
4
|
+
data.tar.gz: 835385acb2401790fd2c077607c77b7a45621e02317b8b83fd7cff869898aefc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 698ac3cbdaf4671b86f931e963c2b1bd7cbb193c381ac4ffb5817b66a6849a8da8a28c73ce2a1a322e6891736765802a80f1e7a43da4ff977e998b97e5adb392
|
7
|
+
data.tar.gz: f959bd4dd3a79f687321aafc78fd1e20e8f39964610da758779d7b52038232b3f41e150ea5c11b197b8bd58313c048a956a069eeb5a89e8b4bd3477b2bbe9cd7
|
data/.codeclimate.yml
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
---
|
2
|
+
engines:
|
3
|
+
duplication:
|
4
|
+
enabled: true
|
5
|
+
config:
|
6
|
+
languages:
|
7
|
+
- ruby
|
8
|
+
checks:
|
9
|
+
Similar code:
|
10
|
+
enabled: false
|
11
|
+
fixme:
|
12
|
+
enabled: true
|
13
|
+
rubocop:
|
14
|
+
enabled: true
|
15
|
+
channel: rubocop-0-85
|
16
|
+
ratings:
|
17
|
+
paths:
|
18
|
+
- "**.rb"
|
19
|
+
exclude_paths:
|
20
|
+
- spec/
|
data/.gitignore
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
AllCops:
|
2
|
+
Exclude:
|
3
|
+
- splitta.gemspec
|
4
|
+
- Rakefile
|
5
|
+
|
6
|
+
Layout/EmptyLinesAroundAttributeAccessor:
|
7
|
+
Enabled: true
|
8
|
+
|
9
|
+
Layout/EmptyLinesAroundClassBody:
|
10
|
+
EnforcedStyle: empty_lines_except_namespace
|
11
|
+
|
12
|
+
Layout/EmptyLinesAroundModuleBody:
|
13
|
+
EnforcedStyle: empty_lines_except_namespace
|
14
|
+
|
15
|
+
Layout/ExtraSpacing:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Layout/LineLength:
|
19
|
+
Max: 120
|
20
|
+
Enabled: false
|
21
|
+
|
22
|
+
Layout/SpaceAroundMethodCallOperator:
|
23
|
+
Enabled: true
|
24
|
+
|
25
|
+
Lint/DeprecatedOpenSSLConstant:
|
26
|
+
Enabled: true
|
27
|
+
|
28
|
+
Lint/MixedRegexpCaptureTypes:
|
29
|
+
Enabled: true
|
30
|
+
|
31
|
+
Lint/RaiseException:
|
32
|
+
Enabled: true
|
33
|
+
|
34
|
+
Lint/StructNewOverride:
|
35
|
+
Enabled: true
|
36
|
+
|
37
|
+
Metrics/AbcSize:
|
38
|
+
Max: 50
|
39
|
+
Enabled: false
|
40
|
+
|
41
|
+
Metrics/BlockLength:
|
42
|
+
Max: 50
|
43
|
+
Enabled: false
|
44
|
+
|
45
|
+
Metrics/ClassLength:
|
46
|
+
Max: 50
|
47
|
+
Enabled: false
|
48
|
+
|
49
|
+
Metrics/CyclomaticComplexity:
|
50
|
+
Max: 30
|
51
|
+
Enabled: false
|
52
|
+
|
53
|
+
Metrics/MethodLength:
|
54
|
+
Max: 20
|
55
|
+
Enabled: false
|
56
|
+
|
57
|
+
Metrics/ModuleLength:
|
58
|
+
Max: 1000
|
59
|
+
Enabled: false
|
60
|
+
|
61
|
+
Metrics/PerceivedComplexity:
|
62
|
+
Max: 30
|
63
|
+
Enabled: false
|
64
|
+
|
65
|
+
Security/MarshalLoad:
|
66
|
+
Enabled: false
|
67
|
+
|
68
|
+
Style/AndOr:
|
69
|
+
Enabled: false
|
70
|
+
|
71
|
+
Style/CaseEquality:
|
72
|
+
Enabled: false
|
73
|
+
|
74
|
+
Style/Documentation:
|
75
|
+
Enabled: false
|
76
|
+
|
77
|
+
Style/DoubleNegation:
|
78
|
+
Enabled: false
|
79
|
+
|
80
|
+
Style/ExponentialNotation:
|
81
|
+
Enabled: true
|
82
|
+
|
83
|
+
Style/FrozenStringLiteralComment:
|
84
|
+
Enabled: false
|
85
|
+
|
86
|
+
Style/GuardClause:
|
87
|
+
Enabled: false
|
88
|
+
|
89
|
+
Style/HashEachMethods:
|
90
|
+
Enabled: true
|
91
|
+
|
92
|
+
Style/HashTransformKeys:
|
93
|
+
Enabled: true
|
94
|
+
|
95
|
+
Style/HashTransformValues:
|
96
|
+
Enabled: true
|
97
|
+
|
98
|
+
Style/IfUnlessModifier:
|
99
|
+
Enabled: false
|
100
|
+
|
101
|
+
Style/MultilineBlockChain:
|
102
|
+
Enabled: false
|
103
|
+
|
104
|
+
Style/MultilineIfModifier:
|
105
|
+
Enabled: false
|
106
|
+
|
107
|
+
Style/MutableConstant:
|
108
|
+
Enabled: false
|
109
|
+
|
110
|
+
Style/RedundantRegexpCharacterClass:
|
111
|
+
Enabled: true
|
112
|
+
|
113
|
+
Style/RedundantRegexpEscape:
|
114
|
+
Enabled: true
|
115
|
+
|
116
|
+
Style/RescueModifier:
|
117
|
+
Enabled: false
|
118
|
+
|
119
|
+
Style/RescueStandardError:
|
120
|
+
Enabled: false
|
121
|
+
|
122
|
+
Style/SlicingWithRange:
|
123
|
+
Enabled: true
|
124
|
+
|
125
|
+
Style/TrailingCommaInArguments:
|
126
|
+
EnforcedStyleForMultiline: comma
|
127
|
+
|
128
|
+
Style/TrailingCommaInArrayLiteral:
|
129
|
+
EnforcedStyleForMultiline: consistent_comma
|
130
|
+
|
131
|
+
Style/TrailingCommaInHashLiteral:
|
132
|
+
EnforcedStyleForMultiline: consistent_comma
|
133
|
+
|
134
|
+
Style/ZeroLengthPredicate:
|
135
|
+
Enabled: false
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby-2.5.1
|
data/.travis.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
env:
|
2
|
+
global:
|
3
|
+
- CC_TEST_REPORTER_ID=35dd438da367278dc0d80cb7e21b3b0cd6eb207f60890a05c3fe1f0882981e64
|
4
|
+
language: ruby
|
5
|
+
rvm:
|
6
|
+
- 2.5.1
|
7
|
+
before_install: gem install bundler -v 1.17.3
|
8
|
+
before_script:
|
9
|
+
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
10
|
+
- chmod +x ./cc-test-reporter
|
11
|
+
- ./cc-test-reporter before-build
|
12
|
+
script:
|
13
|
+
- bundle exec rspec
|
14
|
+
after_script:
|
15
|
+
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include:
|
14
|
+
|
15
|
+
* The use of sexualized language or imagery
|
16
|
+
* Personal attacks
|
17
|
+
* Trolling or insulting/derogatory comments
|
18
|
+
* Public or private harassment
|
19
|
+
* Publishing others' private information, such as physical or electronic
|
20
|
+
addresses, without explicit permission
|
21
|
+
* Other unethical or unprofessional conduct
|
22
|
+
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
+
threatening, offensive, or harmful.
|
28
|
+
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
+
Conduct may be permanently removed from the project team.
|
33
|
+
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
35
|
+
when an individual is representing the project or its community.
|
36
|
+
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
+
reported by contacting a project maintainer at david@bloomfire.com. All
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
+
incident.
|
43
|
+
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
+
version 1.3.0, available at
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
+
|
48
|
+
[homepage]: http://contributor-covenant.org
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2020 David McCullars
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Ruby Splitta
|
2
|
+
|
3
|
+
* README: https://github.com/david-mccullars/ruby-splitta
|
4
|
+
* Documentation: http://www.rubydoc.info/github/david-mccullars/ruby-splitta
|
5
|
+
* Bug Reports: https://github.com/david-mccullars/ruby-splitta/issues
|
6
|
+
|
7
|
+
|
8
|
+
## Status
|
9
|
+
|
10
|
+
[![Gem Version](https://badge.fury.io/rb/splitta.svg)](https://badge.fury.io/rb/splitta)
|
11
|
+
[![Travis Build Status](https://travis-ci.org/david-mccullars/ruby-splitta.svg?branch=master)](https://travis-ci.org/david-mccullars/ruby-splitta)
|
12
|
+
[![Code Climate](https://codeclimate.com/github/david-mccullars/ruby-splitta/badges/gpa.svg)](https://codeclimate.com/github/david-mccullars/ruby-splitta)
|
13
|
+
[![Test Coverage](https://codeclimate.com/github/david-mccullars/ruby-splitta/badges/coverage.svg)](https://codeclimate.com/github/david-mccullars/ruby-splitta/coverage)
|
14
|
+
|
15
|
+
## Description
|
16
|
+
|
17
|
+
[Splitta](https://code.google.com/archive/p/splitta/) includes proper
|
18
|
+
tokenization and models for very high accuracy sentence boundary detection
|
19
|
+
(English only for now). The models are trained from Wall Street Journal news
|
20
|
+
combined with the Brown Corpus which is intended to be widely representative of
|
21
|
+
written English. Error rates on test news data are near 0.25%.
|
22
|
+
|
23
|
+
## Installation
|
24
|
+
|
25
|
+
```
|
26
|
+
gem install splitta
|
27
|
+
```
|
28
|
+
|
29
|
+
## Requirements
|
30
|
+
|
31
|
+
* Ruby 2.5.1 or higher
|
32
|
+
|
33
|
+
## Usage
|
34
|
+
|
35
|
+
```ruby
|
36
|
+
require 'splitta'
|
37
|
+
|
38
|
+
Splitta.sentences("Some text goes here.")
|
39
|
+
```
|
40
|
+
|
41
|
+
## License
|
42
|
+
|
43
|
+
MIT. See the `LICENSE` file.
|
44
|
+
|
45
|
+
## References
|
46
|
+
|
47
|
+
> Dan Gillick, “Sentence Boundary Detection and the Problem with the U.S.” at NAACL 2009, http://dgillick.com/resource/sbd_naacl_2009.pdf
|
data/Rakefile
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
|
4
|
+
RSpec::Core::RakeTask.new(:spec)
|
5
|
+
|
6
|
+
task :default => :spec
|
7
|
+
|
8
|
+
require 'rdoc/task'
|
9
|
+
RDoc::Task.new do |rdoc|
|
10
|
+
rdoc.main = "README.md"
|
11
|
+
rdoc.rdoc_files.include("README.md", "lib/**/*.rb")
|
12
|
+
end
|
13
|
+
|
14
|
+
def unpickle(file)
|
15
|
+
require 'rubypython'
|
16
|
+
|
17
|
+
RubyPython.run do
|
18
|
+
cPickle = import 'cPickle'
|
19
|
+
gzip = import 'gzip'
|
20
|
+
|
21
|
+
io = gzip.open(file, 'rb')
|
22
|
+
data = cPickle.load(io)
|
23
|
+
io.close()
|
24
|
+
|
25
|
+
data = data.rubify
|
26
|
+
if data.keys.first.is_a?(RubyPython::Tuple)
|
27
|
+
data = data.each_with_object({}) do |(k, v), h|
|
28
|
+
h[k.to_a] = v
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
return data
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def gzip_dump(file, obj)
|
37
|
+
require 'fileutils'
|
38
|
+
require 'zlib'
|
39
|
+
|
40
|
+
FileUtils.mkdir_p(File.dirname(file))
|
41
|
+
Zlib::GzipWriter.open(file) do |gz|
|
42
|
+
gz.write(Marshal.dump(obj))
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
task :unpickle, [:file] do |_t, args|
|
47
|
+
files = Dir[args[:file] || 'data/src/**/*'].select do |f|
|
48
|
+
File.file?(f)
|
49
|
+
end
|
50
|
+
|
51
|
+
files.each do |src|
|
52
|
+
puts "Unpickling #{src} ..."
|
53
|
+
gzip_dump(src.sub('/src/', '/'), unpickle(src))
|
54
|
+
end
|
55
|
+
end
|
data/data/feats
ADDED
Binary file
|
data/data/lower_words
ADDED
Binary file
|
data/data/non_abbrs
ADDED
Binary file
|
data/data/src/feats
ADDED
Binary file
|
data/data/src/lower_words
ADDED
Binary file
|
data/data/src/non_abbrs
ADDED
Binary file
|
data/lib/splitta.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
require 'zlib'
|
3
|
+
|
4
|
+
##
|
5
|
+
# Provides convenience methods for splitting text into sentences.
|
6
|
+
#
|
7
|
+
# @see README
|
8
|
+
##
|
9
|
+
module Splitta
|
10
|
+
|
11
|
+
autoload :Doc, 'splitta/doc'
|
12
|
+
autoload :Frag, 'splitta/frag'
|
13
|
+
autoload :Model, 'splitta/model'
|
14
|
+
autoload :VERSION, 'splitta/version'
|
15
|
+
autoload :WordTokenizer, 'splitta/word_tokenizer'
|
16
|
+
|
17
|
+
def self.sentences(text)
|
18
|
+
Doc.new(text, model: Model.instance).segments.map(&:strip)
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
data/lib/splitta/doc.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#
|
2
|
+
# A Document points to a collection of Frags
|
3
|
+
#
|
4
|
+
module Splitta
|
5
|
+
class Doc
|
6
|
+
|
7
|
+
FRAG_SPLITTER = /
|
8
|
+
(
|
9
|
+
[.!?] # sentence end punctuation
|
10
|
+
(?:
|
11
|
+
(?:<.*>) # extra tag
|
12
|
+
|
|
13
|
+
[”"')\]}] # right-handed punctuation to retain
|
14
|
+
)*
|
15
|
+
\s+ # must have whitespace
|
16
|
+
)
|
17
|
+
/ux
|
18
|
+
|
19
|
+
SEGMENT_THRESHOLD = 0.5
|
20
|
+
|
21
|
+
attr_reader :frags
|
22
|
+
|
23
|
+
def initialize(text, model:)
|
24
|
+
@frags = []
|
25
|
+
text.split(FRAG_SPLITTER).each_slice(2) do |frag_text|
|
26
|
+
frag = Frag.new(frag_text.join, previous_frag: @frags.last)
|
27
|
+
@frags << frag
|
28
|
+
end
|
29
|
+
model.classify(self)
|
30
|
+
end
|
31
|
+
|
32
|
+
#
# Output all the text, split according to predictions.
#
# Returns an Enumerator. Fragment text (frag.orig) is accumulated in order;
# each time a fragment's prediction is over SEGMENT_THRESHOLD the text
# accumulated so far is yielded as one segment and accumulation restarts.
# Any non-empty remainder after the last fragment is yielded as a final
# segment.
#
def segments
  Enumerator.new do |y|
    # A plain mutable String is used as the buffer so this method does not
    # depend on StringIO having been required somewhere else.
    buffer = +''
    frags.each do |frag|
      buffer << frag.orig
      next unless frag.over?(SEGMENT_THRESHOLD)

      y << buffer
      # Start a fresh String so the segment just yielded is never mutated.
      buffer = +''
    end
    y << buffer unless buffer.empty?
  end
end
|
48
|
+
|
49
|
+
end
|
50
|
+
end
|
data/lib/splitta/frag.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
#
|
2
|
+
# A fragment of text that ends with a possible sentence boundary
|
3
|
+
#
|
4
|
+
module Splitta
|
5
|
+
class Frag
|
6
|
+
|
7
|
+
include WordTokenizer
|
8
|
+
|
9
|
+
attr_reader :orig, :last_word, :next_word
|
10
|
+
attr_accessor :pred
|
11
|
+
|
12
|
+
def initialize(orig, previous_frag: nil)
|
13
|
+
words = clean(orig).split
|
14
|
+
previous_frag.next_word = words.first if previous_frag
|
15
|
+
@orig = orig
|
16
|
+
@last_word = words.last
|
17
|
+
end
|
18
|
+
|
19
|
+
# ... w1. (sb?) w2 ...
|
20
|
+
# Features, listed roughly in order of importance:
|
21
|
+
#
|
22
|
+
# (1) w1: word that includes a period
|
23
|
+
# (2) w2: the next word, if it exists
|
24
|
+
# (3) w1length: number of alphabetic characters in w1
|
25
|
+
# (4) w2cap: true if w2 is capitalized
|
26
|
+
# (5) both: w1 and w2
|
27
|
+
# (6) w1abbr: log count of w1 in training without a final period
|
28
|
+
# (7) w2lower: log count of w2 in training as lowercased
|
29
|
+
# (8) w1w2upper: w1 combined with whether w2 is capitalized
|
30
|
+
def features(model)
|
31
|
+
Enumerator.new do |y|
|
32
|
+
y << [:w1, w1]
|
33
|
+
y << [:w2, w2]
|
34
|
+
y << [:both, w1, w2]
|
35
|
+
|
36
|
+
if alphabetic?(w1)
|
37
|
+
y << [:w1length, w1length]
|
38
|
+
y << [:w1abbr, w1abbr(model)]
|
39
|
+
end
|
40
|
+
|
41
|
+
if alphabetic?(w2)
|
42
|
+
y << [:w2cap, w2cap]
|
43
|
+
y << [:w2lower, w2lower(model)]
|
44
|
+
y << [:w1w2upper, w1, w2cap]
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def over?(threshold)
|
50
|
+
!!pred && pred > threshold
|
51
|
+
end
|
52
|
+
|
53
|
+
protected
|
54
|
+
|
55
|
+
attr_writer :next_word
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
# normalize numbers, discard some punctuation that can be ambiguous
|
60
|
+
def clean(text)
|
61
|
+
text = tokenize(text)
|
62
|
+
text.gsub!(/[.,\d]*\d/, '<NUM>')
|
63
|
+
text.gsub!(%r{[^a-zA-Z0-9,.;:<>\-'/?!$% ]}, '')
|
64
|
+
text.gsub!('--', ' ')
|
65
|
+
text
|
66
|
+
end
|
67
|
+
|
68
|
+
def w1
|
69
|
+
@w1 ||= last_word&.sub(/(^.+?-)/, '')
|
70
|
+
end
|
71
|
+
|
72
|
+
def w2
|
73
|
+
@w2 ||= next_word&.sub(/(-.+?)$/, '')
|
74
|
+
end
|
75
|
+
|
76
|
+
# Length feature for w1: the number of word characters in w1, capped at 10.
#
# Every non-word character (periods, hyphens, ...) is stripped before
# counting; `gsub` is required here because `sub` would remove only the
# first one (e.g. "U.S." must count as 2, not 3).
def w1length
  [10, w1.gsub(/\W/, '').length].min
end
|
79
|
+
|
80
|
+
def w1abbr(model)
|
81
|
+
Math.log(1 + model.non_abbrs.fetch(w1.chop, 0.0)).to_i
|
82
|
+
end
|
83
|
+
|
84
|
+
def w2cap
|
85
|
+
upcase?(w2.chars.first) ? 'True' : 'False'
|
86
|
+
end
|
87
|
+
|
88
|
+
def w2lower(model)
|
89
|
+
Math.log(1 + model.lower_words.fetch(w2.downcase, 0.0)).to_i
|
90
|
+
end
|
91
|
+
|
92
|
+
def alphabetic?(str)
|
93
|
+
!!/[a-zA-Z. ]+/u.match(str)
|
94
|
+
end
|
95
|
+
|
96
|
+
def upcase?(str)
|
97
|
+
str.upcase == str
|
98
|
+
end
|
99
|
+
|
100
|
+
end
|
101
|
+
end
|
data/lib/splitta/model.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
#
|
2
|
+
# Naive Bayes model, with a few tweaks:
|
3
|
+
# - all feature types are pooled together for normalization (this might help
|
4
|
+
# because the independence assumption is so broken for our features)
|
5
|
+
# - smoothing: add 0.1 to all counts
|
6
|
+
# - priors are modified for better performance (this is mysterious but works much better)
|
7
|
+
#
|
8
|
+
module Splitta
|
9
|
+
class Model
|
10
|
+
|
11
|
+
include Singleton
|
12
|
+
|
13
|
+
LABELS = [0, 1]
|
14
|
+
|
15
|
+
attr_reader :feats, :lower_words, :non_abbrs, :prior_probs
|
16
|
+
|
17
|
+
def initialize
|
18
|
+
@feats = model_read(:feats)
|
19
|
+
@lower_words = model_read(:lower_words)
|
20
|
+
@non_abbrs = model_read(:non_abbrs)
|
21
|
+
|
22
|
+
@prior_probs = LABELS.each_with_object({}) do |label, h|
|
23
|
+
h[label] = feats[[label, '<prior>']]**4
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def classify(doc)
|
28
|
+
doc.frags.each do |frag|
|
29
|
+
frag.pred = classify_one(frag)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def inspect
|
34
|
+
"#<Splitta::Model:#{object_id}>"
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def classify_one(frag)
|
40
|
+
probs = prior_probs.dup
|
41
|
+
LABELS.each do |label|
|
42
|
+
frag.features(self).each do |f|
|
43
|
+
key = [label, f.join('_')]
|
44
|
+
next unless feats.include?(key)
|
45
|
+
|
46
|
+
probs[label] *= feats[key]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
normalize(probs).fetch(LABELS.last)
|
50
|
+
end
|
51
|
+
|
52
|
+
def normalize(probs)
|
53
|
+
total = probs.values.reduce(:+).to_f
|
54
|
+
probs.transform_values do |value|
|
55
|
+
value / total
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def model_read(name)
|
60
|
+
Zlib::GzipReader.open(File.join(basedir, name.to_s)) do |gz|
|
61
|
+
Marshal.load(gz)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def basedir
|
66
|
+
File.expand_path('../../data', __dir__)
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
70
|
+
end
|
data/lib/splitta/word_tokenizer.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
#
|
2
|
+
# A list of (regexp, repl) pairs applied in sequence.
|
3
|
+
# The resulting string is split on whitespace.
|
4
|
+
# (Adapted from the Punkt Word Tokenizer)
|
5
|
+
#
|
6
|
+
module Splitta
|
7
|
+
module WordTokenizer
|
8
|
+
|
9
|
+
# Flat list of alternating (regexp, replacement) entries; consumers walk it
# two elements at a time and apply each substitution in order.
TOKENIZE_REGEXPS = [
  # uniform quotes
  /
    '' |
    `` |
    “ |
    ”
  /ux, '"',

  # Separate punctuation (except period) from words:
  /(^|\s)(')/, '\1\2 ',
  /(?<=[("`{\[:;&#*@])(.)/, ' \1', # left-hand punctuation
  /(.)(?=[?!)";}\]*:@'])/, '\1 ', # right-hand punctuation
  /(?<=[)}\]])(.)/, ' \1', # left-hand close paren
  /(.)(?=[({\[])/, '\1 ', # right-hand open paren
  /((^|\s)-)(?=[^-])/, '\1 ', # starting hyphen/minus

  # Treat double-hyphen as one token:
  /([^-])(--+)([^-])/, '\1 \2 \3',

  # Only separate comma if space follows:
  /(\s|^)(,)(?=(\S))/u, '\1\2 ',
  /(.)(,)(\s|$)/u, '\1 \2\3',

  # Combine dots separated by whitespace to be a single token:
  /\.\s\.\s\./u, '...',

  # Separate "No.6"
  /([A-Za-z]\.)(\d+)/, '\1 \2',

  # Separate words from ellipses
  /([^.]|^)(\.{2,})(.?)/, '\1 \2 \3',
  /(^|\s)(\.{2,})([^.\s])/u, '\1\2 \3', # ellipsis glued to the following word
  # Ellipsis glued to the preceding word. This entry previously repeated the
  # pattern of the rule above, so it could never match anything.
  /([^.\s])(\.{2,})($|\s)/u, '\1 \2\3',

  # fix %, $, &
  /(\d)%/, '\1 %',
  /\$(\.?\d)/, '$ \1',
  /(\w)& (\w)/, '\1&\2',
  /(\w\w+)&(\w\w+)/, '\1 & \2',

  # fix (n 't) --> ( n't)
  /n 't( |$)/, ' n\'t\1',
  /N 'T( |$)/, ' N\'T\1',

  # treebank tokenizer special words
  /([Cc])annot/, '\1an not',
  /\s+/, ' ',
]
|
58
|
+
|
59
|
+
#
|
60
|
+
# Tokenize a string using the rules above
|
61
|
+
#
|
62
|
+
def tokenize(text)
|
63
|
+
text = text.dup
|
64
|
+
TOKENIZE_REGEXPS.each_slice(2) do |regexp, repl|
|
65
|
+
text.gsub!(regexp, repl)
|
66
|
+
end
|
67
|
+
text
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
data/splitta.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
lib = File.expand_path('../lib', __FILE__)
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
require 'splitta/version'
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = 'splitta'
|
7
|
+
spec.version = Splitta::VERSION
|
8
|
+
spec.authors = ['David McCullars']
|
9
|
+
spec.email = ['david.mccullars@gmail.com']
|
10
|
+
|
11
|
+
spec.summary = 'Implementation of Splitta in Ruby'
|
12
|
+
spec.description = 'Implementation of Splitta in Ruby. See https://code.google.com/archive/p/splitta/'
|
13
|
+
spec.homepage = 'https://github.com/david-mccullars/ruby-splitta'
|
14
|
+
spec.license = 'MIT'
|
15
|
+
|
16
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
17
|
+
spec.bindir = 'exe'
|
18
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
19
|
+
spec.require_paths = ['lib']
|
20
|
+
|
21
|
+
spec.add_development_dependency 'bundler'
|
22
|
+
spec.add_development_dependency 'rake'
|
23
|
+
spec.add_development_dependency 'rspec'
|
24
|
+
spec.add_development_dependency 'rubocop'
|
25
|
+
spec.add_development_dependency 'rubypython'
|
26
|
+
spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
|
27
|
+
end
|
metadata
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: splitta
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 4.2.5
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David McCullars
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-06-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rubocop
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubypython
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: simplecov
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.17.0
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.17.0
|
97
|
+
description: Implementation of Splitta in Ruby. See https://code.google.com/archive/p/splitta/
|
98
|
+
email:
|
99
|
+
- david.mccullars@gmail.com
|
100
|
+
executables: []
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- ".codeclimate.yml"
|
105
|
+
- ".gitignore"
|
106
|
+
- ".rubocop.yml"
|
107
|
+
- ".ruby-version"
|
108
|
+
- ".travis.yml"
|
109
|
+
- CODE_OF_CONDUCT.md
|
110
|
+
- Gemfile
|
111
|
+
- LICENSE
|
112
|
+
- README.md
|
113
|
+
- Rakefile
|
114
|
+
- data/feats
|
115
|
+
- data/lower_words
|
116
|
+
- data/non_abbrs
|
117
|
+
- data/src/feats
|
118
|
+
- data/src/lower_words
|
119
|
+
- data/src/non_abbrs
|
120
|
+
- lib/splitta.rb
|
121
|
+
- lib/splitta/doc.rb
|
122
|
+
- lib/splitta/frag.rb
|
123
|
+
- lib/splitta/model.rb
|
124
|
+
- lib/splitta/version.rb
|
125
|
+
- lib/splitta/word_tokenizer.rb
|
126
|
+
- splitta.gemspec
|
127
|
+
homepage: https://github.com/david-mccullars/ruby-splitta
|
128
|
+
licenses:
|
129
|
+
- MIT
|
130
|
+
metadata: {}
|
131
|
+
post_install_message:
|
132
|
+
rdoc_options: []
|
133
|
+
require_paths:
|
134
|
+
- lib
|
135
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
136
|
+
requirements:
|
137
|
+
- - ">="
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: '0'
|
140
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - ">="
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '0'
|
145
|
+
requirements: []
|
146
|
+
rubyforge_project:
|
147
|
+
rubygems_version: 2.7.6
|
148
|
+
signing_key:
|
149
|
+
specification_version: 4
|
150
|
+
summary: Implementation of Splitta in Ruby
|
151
|
+
test_files: []
|