splitta 4.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +20 -0
- data/.gitignore +10 -0
- data/.rubocop.yml +135 -0
- data/.ruby-version +1 -0
- data/.travis.yml +15 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +3 -0
- data/LICENSE +21 -0
- data/README.md +47 -0
- data/Rakefile +55 -0
- data/data/feats +0 -0
- data/data/lower_words +0 -0
- data/data/non_abbrs +0 -0
- data/data/src/feats +0 -0
- data/data/src/lower_words +0 -0
- data/data/src/non_abbrs +0 -0
- data/lib/splitta.rb +21 -0
- data/lib/splitta/doc.rb +50 -0
- data/lib/splitta/frag.rb +101 -0
- data/lib/splitta/model.rb +70 -0
- data/lib/splitta/version.rb +6 -0
- data/lib/splitta/word_tokenizer.rb +71 -0
- data/splitta.gemspec +27 -0
- metadata +151 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 6d8e46cae786dd0a359ae4ba07fc88991c8655957635a12e24acc73954005a9c
|
|
4
|
+
data.tar.gz: 835385acb2401790fd2c077607c77b7a45621e02317b8b83fd7cff869898aefc
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 698ac3cbdaf4671b86f931e963c2b1bd7cbb193c381ac4ffb5817b66a6849a8da8a28c73ce2a1a322e6891736765802a80f1e7a43da4ff977e998b97e5adb392
|
|
7
|
+
data.tar.gz: f959bd4dd3a79f687321aafc78fd1e20e8f39964610da758779d7b52038232b3f41e150ea5c11b197b8bd58313c048a956a069eeb5a89e8b4bd3477b2bbe9cd7
|
data/.codeclimate.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
engines:
|
|
3
|
+
duplication:
|
|
4
|
+
enabled: true
|
|
5
|
+
config:
|
|
6
|
+
languages:
|
|
7
|
+
- ruby
|
|
8
|
+
checks:
|
|
9
|
+
Similar code:
|
|
10
|
+
enabled: false
|
|
11
|
+
fixme:
|
|
12
|
+
enabled: true
|
|
13
|
+
rubocop:
|
|
14
|
+
enabled: true
|
|
15
|
+
channel: rubocop-0-85
|
|
16
|
+
ratings:
|
|
17
|
+
paths:
|
|
18
|
+
- "**.rb"
|
|
19
|
+
exclude_paths:
|
|
20
|
+
- spec/
|
data/.gitignore
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
Exclude:
|
|
3
|
+
- splitta.gemspec
|
|
4
|
+
- Rakefile
|
|
5
|
+
|
|
6
|
+
Layout/EmptyLinesAroundAttributeAccessor:
|
|
7
|
+
Enabled: true
|
|
8
|
+
|
|
9
|
+
Layout/EmptyLinesAroundClassBody:
|
|
10
|
+
EnforcedStyle: empty_lines_except_namespace
|
|
11
|
+
|
|
12
|
+
Layout/EmptyLinesAroundModuleBody:
|
|
13
|
+
EnforcedStyle: empty_lines_except_namespace
|
|
14
|
+
|
|
15
|
+
Layout/ExtraSpacing:
|
|
16
|
+
Enabled: false
|
|
17
|
+
|
|
18
|
+
Layout/LineLength:
|
|
19
|
+
Max: 120
|
|
20
|
+
Enabled: false
|
|
21
|
+
|
|
22
|
+
Layout/SpaceAroundMethodCallOperator:
|
|
23
|
+
Enabled: true
|
|
24
|
+
|
|
25
|
+
Lint/DeprecatedOpenSSLConstant:
|
|
26
|
+
Enabled: true
|
|
27
|
+
|
|
28
|
+
Lint/MixedRegexpCaptureTypes:
|
|
29
|
+
Enabled: true
|
|
30
|
+
|
|
31
|
+
Lint/RaiseException:
|
|
32
|
+
Enabled: true
|
|
33
|
+
|
|
34
|
+
Lint/StructNewOverride:
|
|
35
|
+
Enabled: true
|
|
36
|
+
|
|
37
|
+
Metrics/AbcSize:
|
|
38
|
+
Max: 50
|
|
39
|
+
Enabled: false
|
|
40
|
+
|
|
41
|
+
Metrics/BlockLength:
|
|
42
|
+
Max: 50
|
|
43
|
+
Enabled: false
|
|
44
|
+
|
|
45
|
+
Metrics/ClassLength:
|
|
46
|
+
Max: 50
|
|
47
|
+
Enabled: false
|
|
48
|
+
|
|
49
|
+
Metrics/CyclomaticComplexity:
|
|
50
|
+
Max: 30
|
|
51
|
+
Enabled: false
|
|
52
|
+
|
|
53
|
+
Metrics/MethodLength:
|
|
54
|
+
Max: 20
|
|
55
|
+
Enabled: false
|
|
56
|
+
|
|
57
|
+
Metrics/ModuleLength:
|
|
58
|
+
Max: 1000
|
|
59
|
+
Enabled: false
|
|
60
|
+
|
|
61
|
+
Metrics/PerceivedComplexity:
|
|
62
|
+
Max: 30
|
|
63
|
+
Enabled: false
|
|
64
|
+
|
|
65
|
+
Security/MarshalLoad:
|
|
66
|
+
Enabled: false
|
|
67
|
+
|
|
68
|
+
Style/AndOr:
|
|
69
|
+
Enabled: false
|
|
70
|
+
|
|
71
|
+
Style/CaseEquality:
|
|
72
|
+
Enabled: false
|
|
73
|
+
|
|
74
|
+
Style/Documentation:
|
|
75
|
+
Enabled: false
|
|
76
|
+
|
|
77
|
+
Style/DoubleNegation:
|
|
78
|
+
Enabled: false
|
|
79
|
+
|
|
80
|
+
Style/ExponentialNotation:
|
|
81
|
+
Enabled: true
|
|
82
|
+
|
|
83
|
+
Style/FrozenStringLiteralComment:
|
|
84
|
+
Enabled: false
|
|
85
|
+
|
|
86
|
+
Style/GuardClause:
|
|
87
|
+
Enabled: false
|
|
88
|
+
|
|
89
|
+
Style/HashEachMethods:
|
|
90
|
+
Enabled: true
|
|
91
|
+
|
|
92
|
+
Style/HashTransformKeys:
|
|
93
|
+
Enabled: true
|
|
94
|
+
|
|
95
|
+
Style/HashTransformValues:
|
|
96
|
+
Enabled: true
|
|
97
|
+
|
|
98
|
+
Style/IfUnlessModifier:
|
|
99
|
+
Enabled: false
|
|
100
|
+
|
|
101
|
+
Style/MultilineBlockChain:
|
|
102
|
+
Enabled: false
|
|
103
|
+
|
|
104
|
+
Style/MultilineIfModifier:
|
|
105
|
+
Enabled: false
|
|
106
|
+
|
|
107
|
+
Style/MutableConstant:
|
|
108
|
+
Enabled: false
|
|
109
|
+
|
|
110
|
+
Style/RedundantRegexpCharacterClass:
|
|
111
|
+
Enabled: true
|
|
112
|
+
|
|
113
|
+
Style/RedundantRegexpEscape:
|
|
114
|
+
Enabled: true
|
|
115
|
+
|
|
116
|
+
Style/RescueModifier:
|
|
117
|
+
Enabled: false
|
|
118
|
+
|
|
119
|
+
Style/RescueStandardError:
|
|
120
|
+
Enabled: false
|
|
121
|
+
|
|
122
|
+
Style/SlicingWithRange:
|
|
123
|
+
Enabled: true
|
|
124
|
+
|
|
125
|
+
Style/TrailingCommaInArguments:
|
|
126
|
+
EnforcedStyleForMultiline: comma
|
|
127
|
+
|
|
128
|
+
Style/TrailingCommaInArrayLiteral:
|
|
129
|
+
EnforcedStyleForMultiline: consistent_comma
|
|
130
|
+
|
|
131
|
+
Style/TrailingCommaInHashLiteral:
|
|
132
|
+
EnforcedStyleForMultiline: consistent_comma
|
|
133
|
+
|
|
134
|
+
Style/ZeroLengthPredicate:
|
|
135
|
+
Enabled: false
|
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ruby-2.5.1
|
data/.travis.yml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
env:
|
|
2
|
+
global:
|
|
3
|
+
- CC_TEST_REPORTER_ID=35dd438da367278dc0d80cb7e21b3b0cd6eb207f60890a05c3fe1f0882981e64
|
|
4
|
+
language: ruby
|
|
5
|
+
rvm:
|
|
6
|
+
- 2.5.1
|
|
7
|
+
before_install: gem install bundler -v 1.17.3
|
|
8
|
+
before_script:
|
|
9
|
+
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
|
10
|
+
- chmod +x ./cc-test-reporter
|
|
11
|
+
- ./cc-test-reporter before-build
|
|
12
|
+
script:
|
|
13
|
+
- bundle exec rspec
|
|
14
|
+
after_script:
|
|
15
|
+
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|
data/CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Contributor Code of Conduct
|
|
2
|
+
|
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
|
7
|
+
|
|
8
|
+
We are committed to making participation in this project a harassment-free
|
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
|
12
|
+
|
|
13
|
+
Examples of unacceptable behavior by participants include:
|
|
14
|
+
|
|
15
|
+
* The use of sexualized language or imagery
|
|
16
|
+
* Personal attacks
|
|
17
|
+
* Trolling or insulting/derogatory comments
|
|
18
|
+
* Public or private harassment
|
|
19
|
+
* Publishing others' private information, such as physical or electronic
|
|
20
|
+
addresses, without explicit permission
|
|
21
|
+
* Other unethical or unprofessional conduct
|
|
22
|
+
|
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
|
27
|
+
threatening, offensive, or harmful.
|
|
28
|
+
|
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
|
32
|
+
Conduct may be permanently removed from the project team.
|
|
33
|
+
|
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
|
35
|
+
when an individual is representing the project or its community.
|
|
36
|
+
|
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
|
38
|
+
reported by contacting a project maintainer at david@bloomfire.com. All
|
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
|
42
|
+
incident.
|
|
43
|
+
|
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
|
45
|
+
version 1.3.0, available at
|
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
|
47
|
+
|
|
48
|
+
[homepage]: http://contributor-covenant.org
|
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2020 David McCullars
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Ruby Splitta
|
|
2
|
+
|
|
3
|
+
* README: https://github.com/david-mccullars/ruby-splitta
|
|
4
|
+
* Documentation: http://www.rubydoc.info/github/david-mccullars/ruby-splitta
|
|
5
|
+
* Bug Reports: https://github.com/david-mccullars/ruby-splitta/issues
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
## Status
|
|
9
|
+
|
|
10
|
+
[Gem Version](https://badge.fury.io/rb/splitta)
|
|
11
|
+
[Build Status](https://travis-ci.org/david-mccullars/ruby-splitta)
|
|
12
|
+
[Code Climate](https://codeclimate.com/github/david-mccullars/ruby-splitta)
|
|
13
|
+
[Test Coverage](https://codeclimate.com/github/david-mccullars/ruby-splitta/coverage)
|
|
14
|
+
|
|
15
|
+
## Description
|
|
16
|
+
|
|
17
|
+
[Splitta](https://code.google.com/archive/p/splitta/) includes proper
|
|
18
|
+
tokenization and models for very high accuracy sentence boundary detection
|
|
19
|
+
(English only for now). The models are trained from Wall Street Journal news
|
|
20
|
+
combined with the Brown Corpus which is intended to be widely representative of
|
|
21
|
+
written English. Error rates on test news data are near 0.25%.
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
gem install splitta
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Requirements
|
|
30
|
+
|
|
31
|
+
* Ruby 2.5.1 or higher
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
```ruby
|
|
36
|
+
require 'splitta'
|
|
37
|
+
|
|
38
|
+
Splitta.sentences("Some text goes here.")
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## License
|
|
42
|
+
|
|
43
|
+
MIT. See the `LICENSE` file.
|
|
44
|
+
|
|
45
|
+
## References
|
|
46
|
+
|
|
47
|
+
> Dan Gillick, “Sentence Boundary Detection and the Problem with the U.S.” at NAACL 2009, http://dgillick.com/resource/sbd_naacl_2009.pdf
|
data/Rakefile
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
require "bundler/gem_tasks"
|
|
2
|
+
require "rspec/core/rake_task"
|
|
3
|
+
|
|
4
|
+
# `rake spec` runs the RSpec suite; a bare `rake` does the same.
RSpec::Core::RakeTask.new(:spec)
task default: :spec

# `rake rdoc` renders the README together with every Ruby file under lib/.
require 'rdoc/task'
RDoc::Task.new do |doc_task|
  doc_task.main = "README.md"
  doc_task.rdoc_files.include("README.md", "lib/**/*.rb")
end
|
|
13
|
+
|
|
14
|
+
# Reads a gzip-compressed Python pickle and converts it to Ruby data.
#
# The file is opened and unpickled with Python's `gzip` and `cPickle`
# modules, reached through the `rubypython` gem.
#
# @param file [String] path of the gzipped pickle to read
# @return [Object] the rubified pickle contents; when the first key of the
#   result is a RubyPython::Tuple, every key is replaced by `key.to_a`
def unpickle(file)
  require 'rubypython'

  RubyPython.run do
    cPickle = import 'cPickle'
    gzip = import 'gzip'

    io = gzip.open(file, 'rb')
    begin
      data = cPickle.load(io)
    ensure
      # Close the Python file handle even when unpickling raises.
      io.close()
    end

    data = data.rubify
    if data.keys.first.is_a?(RubyPython::Tuple)
      # Tuple keys are converted with to_a before the data is returned.
      data = data.each_with_object({}) do |(k, v), h|
        h[k.to_a] = v
      end
    end

    return data
  end
end
|
|
35
|
+
|
|
36
|
+
# Marshals +obj+ and stores it gzip-compressed at +file+, creating the
# parent directory first when it does not exist yet.
#
# @param file [String] destination path
# @param obj [Object] any object Marshal can dump
def gzip_dump(file, obj)
  require 'fileutils'
  require 'zlib'

  parent_dir = File.dirname(file)
  FileUtils.mkdir_p(parent_dir)

  Zlib::GzipWriter.open(file) { |writer| writer.write(Marshal.dump(obj)) }
end
|
|
45
|
+
|
|
46
|
+
# rake unpickle[glob]
#
# Converts every regular file matching the glob (default: everything under
# data/src) and writes the result to the same path with its first "/src/"
# replaced by "/".
task :unpickle, [:file] do |_t, args|
  pattern = args[:file] || 'data/src/**/*'
  sources = Dir[pattern].select { |path| File.file?(path) }

  sources.each do |src|
    puts "Unpickling #{src} ..."
    destination = src.sub('/src/', '/')
    gzip_dump(destination, unpickle(src))
  end
end
|
data/data/feats
ADDED
|
Binary file
|
data/data/lower_words
ADDED
|
Binary file
|
data/data/non_abbrs
ADDED
|
Binary file
|
data/data/src/feats
ADDED
|
Binary file
|
data/data/src/lower_words
ADDED
|
Binary file
|
data/data/src/non_abbrs
ADDED
|
Binary file
|
data/lib/splitta.rb
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'singleton'
|
|
2
|
+
require 'zlib'
|
|
3
|
+
|
|
4
|
+
##
# Entry point of the gem: registers the Splitta components for autoloading
# and offers a one-call helper for splitting text into sentences.
#
# @see README
##
module Splitta

  autoload :Doc, 'splitta/doc'
  autoload :Frag, 'splitta/frag'
  autoload :Model, 'splitta/model'
  autoload :VERSION, 'splitta/version'
  autoload :WordTokenizer, 'splitta/word_tokenizer'

  class << self

    # Splits +text+ into sentences using the shared Model instance.
    #
    # @param text [String] the text to segment
    # @return [Array<String>] the segments, each with surrounding whitespace stripped
    def sentences(text)
      document = Doc.new(text, model: Model.instance)
      document.segments.map(&:strip)
    end

  end

end
|
data/lib/splitta/doc.rb
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# StringIO is used by Doc#segments and is not loaded by default.
require 'stringio'

#
# A Document points to a collection of Frags
#
module Splitta
  class Doc

    # Matches a candidate boundary: end punctuation, any run of trailing
    # tags / closing punctuation, then whitespace. The whole match is captured
    # so that String#split keeps it in its result.
    #
    # NOTE(review): `<.*>` is greedy and may swallow text between two tags on
    # the same line - confirm whether that is intended.
    FRAG_SPLITTER = /
      (
        [.!?]              # sentence end punctuation
        (?:
          (?:<.*>)         # extra tag
          |
          [”"')\]}]        # right-handed punctuation to retain
        )*
        \s+                # must have whitespace
      )
    /ux

    # A frag whose prediction exceeds this value ends a segment.
    SEGMENT_THRESHOLD = 0.5

    attr_reader :frags

    # Splits +text+ into Frags and asks +model+ to classify them.
    #
    # @param text [String] the text to segment
    # @param model [#classify] receives this Doc once all frags are built
    def initialize(text, model:)
      @frags = []
      # split returns text and captured separator alternately; each pair is
      # joined back together so a frag keeps its trailing punctuation.
      text.split(FRAG_SPLITTER).each_slice(2) do |frag_text|
        frag = Frag.new(frag_text.join, previous_frag: @frags.last)
        @frags << frag
      end
      model.classify(self)
    end

    #
    # output all the text, split according to predictions
    #
    # @return [Enumerator] yields one String per segment; text after the last
    #   boundary is yielded as a final segment when it is not empty
    def segments
      Enumerator.new do |y|
        io = StringIO.new
        frags.each do |frag|
          io << frag.orig
          if frag.over?(SEGMENT_THRESHOLD)
            y << io.string
            io.string = ''
          end
        end
        y << io.string unless io.string.empty?
      end
    end

  end
end
|
data/lib/splitta/frag.rb
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
#
# A fragment of text that ends with a possible sentence boundary
#
module Splitta
  class Frag

    include WordTokenizer

    attr_reader :orig, :last_word, :next_word
    attr_accessor :pred

    # @param orig [String] the raw fragment text
    # @param previous_frag [Frag, nil] the fragment before this one; its
    #   next_word is set to the first cleaned word of this fragment
    def initialize(orig, previous_frag: nil)
      words = clean(orig).split
      previous_frag.next_word = words.first if previous_frag
      @orig = orig
      @last_word = words.last
    end

    # ... w1. (sb?) w2 ...
    # Features, listed roughly in order of importance:
    #
    # (1) w1: word that includes a period
    # (2) w2: the next word, if it exists
    # (3) w1length: number of alphabetic characters in w1
    # (4) w2cap: true if w2 is capitalized
    # (5) both: w1 and w2
    # (6) w1abbr: log count of w1 in training without a final period
    # (7) w2lower: log count of w2 in training as lowercased
    # (8) w1w2upper: w1 and whether w2 is capitalized
    #
    # @param model [#non_abbrs, #lower_words] supplies the count tables
    # @return [Enumerator] yields one Array per feature: the feature name
    #   followed by its value(s)
    def features(model)
      Enumerator.new do |y|
        y << [:w1, w1]
        y << [:w2, w2]
        y << [:both, w1, w2]

        if alphabetic?(w1)
          y << [:w1length, w1length]
          y << [:w1abbr, w1abbr(model)]
        end

        if alphabetic?(w2)
          y << [:w2cap, w2cap]
          y << [:w2lower, w2lower(model)]
          y << [:w1w2upper, w1, w2cap]
        end
      end
    end

    # @return [Boolean] true when a prediction is set and exceeds +threshold+
    def over?(threshold)
      !!pred && pred > threshold
    end

    protected

    # Written by the following Frag from its initializer.
    attr_writer :next_word

    private

    # normalize numbers, discard some punctuation that can be ambiguous
    def clean(text)
      text = tokenize(text)
      text.gsub!(/[.,\d]*\d/, '<NUM>')
      text.gsub!(%r{[^a-zA-Z0-9,.;:<>\-'/?!$% ]}, '')
      text.gsub!('--', ' ')
      text
    end

    # Last word of this fragment with everything up to its first hyphen removed.
    def w1
      @w1 ||= last_word&.sub(/(^.+?-)/, '')
    end

    # Next fragment's first word with a trailing hyphenated part removed.
    def w2
      @w2 ||= next_word&.sub(/(-.+?)$/, '')
    end

    # Number of word characters in w1, capped at 10. gsub (not sub) is needed
    # so that every non-word character is dropped, e.g. "U.S." counts as 2.
    def w1length
      [10, w1.gsub(/\W/, '').length].min
    end

    # Truncated natural log of 1 + the non_abbrs count for w1 without its
    # final character (0 when the word is not in the table).
    def w1abbr(model)
      Math.log(1 + model.non_abbrs.fetch(w1.chop, 0.0)).to_i
    end

    # 'True' / 'False' strings, joined verbatim into the model's feature keys.
    def w2cap
      upcase?(w2.chars.first) ? 'True' : 'False'
    end

    # Truncated natural log of 1 + the lower_words count for downcased w2.
    def w2lower(model)
      Math.log(1 + model.lower_words.fetch(w2.downcase, 0.0)).to_i
    end

    # NOTE(review): the pattern is unanchored, so this is true for any string
    # that merely contains a letter, dot or space (e.g. "<NUM>") - confirm
    # whether a whole-string match was intended.
    def alphabetic?(str)
      !!/[a-zA-Z. ]+/u.match(str)
    end

    # True when upcasing leaves +str+ unchanged (also the case for characters
    # that have no uppercase form).
    def upcase?(str)
      str.upcase == str
    end

  end
end
|
data/lib/splitta/model.rb
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
#
# Naive Bayes model, with a few tweaks:
# - all feature types are pooled together for normalization (this might help
#   because the independence assumption is so broken for our features)
# - smoothing: add 0.1 to all counts
# - priors are modified for better performance (this is mysterious but works much better)
#
module Splitta
  class Model

    include Singleton

    # Class labels; classify_one reports the probability of the last one.
    LABELS = [0, 1]

    attr_reader :feats, :lower_words, :non_abbrs, :prior_probs

    # Loads the three tables from the data directory and derives the priors:
    # the stored '<prior>' entry of each label raised to the 4th power.
    def initialize
      @feats = model_read(:feats)
      @lower_words = model_read(:lower_words)
      @non_abbrs = model_read(:non_abbrs)

      @prior_probs = LABELS.each_with_object({}) do |label, h|
        h[label] = feats[[label, '<prior>']]**4
      end
    end

    # Stores a prediction on every frag of +doc+.
    #
    # @param doc [#frags] document whose frags respond to #features and #pred=
    def classify(doc)
      doc.frags.each do |frag|
        frag.pred = classify_one(frag)
      end
    end

    # Short representation; the default would print the loaded tables.
    def inspect
      "#<Splitta::Model:#{object_id}>"
    end

    private

    # @return [Float] normalized probability of LABELS.last for +frag+
    def classify_one(frag)
      # The feature keys do not depend on the label, so build them once
      # instead of re-running the enumerator for every label.
      feature_names = frag.features(self).map { |f| f.join('_') }

      probs = prior_probs.dup
      LABELS.each do |label|
        feature_names.each do |feature_name|
          key = [label, feature_name]
          # Features missing from the table leave the probability untouched.
          next unless feats.include?(key)

          probs[label] *= feats[key]
        end
      end
      normalize(probs).fetch(LABELS.last)
    end

    # Scales the values of +probs+ so that they sum to 1.
    def normalize(probs)
      total = probs.values.reduce(:+).to_f
      probs.transform_values do |value|
        value / total
      end
    end

    # Reads a gzip-compressed Marshal dump from the data directory.
    # Marshal.load is only applied to files found under #basedir.
    def model_read(name)
      Zlib::GzipReader.open(File.join(basedir, name.to_s)) do |gz|
        Marshal.load(gz)
      end
    end

    # The data directory two levels above this file's directory.
    def basedir
      File.expand_path('../../data', __dir__)
    end

  end
end
|
data/lib/splitta/word_tokenizer.rb
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
#
# A list of (regexp, repl) pairs applied in sequence.
# The resulting string is split on whitespace.
# (Adapted from the Punkt Word Tokenizer)
#
module Splitta
  module WordTokenizer

    # Flat list: each regexp is immediately followed by its replacement.
    TOKENIZE_REGEXPS = [
      # uniform quotes
      /
        '' |
        `` |
        “ |
        ”
      /ux, '"',

      # Separate punctuation (except period) from words:
      /(^|\s)(')/, '\1\2 ',
      /(?<=[("`{\[:;&#*@])(.)/, ' \1', # left-hand punctuation
      /(.)(?=[?!)";}\]*:@'])/, '\1 ', # right-hand punctuation
      /(?<=[)}\]])(.)/, ' \1', # left-hand close paren
      /(.)(?=[({\[])/, '\1 ', # right-hand open paren
      /((^|\s)-)(?=[^-])/, '\1 ', # starting hyphen/minus

      # Treat double-hyphen as one token:
      /([^-])(--+)([^-])/, '\1 \2 \3',

      # Only separate comma if space follows:
      /(\s|^)(,)(?=(\S))/u, '\1\2 ',
      /(.)(,)(\s|$)/u, '\1 \2\3',

      # Combine dots separated by whitespace to be a single token:
      /\.\s\.\s\./u, '...',

      # Separate "No.6"
      /([A-Za-z]\.)(\d+)/, '\1 \2',

      # Separate words from ellipses
      /([^.]|^)(\.{2,})(.?)/, '\1 \2 \3',
      /(^|\s)(\.{2,})([^.\s])/u, '\1\2 \3', # ellipsis glued to the next word
      /([^.\s])(\.{2,})($|\s)/u, '\1 \2\3', # ellipsis glued to the previous word

      # fix %, $, &
      /(\d)%/, '\1 %',
      /\$(\.?\d)/, '$ \1',
      /(\w)& (\w)/, '\1&\2',
      /(\w\w+)&(\w\w+)/, '\1 & \2',

      # fix (n 't) --> ( n't)
      /n 't( |$)/, ' n\'t\1',
      /N 'T( |$)/, ' N\'T\1',

      # treebank tokenizer special words
      /([Cc])annot/, '\1an not',
      /\s+/, ' ',
    ]

    #
    # Tokenize a string using the rules above
    #
    # @param text [String] input text; it is duplicated, not modified
    # @return [String] a new string with every rule applied in list order
    def tokenize(text)
      text = text.dup
      TOKENIZE_REGEXPS.each_slice(2) do |regexp, repl|
        text.gsub!(regexp, repl)
      end
      text
    end

  end
end
|
data/splitta.gemspec
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'splitta/version'

Gem::Specification.new do |spec|
  spec.name          = 'splitta'
  spec.version       = Splitta::VERSION
  spec.authors       = ['David McCullars']
  spec.email         = ['david.mccullars@gmail.com']

  spec.summary       = 'Implementation of Splitta in Ruby'
  spec.description   = 'Implementation of Splitta in Ruby. See https://code.google.com/archive/p/splitta/'
  spec.homepage      = 'https://github.com/david-mccullars/ruby-splitta'
  spec.license       = 'MIT'

  # Enforce the minimum Ruby version the README documents.
  spec.required_ruby_version = '>= 2.5.1'

  # Package every git-tracked file except test, spec and feature directories.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'rubocop'
  spec.add_development_dependency 'rubypython'
  spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
end
|
metadata
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: splitta
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 4.2.5
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- David McCullars
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2020-06-10 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: bundler
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :development
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rake
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
type: :development
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: rspec
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: rubocop
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - ">="
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '0'
|
|
62
|
+
type: :development
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - ">="
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '0'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: rubypython
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - ">="
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '0'
|
|
76
|
+
type: :development
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - ">="
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '0'
|
|
83
|
+
- !ruby/object:Gem::Dependency
|
|
84
|
+
name: simplecov
|
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
|
86
|
+
requirements:
|
|
87
|
+
- - "~>"
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: 0.17.0
|
|
90
|
+
type: :development
|
|
91
|
+
prerelease: false
|
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
+
requirements:
|
|
94
|
+
- - "~>"
|
|
95
|
+
- !ruby/object:Gem::Version
|
|
96
|
+
version: 0.17.0
|
|
97
|
+
description: Implementation of Splitta in Ruby. See https://code.google.com/archive/p/splitta/
|
|
98
|
+
email:
|
|
99
|
+
- david.mccullars@gmail.com
|
|
100
|
+
executables: []
|
|
101
|
+
extensions: []
|
|
102
|
+
extra_rdoc_files: []
|
|
103
|
+
files:
|
|
104
|
+
- ".codeclimate.yml"
|
|
105
|
+
- ".gitignore"
|
|
106
|
+
- ".rubocop.yml"
|
|
107
|
+
- ".ruby-version"
|
|
108
|
+
- ".travis.yml"
|
|
109
|
+
- CODE_OF_CONDUCT.md
|
|
110
|
+
- Gemfile
|
|
111
|
+
- LICENSE
|
|
112
|
+
- README.md
|
|
113
|
+
- Rakefile
|
|
114
|
+
- data/feats
|
|
115
|
+
- data/lower_words
|
|
116
|
+
- data/non_abbrs
|
|
117
|
+
- data/src/feats
|
|
118
|
+
- data/src/lower_words
|
|
119
|
+
- data/src/non_abbrs
|
|
120
|
+
- lib/splitta.rb
|
|
121
|
+
- lib/splitta/doc.rb
|
|
122
|
+
- lib/splitta/frag.rb
|
|
123
|
+
- lib/splitta/model.rb
|
|
124
|
+
- lib/splitta/version.rb
|
|
125
|
+
- lib/splitta/word_tokenizer.rb
|
|
126
|
+
- splitta.gemspec
|
|
127
|
+
homepage: https://github.com/david-mccullars/ruby-splitta
|
|
128
|
+
licenses:
|
|
129
|
+
- MIT
|
|
130
|
+
metadata: {}
|
|
131
|
+
post_install_message:
|
|
132
|
+
rdoc_options: []
|
|
133
|
+
require_paths:
|
|
134
|
+
- lib
|
|
135
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
136
|
+
requirements:
|
|
137
|
+
- - ">="
|
|
138
|
+
- !ruby/object:Gem::Version
|
|
139
|
+
version: '0'
|
|
140
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
141
|
+
requirements:
|
|
142
|
+
- - ">="
|
|
143
|
+
- !ruby/object:Gem::Version
|
|
144
|
+
version: '0'
|
|
145
|
+
requirements: []
|
|
146
|
+
rubyforge_project:
|
|
147
|
+
rubygems_version: 2.7.6
|
|
148
|
+
signing_key:
|
|
149
|
+
specification_version: 4
|
|
150
|
+
summary: Implementation of Splitta in Ruby
|
|
151
|
+
test_files: []
|