splitta 4.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 6d8e46cae786dd0a359ae4ba07fc88991c8655957635a12e24acc73954005a9c
4
+ data.tar.gz: 835385acb2401790fd2c077607c77b7a45621e02317b8b83fd7cff869898aefc
5
+ SHA512:
6
+ metadata.gz: 698ac3cbdaf4671b86f931e963c2b1bd7cbb193c381ac4ffb5817b66a6849a8da8a28c73ce2a1a322e6891736765802a80f1e7a43da4ff977e998b97e5adb392
7
+ data.tar.gz: f959bd4dd3a79f687321aafc78fd1e20e8f39964610da758779d7b52038232b3f41e150ea5c11b197b8bd58313c048a956a069eeb5a89e8b4bd3477b2bbe9cd7
@@ -0,0 +1,20 @@
1
+ ---
2
+ engines:
3
+ duplication:
4
+ enabled: true
5
+ config:
6
+ languages:
7
+ - ruby
8
+ checks:
9
+ Similar code:
10
+ enabled: false
11
+ fixme:
12
+ enabled: true
13
+ rubocop:
14
+ enabled: true
15
+ channel: rubocop-0-85
16
+ ratings:
17
+ paths:
18
+ - "**.rb"
19
+ exclude_paths:
20
+ - spec/
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /html/
7
+ /doc/
8
+ /pkg/
9
+ /spec/reports/
10
+ /tmp/
@@ -0,0 +1,135 @@
1
+ AllCops:
2
+ Exclude:
3
+ - splitta.gemspec
4
+ - Rakefile
5
+
6
+ Layout/EmptyLinesAroundAttributeAccessor:
7
+ Enabled: true
8
+
9
+ Layout/EmptyLinesAroundClassBody:
10
+ EnforcedStyle: empty_lines_except_namespace
11
+
12
+ Layout/EmptyLinesAroundModuleBody:
13
+ EnforcedStyle: empty_lines_except_namespace
14
+
15
+ Layout/ExtraSpacing:
16
+ Enabled: false
17
+
18
+ Layout/LineLength:
19
+ Max: 120
20
+ Enabled: false
21
+
22
+ Layout/SpaceAroundMethodCallOperator:
23
+ Enabled: true
24
+
25
+ Lint/DeprecatedOpenSSLConstant:
26
+ Enabled: true
27
+
28
+ Lint/MixedRegexpCaptureTypes:
29
+ Enabled: true
30
+
31
+ Lint/RaiseException:
32
+ Enabled: true
33
+
34
+ Lint/StructNewOverride:
35
+ Enabled: true
36
+
37
+ Metrics/AbcSize:
38
+ Max: 50
39
+ Enabled: false
40
+
41
+ Metrics/BlockLength:
42
+ Max: 50
43
+ Enabled: false
44
+
45
+ Metrics/ClassLength:
46
+ Max: 50
47
+ Enabled: false
48
+
49
+ Metrics/CyclomaticComplexity:
50
+ Max: 30
51
+ Enabled: false
52
+
53
+ Metrics/MethodLength:
54
+ Max: 20
55
+ Enabled: false
56
+
57
+ Metrics/ModuleLength:
58
+ Max: 1000
59
+ Enabled: false
60
+
61
+ Metrics/PerceivedComplexity:
62
+ Max: 30
63
+ Enabled: false
64
+
65
+ Security/MarshalLoad:
66
+ Enabled: false
67
+
68
+ Style/AndOr:
69
+ Enabled: false
70
+
71
+ Style/CaseEquality:
72
+ Enabled: false
73
+
74
+ Style/Documentation:
75
+ Enabled: false
76
+
77
+ Style/DoubleNegation:
78
+ Enabled: false
79
+
80
+ Style/ExponentialNotation:
81
+ Enabled: true
82
+
83
+ Style/FrozenStringLiteralComment:
84
+ Enabled: false
85
+
86
+ Style/GuardClause:
87
+ Enabled: false
88
+
89
+ Style/HashEachMethods:
90
+ Enabled: true
91
+
92
+ Style/HashTransformKeys:
93
+ Enabled: true
94
+
95
+ Style/HashTransformValues:
96
+ Enabled: true
97
+
98
+ Style/IfUnlessModifier:
99
+ Enabled: false
100
+
101
+ Style/MultilineBlockChain:
102
+ Enabled: false
103
+
104
+ Style/MultilineIfModifier:
105
+ Enabled: false
106
+
107
+ Style/MutableConstant:
108
+ Enabled: false
109
+
110
+ Style/RedundantRegexpCharacterClass:
111
+ Enabled: true
112
+
113
+ Style/RedundantRegexpEscape:
114
+ Enabled: true
115
+
116
+ Style/RescueModifier:
117
+ Enabled: false
118
+
119
+ Style/RescueStandardError:
120
+ Enabled: false
121
+
122
+ Style/SlicingWithRange:
123
+ Enabled: true
124
+
125
+ Style/TrailingCommaInArguments:
126
+ EnforcedStyleForMultiline: comma
127
+
128
+ Style/TrailingCommaInArrayLiteral:
129
+ EnforcedStyleForMultiline: consistent_comma
130
+
131
+ Style/TrailingCommaInHashLiteral:
132
+ EnforcedStyleForMultiline: consistent_comma
133
+
134
+ Style/ZeroLengthPredicate:
135
+ Enabled: false
@@ -0,0 +1 @@
1
+ ruby-2.5.1
@@ -0,0 +1,15 @@
1
+ env:
2
+ global:
3
+ - CC_TEST_REPORTER_ID=35dd438da367278dc0d80cb7e21b3b0cd6eb207f60890a05c3fe1f0882981e64
4
+ language: ruby
5
+ rvm:
6
+ - 2.5.1
7
+ before_install: gem install bundler -v 1.17.3
8
+ before_script:
9
+ - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
10
+ - chmod +x ./cc-test-reporter
11
+ - ./cc-test-reporter before-build
12
+ script:
13
+ - bundle exec rspec
14
+ after_script:
15
+ - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
@@ -0,0 +1,49 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, and in the interest of
4
+ fostering an open and welcoming community, we pledge to respect all people who
5
+ contribute through reporting issues, posting feature requests, updating
6
+ documentation, submitting pull requests or patches, and other activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, ethnicity, age, religion, or nationality.
12
+
13
+ Examples of unacceptable behavior by participants include:
14
+
15
+ * The use of sexualized language or imagery
16
+ * Personal attacks
17
+ * Trolling or insulting/derogatory comments
18
+ * Public or private harassment
19
+ * Publishing others' private information, such as physical or electronic
20
+ addresses, without explicit permission
21
+ * Other unethical or unprofessional conduct
22
+
23
+ Project maintainers have the right and responsibility to remove, edit, or
24
+ reject comments, commits, code, wiki edits, issues, and other contributions
25
+ that are not aligned to this Code of Conduct, or to ban temporarily or
26
+ permanently any contributor for other behaviors that they deem inappropriate,
27
+ threatening, offensive, or harmful.
28
+
29
+ By adopting this Code of Conduct, project maintainers commit themselves to
30
+ fairly and consistently applying these principles to every aspect of managing
31
+ this project. Project maintainers who do not follow or enforce the Code of
32
+ Conduct may be permanently removed from the project team.
33
+
34
+ This code of conduct applies both within project spaces and in public spaces
35
+ when an individual is representing the project or its community.
36
+
37
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
38
+ reported by contacting a project maintainer at david@bloomfire.com. All
39
+ complaints will be reviewed and investigated and will result in a response that
40
+ is deemed necessary and appropriate to the circumstances. Maintainers are
41
+ obligated to maintain confidentiality with regard to the reporter of an
42
+ incident.
43
+
44
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
45
+ version 1.3.0, available at
46
+ [http://contributor-covenant.org/version/1/3/0/][version]
47
+
48
+ [homepage]: http://contributor-covenant.org
49
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 David McCullars
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,47 @@
1
+ # Ruby Splitta
2
+
3
+ * README: https://github.com/david-mccullars/ruby-splitta
4
+ * Documentation: http://www.rubydoc.info/github/david-mccullars/ruby-splitta
5
+ * Bug Reports: https://github.com/david-mccullars/ruby-splitta/issues
6
+
7
+
8
+ ## Status
9
+
10
+ [![Gem Version](https://badge.fury.io/rb/splitta.svg)](https://badge.fury.io/rb/splitta)
11
+ [![Travis Build Status](https://travis-ci.org/david-mccullars/ruby-splitta.svg?branch=master)](https://travis-ci.org/david-mccullars/ruby-splitta)
12
+ [![Code Climate](https://codeclimate.com/github/david-mccullars/ruby-splitta/badges/gpa.svg)](https://codeclimate.com/github/david-mccullars/ruby-splitta)
13
+ [![Test Coverage](https://codeclimate.com/github/david-mccullars/ruby-splitta/badges/coverage.svg)](https://codeclimate.com/github/david-mccullars/ruby-splitta/coverage)
14
+
15
+ ## Description
16
+
17
+ [Splitta](https://code.google.com/archive/p/splitta/) includes proper
18
+ tokenization and models for very high accuracy sentence boundary detection
19
+ (English only for now). The models are trained from Wall Street Journal news
20
+ combined with the Brown Corpus which is intended to be widely representative of
21
+ written English. Error rates on test news data are near 0.25%.
22
+
23
+ ## Installation
24
+
25
+ ```
26
+ gem install splitta
27
+ ```
28
+
29
+ ## Requirements
30
+
31
+ * Ruby 2.5.1 or higher
32
+
33
+ ## Usage
34
+
35
+ ```ruby
36
+ require 'splitta'
37
+
38
+ Splitta.sentences("Some text goes here.")
39
+ ```
40
+
41
+ ## License
42
+
43
+ MIT. See the `LICENSE` file.
44
+
45
+ ## References
46
+
47
+ > Dan Gillick, “Sentence Boundary Detection and the Problem with the U.S.” at NAACL 2009, http://dgillick.com/resource/sbd_naacl_2009.pdf
@@ -0,0 +1,55 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
7
+
8
+ require 'rdoc/task'
9
+ RDoc::Task.new do |rdoc|
10
+ rdoc.main = "README.md"
11
+ rdoc.rdoc_files.include("README.md", "lib/**/*.rb")
12
+ end
13
+
14
+ def unpickle(file) # Loads a gzip-compressed Python pickle through RubyPython and returns it as Ruby data.
15
+ require 'rubypython'
16
+ # NOTE(review): cPickle only exists in Python 2 - confirm which interpreter RubyPython binds to.
17
+ RubyPython.run do
18
+ cPickle = import 'cPickle'
19
+ gzip = import 'gzip'
20
+ # NOTE(review): io is not closed if cPickle.load raises.
21
+ io = gzip.open(file, 'rb')
22
+ data = cPickle.load(io)
23
+ io.close()
24
+ # Convert the Python object into its Ruby counterpart before leaving the block.
25
+ data = data.rubify
26
+ if data.keys.first.is_a?(RubyPython::Tuple)
27
+ data = data.each_with_object({}) do |(k, v), h|
28
+ h[k.to_a] = v # re-key with plain Ruby arrays instead of RubyPython tuples
29
+ end
30
+ end
31
+ # Explicit return from inside the RubyPython.run block hands the data to the caller of unpickle.
32
+ return data
33
+ end
34
+ end
35
+
36
+ def gzip_dump(file, obj)
37
+ require 'fileutils'
38
+ require 'zlib'
39
+
40
+ FileUtils.mkdir_p(File.dirname(file))
41
+ Zlib::GzipWriter.open(file) do |gz|
42
+ gz.write(Marshal.dump(obj))
43
+ end
44
+ end
45
+
46
+ task :unpickle, [:file] do |_t, args|
47
+ files = Dir[args[:file] || 'data/src/**/*'].select do |f|
48
+ File.file?(f)
49
+ end
50
+
51
+ files.each do |src|
52
+ puts "Unpickling #{src} ..."
53
+ gzip_dump(src.sub('/src/', '/'), unpickle(src))
54
+ end
55
+ end
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,21 @@
1
+ require 'singleton'
2
+ require 'zlib'
3
+
4
+ ##
5
+ # Provides convenience methods for splitting text into sentences.
6
+ #
7
+ # @see README
8
+ ##
9
+ module Splitta
10
+
11
+ autoload :Doc, 'splitta/doc'
12
+ autoload :Frag, 'splitta/frag'
13
+ autoload :Model, 'splitta/model'
14
+ autoload :VERSION, 'splitta/version'
15
+ autoload :WordTokenizer, 'splitta/word_tokenizer'
16
+
17
+ def self.sentences(text)
18
+ Doc.new(text, model: Model.instance).segments.map(&:strip)
19
+ end
20
+
21
+ end
@@ -0,0 +1,50 @@
1
+ #
2
+ # A Document points to a collection of Frags
3
+ #
4
+ module Splitta
5
+ class Doc
6
+
7
+ FRAG_SPLITTER = /
8
+ (
9
+ [.!?] # sentence end punctuation
10
+ (?:
11
+ (?:<.*>) # extra tag
12
+ |
13
+ [”"')\]}] # right-handed punctuation to retain
14
+ )*
15
+ \s+ # must have whitespace
16
+ )
17
+ /ux
18
+
19
+ SEGMENT_THRESHOLD = 0.5
20
+
21
+ attr_reader :frags
22
+
23
+ def initialize(text, model:)
24
+ @frags = []
25
+ text.split(FRAG_SPLITTER).each_slice(2) do |frag_text|
26
+ frag = Frag.new(frag_text.join, previous_frag: @frags.last)
27
+ @frags << frag
28
+ end
29
+ model.classify(self)
30
+ end
31
+
32
+ #
33
+ # output all the text, split according to predictions
34
+ #
35
+ def segments
36
+ Enumerator.new do |y|
37
+ io = StringIO.new
38
+ frags.each do |frag|
39
+ io << frag.orig
40
+ if frag.over?(SEGMENT_THRESHOLD)
41
+ y << io.string
42
+ io.string = ''
43
+ end
44
+ end
45
+ y << io.string unless io.string.empty?
46
+ end
47
+ end
48
+
49
+ end
50
+ end
@@ -0,0 +1,101 @@
1
+ #
2
+ # A fragment of text that ends with a possible sentence boundary
3
+ #
4
+ module Splitta
5
+ class Frag
6
+
7
+ include WordTokenizer
8
+
9
+ attr_reader :orig, :last_word, :next_word
10
+ attr_accessor :pred
11
+
12
+ def initialize(orig, previous_frag: nil)
13
+ words = clean(orig).split
14
+ previous_frag.next_word = words.first if previous_frag
15
+ @orig = orig
16
+ @last_word = words.last
17
+ end
18
+
19
+ # ... w1. (sb?) w2 ...
20
+ # Lazily enumerates the features of this frag, listed roughly in order of importance:
21
+ #
22
+ # (1) w1: last word of this frag (the one carrying the candidate boundary)
23
+ # (2) w2: the next word, if it exists
24
+ # (3) w1length: number of word characters in w1 (capped at 10)
25
+ # (4) w2cap: 'True' if w2 is capitalized, 'False' otherwise
26
+ # (5) both: w1 and w2
27
+ # (6) w1abbr: log count of w1 in training without a final period
28
+ # (7) w2lower: log count of w2 in training as lowercased
29
+ # (8) w1w2upper: w1 combined with whether w2 is capitalized
30
+ def features(model)
31
+ Enumerator.new do |y|
32
+ y << [:w1, w1]
33
+ y << [:w2, w2]
34
+ y << [:both, w1, w2]
35
+ # w1-derived features are emitted only when w1 passes alphabetic?
36
+ if alphabetic?(w1)
37
+ y << [:w1length, w1length]
38
+ y << [:w1abbr, w1abbr(model)]
39
+ end
40
+ # w2-derived features are emitted only when w2 passes alphabetic? (never for a nil w2)
41
+ if alphabetic?(w2)
42
+ y << [:w2cap, w2cap]
43
+ y << [:w2lower, w2lower(model)]
44
+ y << [:w1w2upper, w1, w2cap]
45
+ end
46
+ end
47
+ end
48
+
49
+ def over?(threshold)
50
+ !!pred && pred > threshold
51
+ end
52
+
53
+ protected
54
+
55
+ attr_writer :next_word
56
+
57
+ private
58
+
59
+ # normalize numbers, discard some punctuation that can be ambiguous
60
+ def clean(text)
61
+ text = tokenize(text)
62
+ text.gsub!(/[.,\d]*\d/, '<NUM>')
63
+ text.gsub!(%r{[^a-zA-Z0-9,.;:<>\-'/?!$% ]}, '')
64
+ text.gsub!('--', ' ')
65
+ text
66
+ end
67
+
68
+ def w1 # last_word with everything up to and including its first hyphen removed; nil when there is no last word
69
+ @w1 ||= last_word&.sub(/(^.+?-)/, '')
70
+ end
71
+
72
+ def w2 # next_word with its trailing "-suffix" (hyphen through end of line) removed; nil when there is no next word
73
+ @w2 ||= next_word&.sub(/(-.+?)$/, '')
74
+ end
75
+
76
+ def w1length
77
+ [10, w1.sub(/\W/, '').length].min
78
+ end
79
+
80
+ def w1abbr(model)
81
+ Math.log(1 + model.non_abbrs.fetch(w1.chop, 0.0)).to_i
82
+ end
83
+
84
+ def w2cap
85
+ upcase?(w2.chars.first) ? 'True' : 'False'
86
+ end
87
+
88
+ def w2lower(model)
89
+ Math.log(1 + model.lower_words.fetch(w2.downcase, 0.0)).to_i
90
+ end
91
+
92
+ def alphabetic?(str) # NOTE(review): unanchored - true if str contains any ASCII letter, '.' or space; false for nil
93
+ !!/[a-zA-Z. ]+/u.match(str)
94
+ end
95
+
96
+ def upcase?(str) # true when upcasing leaves str unchanged, so caseless characters such as '.' also qualify
97
+ str.upcase == str
98
+ end
99
+
100
+ end
101
+ end
@@ -0,0 +1,70 @@
1
+ #
2
+ # Naive Bayes model, with a few tweaks:
3
+ # - all feature types are pooled together for normalization (this might help
4
+ # because the independence assumption is so broken for our features)
5
+ # - smoothing: add 0.1 to all counts
6
+ # - priors are modified for better performance (this is mysterious but works much better)
7
+ #
8
+ module Splitta
9
+ class Model
10
+
11
+ include Singleton
12
+
13
+ LABELS = [0, 1]
14
+
15
+ attr_reader :feats, :lower_words, :non_abbrs, :prior_probs
16
+
17
+ def initialize
18
+ @feats = model_read(:feats)
19
+ @lower_words = model_read(:lower_words)
20
+ @non_abbrs = model_read(:non_abbrs)
21
+
22
+ @prior_probs = LABELS.each_with_object({}) do |label, h|
23
+ h[label] = feats[[label, '<prior>']]**4
24
+ end
25
+ end
26
+
27
+ def classify(doc)
28
+ doc.frags.each do |frag|
29
+ frag.pred = classify_one(frag)
30
+ end
31
+ end
32
+
33
+ def inspect # compact form showing only class and object_id; presumably to keep the loaded tables out of inspect output
34
+ "#<Splitta::Model:#{object_id}>"
35
+ end
36
+
37
+ private
38
+
39
+ def classify_one(frag)
40
+ probs = prior_probs.dup
41
+ LABELS.each do |label|
42
+ frag.features(self).each do |f|
43
+ key = [label, f.join('_')]
44
+ next unless feats.include?(key)
45
+
46
+ probs[label] *= feats[key]
47
+ end
48
+ end
49
+ normalize(probs).fetch(LABELS.last)
50
+ end
51
+
52
+ def normalize(probs)
53
+ total = probs.values.reduce(:+).to_f
54
+ probs.transform_values do |value|
55
+ value / total
56
+ end
57
+ end
58
+
59
+ def model_read(name)
60
+ Zlib::GzipReader.open(File.join(basedir, name.to_s)) do |gz|
61
+ Marshal.load(gz)
62
+ end
63
+ end
64
+
65
+ def basedir # absolute path of the data directory two levels above this file's directory
66
+ File.expand_path('../../data', __dir__)
67
+ end
68
+
69
+ end
70
+ end
@@ -0,0 +1,6 @@
1
+ module Splitta
2
+
3
+ # Current gem version
4
+ VERSION = '4.2.5'
5
+
6
+ end
@@ -0,0 +1,71 @@
1
+ #
2
+ # A list of (regexp, repl) pairs applied in sequence.
3
+ # The resulting string is returned as-is; callers split it on whitespace.
4
+ # (Adapted from the Punkt Word Tokenizer)
5
+ #
6
+ module Splitta
7
+ module WordTokenizer
8
+
9
+ TOKENIZE_REGEXPS = [
10
+ # uniform quotes
11
+ /
12
+ '' |
13
+ `` |
14
+ “ |
15
+
16
+ /ux, '"',
17
+
18
+ # Separate punctuation (except period) from words:
19
+ /(^|\s)(')/, '\1\2 ',
20
+ /(?<=[("`{\[:;&#*@])(.)/, ' \1', # left-hand punctuation
21
+ /(.)(?=[?!)";}\]*:@'])/, '\1 ', # right-hand punctuation
22
+ /(?<=[)}\]])(.)/, ' \1', # left-hand close paren
23
+ /(.)(?=[({\[])/, '\1 ', # right-hand open paren
24
+ /((^|\s)-)(?=[^-])/, '\1 ', # starting hyphen/minus
25
+
26
+ # Treat double-hyphen as one token:
27
+ /([^-])(--+)([^-])/, '\1 \2 \3',
28
+
29
+ # Only separate comma if space follows:
30
+ /(\s|^)(,)(?=(\S))/u, '\1\2 ',
31
+ /(.)(,)(\s|$)/u, '\1 \2\3',
32
+
33
+ # Combine dots separated by whitespace to be a single token:
34
+ /\.\s\.\s\./u, '...',
35
+
36
+ # Separate "No.6"
37
+ /([A-Za-z]\.)(\d+)/, '\1 \2',
38
+
39
+ # Separate words from ellipses
40
+ /([^.]|^)(\.{2,})(.?)/, '\1 \2 \3',
41
+ /(^|\s)(\.{2,})([^.\s])/u, '\1\2 \3',
42
+ /(^|\s)(\.{2,})([^.\s])/u, '\1 \2\3',
43
+
44
+ # fix %, $, &
45
+ /(\d)%/, '\1 %',
46
+ /\$(\.?\d)/, '$ \1',
47
+ /(\w)& (\w)/, '\1&\2',
48
+ /(\w\w+)&(\w\w+)/, '\1 & \2',
49
+
50
+ # fix (n 't) --> ( n't)
51
+ /n 't( |$)/, ' n\'t\1',
52
+ /N 'T( |$)/, ' N\'T\1',
53
+
54
+ # treebank tokenizer special words
55
+ /([Cc])annot/, '\1an not',
56
+ /\s+/, ' ',
57
+ ]
58
+
59
+ #
60
+ # Tokenize a string using the rules above
61
+ #
62
+ def tokenize(text)
63
+ text = text.dup
64
+ TOKENIZE_REGEXPS.each_slice(2) do |regexp, repl|
65
+ text.gsub!(regexp, repl)
66
+ end
67
+ text
68
+ end
69
+
70
+ end
71
+ end
@@ -0,0 +1,27 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'splitta/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'splitta'
7
+ spec.version = Splitta::VERSION
8
+ spec.authors = ['David McCullars']
9
+ spec.email = ['david.mccullars@gmail.com']
10
+
11
+ spec.summary = 'Implementation of Splitta in Ruby'
12
+ spec.description = 'Implementation of Splitta in Ruby. See https://code.google.com/archive/p/splitta/'
13
+ spec.homepage = 'https://github.com/david-mccullars/ruby-splitta'
14
+ spec.license = 'MIT'
15
+
16
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
+ spec.bindir = 'exe'
18
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
+ spec.require_paths = ['lib']
20
+
21
+ spec.add_development_dependency 'bundler'
22
+ spec.add_development_dependency 'rake'
23
+ spec.add_development_dependency 'rspec'
24
+ spec.add_development_dependency 'rubocop'
25
+ spec.add_development_dependency 'rubypython'
26
+ spec.add_development_dependency 'simplecov', '~> 0.17.0' # 0.18 not supported by code climate
27
+ end
metadata ADDED
@@ -0,0 +1,151 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: splitta
3
+ version: !ruby/object:Gem::Version
4
+ version: 4.2.5
5
+ platform: ruby
6
+ authors:
7
+ - David McCullars
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-06-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rubocop
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rubypython
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.17.0
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.17.0
97
+ description: Implementation of Splitta in Ruby. See https://code.google.com/archive/p/splitta/
98
+ email:
99
+ - david.mccullars@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".codeclimate.yml"
105
+ - ".gitignore"
106
+ - ".rubocop.yml"
107
+ - ".ruby-version"
108
+ - ".travis.yml"
109
+ - CODE_OF_CONDUCT.md
110
+ - Gemfile
111
+ - LICENSE
112
+ - README.md
113
+ - Rakefile
114
+ - data/feats
115
+ - data/lower_words
116
+ - data/non_abbrs
117
+ - data/src/feats
118
+ - data/src/lower_words
119
+ - data/src/non_abbrs
120
+ - lib/splitta.rb
121
+ - lib/splitta/doc.rb
122
+ - lib/splitta/frag.rb
123
+ - lib/splitta/model.rb
124
+ - lib/splitta/version.rb
125
+ - lib/splitta/word_tokenizer.rb
126
+ - splitta.gemspec
127
+ homepage: https://github.com/david-mccullars/ruby-splitta
128
+ licenses:
129
+ - MIT
130
+ metadata: {}
131
+ post_install_message:
132
+ rdoc_options: []
133
+ require_paths:
134
+ - lib
135
+ required_ruby_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ requirements: []
146
+ rubyforge_project:
147
+ rubygems_version: 2.7.6
148
+ signing_key:
149
+ specification_version: 4
150
+ summary: Implementation of Splitta in Ruby
151
+ test_files: []