tokeneyes 0.1.0 → 0.1.1
- checksums.yaml +5 -5
- data/.gitignore +0 -2
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -3
- data/Gemfile +2 -2
- data/README.md +4 -0
- data/changelog.md +5 -0
- data/lib/tokeneyes/version.rb +1 -1
- data/lib/tokeneyes/word_builder.rb +21 -11
- data/tokeneyes.gemspec +1 -2
- metadata +7 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-
-  metadata.gz:
-  data.tar.gz:
+SHA256:
+  metadata.gz: d3799154ab70a79b433e18bed482a94c8ba2267901ca40faf5471c8e6bb826e0
+  data.tar.gz: d0cdb23eff011131eb79e8618a23e82547b61000e51c3bc3a4ef12aa1eaf6b5d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 24c6dc4315eaa00a4a8da5759e3227fc67a0ed476e2d47f2ed468acb9b24afa70fb3a871785870b48d5e3fffbd9c01624ef837d14ff7040d040bd4444f70627e
+  data.tar.gz: d3b48c2ed9b9b5d2f16f31ff90b37b282540813ceed4aa8a1db550627f9b478d02abb2e64494c3b2677f293d5026e808fa6966fd7c2b7eb9ed9fe1a37ff6e8fa
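For reference, the SHA256/SHA512 values above are digests of the metadata.gz and data.tar.gz entries packed inside the .gem archive (the keys shown in checksums.yaml), not of the .gem file itself. A minimal sketch of how to recompute them with Ruby's standard library; the local gem path is an assumption:

```ruby
require "digest"
require "rubygems/package"

# Hypothetical path to a locally downloaded copy of the gem.
gem_path = "tokeneyes-0.1.1.gem"

# A .gem file is a tar archive; checksums.yaml records digests of the
# metadata.gz and data.tar.gz entries it contains.
File.open(gem_path, "rb") do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    body = entry.read
    puts "#{entry.full_name}:"
    puts "  SHA256: #{Digest::SHA256.hexdigest(body)}"
    puts "  SHA512: #{Digest::SHA512.hexdigest(body)}"
  end
end
```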
data/.gitignore
CHANGED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
+tokeneyes
data/.ruby-version
ADDED
@@ -0,0 +1 @@
+2.7
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,3 +1,7 @@
+[![Code Climate](https://codeclimate.com/github/arsduo/tokeneyes/badges/gpa.svg)](https://codeclimate.com/github/arsduo/tokeneyes)
+[![Test Coverage](https://codeclimate.com/github/arsduo/tokeneyes/badges/coverage.svg)](https://codeclimate.com/github/arsduo/tokeneyes/coverage)
+[![Build Status](https://travis-ci.org/arsduo/tokeneyes.svg)](https://travis-ci.org/arsduo/tokeneyes)
+
 # Tokeneyes

 A string tokenizer designed to capture words with associated punctuation and sentence flow
data/changelog.md
CHANGED
data/lib/tokeneyes/version.rb
CHANGED
data/lib/tokeneyes/word_builder.rb
CHANGED
@@ -12,18 +12,28 @@ module Tokeneyes
     end

     # Definite word elements, those that can repeat as much as they want and always be words:
-    # alphanumeric characters (including European symbols
+    # alphanumeric characters (including some European symbols). If anyone has expertise on non-European
     # languages, I would love to add support for other character groups.
-
-
+    WORD_ELEMENTS = Set.new(
+      # Letters
+      ("A".."Z").to_a + ("a".."z").to_a +
+      # Numbers
+      ("0".."9").to_a +
+      # A subset of European characters
+      ("\u00C0".."\uD7FF").to_a + ("\u00D8".."\u00F6").to_a + ("\u00F8".."\u00FC").to_a +
+      # Hashtag, @mention, and email support -- this will need to be made more intelligent later
+      ["@", "#"]
+    )
+
     # Defines a word boundary that also ends a unit of text.
-    SENTENCE_BOUNDARY =
+    SENTENCE_BOUNDARY = Set.new([".", ";", "?", "!"])
     # Possible word elements, those that mark a word boundary unless they're followed by a word
     # element:
-    POSSIBLE_WORD_ELEMENTS =
+    POSSIBLE_WORD_ELEMENTS = Set.new([".", "'", "-"])
     # We don't track all possible punctuation, just some. (In particular, we don't track those that
     # come in pairs, like parentheses and brackets, etc.)
-
+    # TODO add support for ellipses, interrobang, etc.
+    MEANINGFUL_PUNCTUATION = Set.new([".", ",", "-", ";", "!", "?"])
     # Everything else represents a word boundary.

     def word_finished?
@@ -50,11 +60,11 @@ module Tokeneyes
     # Which punctuation ended the word?
     def punctuation
       return nil unless word_finished?
-      punctuation_candidate if
+      punctuation_candidate if MEANINGFUL_PUNCTUATION.include?(punctuation_candidate)
     end

     def sentence_ended?
-      !!(punctuation &&
+      !!(punctuation && SENTENCE_BOUNDARY.include?(punctuation))
     end

     protected
@@ -69,18 +79,18 @@ module Tokeneyes
     end

     def current_char_is_word_element?
-
+      WORD_ELEMENTS.include?(current_char)
     end

     def previous_character_was_possible_boundary?
       # it's not a possible word boundary if the word hasn't yet started
-
+      POSSIBLE_WORD_ELEMENTS.include?(previous_char) && word_so_far.length > 0
     end

     def current_char_is_possible_boundary?
       # If the previous character was also a boundary, this one can't be as well -- we've ended the
       # word.
-
+      POSSIBLE_WORD_ELEMENTS.include?(current_char) && !previous_character_was_possible_boundary?
     end

     def punctuation_candidate
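To make the intent of the new Set-based constants concrete, here is a small standalone Ruby sketch of the membership checks introduced in word_builder.rb above. It copies the literal values from the diff (European character ranges omitted for brevity); the `classify` helper and the sample string are illustrative only, not part of the gem's API.

```ruby
require "set"

# Constants copied from the 0.1.1 word_builder.rb diff above
# (European character ranges omitted for brevity).
WORD_ELEMENTS          = Set.new(("A".."Z").to_a + ("a".."z").to_a + ("0".."9").to_a + ["@", "#"])
SENTENCE_BOUNDARY      = Set.new([".", ";", "?", "!"])
POSSIBLE_WORD_ELEMENTS = Set.new([".", "'", "-"])
MEANINGFUL_PUNCTUATION = Set.new([".", ",", "-", ";", "!", "?"])

# Illustrative helper: classify a character roughly the way the
# word_builder.rb predicates do.
def classify(char)
  if WORD_ELEMENTS.include?(char)
    :word_element
  elsif POSSIBLE_WORD_ELEMENTS.include?(char)
    :possible_boundary # stays inside the word only if a word element follows
  elsif MEANINGFUL_PUNCTUATION.include?(char)
    :punctuation
  else
    :boundary
  end
end

"Dr. Who?".each_char do |char|
  puts format("%-4p %-18s sentence end: %s",
              char, classify(char), SENTENCE_BOUNDARY.include?(char))
end
```

Per the comments in the diff, a character in POSSIBLE_WORD_ELEMENTS only remains part of a word when a word element follows it (e.g., the apostrophe in a contraction); otherwise it marks a word boundary, and if it is also in SENTENCE_BOUNDARY it ends the sentence as well.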
data/tokeneyes.gemspec
CHANGED
@@ -19,8 +19,7 @@ Gem::Specification.new do |spec|
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]

-  spec.add_development_dependency "
-  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rake", "~> 12.0"
   spec.add_development_dependency "rspec", "~> 3.3"
   spec.add_development_dependency "faker"
 end
metadata
CHANGED
@@ -1,43 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: tokeneyes
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.1
 platform: ruby
 authors:
 - Alex Koppel
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2020-01-22 00:00:00.000000000 Z
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: bundler
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.10'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.10'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '12.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '12.0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -75,6 +61,8 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".ruby-gemset"
+- ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
 - Gemfile
@@ -110,8 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-
-rubygems_version: 2.4.5.1
+rubygems_version: 3.1.0.pre1
 signing_key:
 specification_version: 4
 summary: A simple string tokenizer designed to capture punctuation and sentence flow