tokeneyes 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +0 -2
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -3
- data/Gemfile +2 -2
- data/README.md +4 -0
- data/changelog.md +5 -0
- data/lib/tokeneyes/version.rb +1 -1
- data/lib/tokeneyes/word_builder.rb +21 -11
- data/tokeneyes.gemspec +1 -2
- metadata +7 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d3799154ab70a79b433e18bed482a94c8ba2267901ca40faf5471c8e6bb826e0
|
4
|
+
data.tar.gz: d0cdb23eff011131eb79e8618a23e82547b61000e51c3bc3a4ef12aa1eaf6b5d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 24c6dc4315eaa00a4a8da5759e3227fc67a0ed476e2d47f2ed468acb9b24afa70fb3a871785870b48d5e3fffbd9c01624ef837d14ff7040d040bd4444f70627e
|
7
|
+
data.tar.gz: d3b48c2ed9b9b5d2f16f31ff90b37b282540813ceed4aa8a1db550627f9b478d02abb2e64494c3b2677f293d5026e808fa6966fd7c2b7eb9ed9fe1a37ff6e8fa
|
data/.gitignore
CHANGED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
tokeneyes
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.7
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
[Code Climate](https://codeclimate.com/github/arsduo/tokeneyes)
|
2
|
+
[Test Coverage](https://codeclimate.com/github/arsduo/tokeneyes/coverage)
|
3
|
+
[Build Status](https://travis-ci.org/arsduo/tokeneyes)
|
4
|
+
|
1
5
|
# Tokeneyes
|
2
6
|
|
3
7
|
A string tokenizer designed to capture words with associated punctuation and sentence flow
|
data/changelog.md
CHANGED
data/lib/tokeneyes/version.rb
CHANGED
@@ -12,18 +12,28 @@ module Tokeneyes
|
|
12
12
|
end
|
13
13
|
|
14
14
|
# Definite word elements, those that can repeat as much as they want and always be words:
|
15
|
-
# alphanumeric characters (including European symbols
|
15
|
+
# alphanumeric characters (including some European symbols). If anyone has expertise on non-European
|
16
16
|
# languages, I would love to add support for other character groups.
|
17
|
-
|
18
|
-
|
17
|
+
WORD_ELEMENTS = Set.new(
|
18
|
+
# Letters
|
19
|
+
("A".."Z").to_a + ("a".."z").to_a +
|
20
|
+
# Numbers
|
21
|
+
("0".."9").to_a +
|
22
|
+
# A subset of European characters
|
23
|
+
("\u00C0".."\uD7FF").to_a + ("\u00D8".."\u00F6").to_a + ("\u00F8".."\u00FC").to_a +
|
24
|
+
# Hashtag, @mention, and email support -- this will need to be made more intelligent later
|
25
|
+
["@", "#"]
|
26
|
+
)
|
27
|
+
|
19
28
|
# Defines a word boundary that also ends a unit of text.
|
20
|
-
SENTENCE_BOUNDARY =
|
29
|
+
SENTENCE_BOUNDARY = Set.new([".", ";", "?", "!"])
|
21
30
|
# Possible word elements, those that mark a word boundary unless they're followed by a word
|
22
31
|
# element:
|
23
|
-
POSSIBLE_WORD_ELEMENTS =
|
32
|
+
POSSIBLE_WORD_ELEMENTS = Set.new([".", "'", "-"])
|
24
33
|
# We don't track all possible punctuation, just some. (In particular, we don't track those that
|
25
34
|
# come in pairs, like parentheses and brackets, etc.)
|
26
|
-
|
35
|
+
# TODO add support for ellipses, interrobang, etc.
|
36
|
+
MEANINGFUL_PUNCTUATION = Set.new([".", ",", "-", ";", "!", "?"])
|
27
37
|
# Everything else represents a word boundary.
|
28
38
|
|
29
39
|
def word_finished?
|
@@ -50,11 +60,11 @@ module Tokeneyes
|
|
50
60
|
# Which punctuation ended the word?
|
51
61
|
def punctuation
|
52
62
|
return nil unless word_finished?
|
53
|
-
punctuation_candidate if
|
63
|
+
punctuation_candidate if MEANINGFUL_PUNCTUATION.include?(punctuation_candidate)
|
54
64
|
end
|
55
65
|
|
56
66
|
def sentence_ended?
|
57
|
-
!!(punctuation &&
|
67
|
+
!!(punctuation && SENTENCE_BOUNDARY.include?(punctuation))
|
58
68
|
end
|
59
69
|
|
60
70
|
protected
|
@@ -69,18 +79,18 @@ module Tokeneyes
|
|
69
79
|
end
|
70
80
|
|
71
81
|
def current_char_is_word_element?
|
72
|
-
|
82
|
+
WORD_ELEMENTS.include?(current_char)
|
73
83
|
end
|
74
84
|
|
75
85
|
def previous_character_was_possible_boundary?
|
76
86
|
# it's not a possible word boundary if the word hasn't yet started
|
77
|
-
|
87
|
+
POSSIBLE_WORD_ELEMENTS.include?(previous_char) && word_so_far.length > 0
|
78
88
|
end
|
79
89
|
|
80
90
|
def current_char_is_possible_boundary?
|
81
91
|
# If the previous character was also a boundary, this one can't be as well -- we've ended the
|
82
92
|
# word.
|
83
|
-
|
93
|
+
POSSIBLE_WORD_ELEMENTS.include?(current_char) && !previous_character_was_possible_boundary?
|
84
94
|
end
|
85
95
|
|
86
96
|
def punctuation_candidate
|
data/tokeneyes.gemspec
CHANGED
@@ -19,8 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
20
|
spec.require_paths = ["lib"]
|
21
21
|
|
22
|
-
spec.add_development_dependency "bundler", "~> 1.10"
|
23
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
22
|
+
spec.add_development_dependency "rake", "~> 12.0"
|
24
23
|
spec.add_development_dependency "rspec", "~> 3.3"
|
25
24
|
spec.add_development_dependency "faker"
|
26
25
|
end
|
metadata
CHANGED
@@ -1,43 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokeneyes
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Koppel
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.10'
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '1.10'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: rake
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
30
16
|
requirements:
|
31
17
|
- - "~>"
|
32
18
|
- !ruby/object:Gem::Version
|
33
|
-
version: '10.0'
|
19
|
+
version: '12.0'
|
34
20
|
type: :development
|
35
21
|
prerelease: false
|
36
22
|
version_requirements: !ruby/object:Gem::Requirement
|
37
23
|
requirements:
|
38
24
|
- - "~>"
|
39
25
|
- !ruby/object:Gem::Version
|
40
|
-
version: '10.0'
|
26
|
+
version: '12.0'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: rspec
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -75,6 +61,8 @@ extensions: []
|
|
75
61
|
extra_rdoc_files: []
|
76
62
|
files:
|
77
63
|
- ".gitignore"
|
64
|
+
- ".ruby-gemset"
|
65
|
+
- ".ruby-version"
|
78
66
|
- ".travis.yml"
|
79
67
|
- CODE_OF_CONDUCT.md
|
80
68
|
- Gemfile
|
@@ -110,8 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
110
98
|
- !ruby/object:Gem::Version
|
111
99
|
version: '0'
|
112
100
|
requirements: []
|
113
|
-
|
114
|
-
rubygems_version: 2.4.5.1
|
101
|
+
rubygems_version: 3.1.0.pre1
|
115
102
|
signing_key:
|
116
103
|
specification_version: 4
|
117
104
|
summary: A simple string tokenizer designed to capture punctuation and sentence flow
|