tokeneyes 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA1:
- metadata.gz: f648d93394449ac71d1d776d559e4b394edd08af
- data.tar.gz: 8ee6a9db2a1bf74b9bf317e225379a463b436012
+ SHA256:
+ metadata.gz: d3799154ab70a79b433e18bed482a94c8ba2267901ca40faf5471c8e6bb826e0
+ data.tar.gz: d0cdb23eff011131eb79e8618a23e82547b61000e51c3bc3a4ef12aa1eaf6b5d
  SHA512:
- metadata.gz: a94e9a12e5c9c301b791588593e858fc4ddb389fbe98b4e5dc80489819b0447ba2de289019a426c6075aea9c803f243269bcfcea3a440b18ff209f99c30a6f08
- data.tar.gz: cecaa693773ea68d7211e82ff0c1a59fdd1fc875ab64bc98709d4acc1b6946ee01e5348d7bd7931da9853d0b6f303be6fe00b1bdc424c20174d359e905b93199
+ metadata.gz: 24c6dc4315eaa00a4a8da5759e3227fc67a0ed476e2d47f2ed468acb9b24afa70fb3a871785870b48d5e3fffbd9c01624ef837d14ff7040d040bd4444f70627e
+ data.tar.gz: d3b48c2ed9b9b5d2f16f31ff90b37b282540813ceed4aa8a1db550627f9b478d02abb2e64494c3b2677f293d5026e808fa6966fd7c2b7eb9ed9fe1a37ff6e8fa
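The checksum file switches from SHA1 to SHA256 digests (the SHA512 entries are regenerated for the new release). A minimal sketch, not part of the gem, of checking a downloaded archive against one of the SHA256 values above; the local file path is hypothetical:

    require "digest"

    # Expected digest for data.tar.gz, copied from checksums.yaml above
    expected = "d0cdb23eff011131eb79e8618a23e82547b61000e51c3bc3a4ef12aa1eaf6b5d"
    # Hypothetical path to the downloaded archive on disk
    actual = Digest::SHA256.file("data.tar.gz").hexdigest

    puts(actual == expected ? "checksum matches" : "checksum mismatch")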
data/.gitignore CHANGED
@@ -7,5 +7,3 @@
  /pkg/
  /spec/reports/
  /tmp/
- .ruby-version
- .ruby-gemset
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
+ tokeneyes
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ 2.7
data/.travis.yml CHANGED
@@ -1,5 +1,6 @@
  language: ruby
  rvm:
- - 2.2.3
- - jruby-9000
- before_install: gem install bundler -v 1.10.6
+ - 2.5
+ - 2.6
+ - 2.7
+ before_install: gem install bundler -v 2.1.4
data/Gemfile CHANGED
@@ -3,6 +3,6 @@ source 'https://rubygems.org'
  # Specify your gem's dependencies in tokeneyes.gemspec
  gemspec

- group :test do
- gem "codeclimate-test-reporter", require: nil
+ group :development do
+ gem "pry"
  end
data/README.md CHANGED
@@ -1,3 +1,7 @@
+ [![Code Climate](https://codeclimate.com/github/arsduo/tokeneyes/badges/gpa.svg)](https://codeclimate.com/github/arsduo/tokeneyes)
+ [![Test Coverage](https://codeclimate.com/github/arsduo/tokeneyes/badges/coverage.svg)](https://codeclimate.com/github/arsduo/tokeneyes/coverage)
+ [![Build Status](https://travis-ci.org/arsduo/tokeneyes.svg)](https://travis-ci.org/arsduo/tokeneyes)
+
  # Tokeneyes

  A string tokenizer designed to capture words with associated punctuation and sentence flow
@@ -1,3 +1,8 @@
+ v0.1.1
+ ======
+
+ * Use sets instead of regular expressions to check character types, cutting processing time by >50%
+
  v0.1.0
  ======

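The v0.1.1 entry above refers to replacing per-character regular-expression matches with Set membership checks (visible in the hunks below). A rough, self-contained sketch of that comparison, using simplified stand-in constants rather than the gem's own:

    require "set"
    require "benchmark"

    # Simplified stand-ins for the gem's character-class constants
    WORD_ELEMENTS_REGEX = /[\w\d@#]/
    WORD_ELEMENTS_SET   = Set.new(("A".."Z").to_a + ("a".."z").to_a + ("0".."9").to_a + ["@", "#"])

    chars = "The quick brown fox, jumping over 12 lazy dogs!".chars * 10_000

    Benchmark.bm(5) do |bm|
      bm.report("regex") { chars.each { |c| c.match(WORD_ELEMENTS_REGEX) } }
      bm.report("set")   { chars.each { |c| WORD_ELEMENTS_SET.include?(c) } }
    end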
@@ -1,3 +1,3 @@
  module Tokeneyes
- VERSION = "0.1.0"
+ VERSION = "0.1.1"
  end
@@ -12,18 +12,28 @@ module Tokeneyes
  end

  # Definite word elements, those that can repeat as much as they want and always be words:
- # alphanumeric characters (including European symbols, all the Unicode blocks). If anyone has expertise on non-European
+ # alphanumeric characters (including some European symbols). If anyone has expertise on non-European
  # languages, I would love to add support for other character groups.
- # We include @ and # to support Twitter mentions, hashtags, and email addresses.
- WORD_ELEMENTS = /[\w\d\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\@\#]/
+ WORD_ELEMENTS = Set.new(
+ # Letters
+ ("A".."Z").to_a + ("a".."z").to_a +
+ # Numbers
+ ("0".."9").to_a +
+ # A subset of European characters
+ ("\u00C0".."\uD7FF").to_a + ("\u00D8".."\u00F6").to_a + ("\u00F8".."\u00FC").to_a +
+ # Hashtag, @mention, and email support -- this will need to be made more intelligent later
+ ["@", "#"]
+ )
+
  # Defines a word boundary that also ends a unit of text.
- SENTENCE_BOUNDARY = /[\.;\?\!]/
+ SENTENCE_BOUNDARY = Set.new([".", ";", "?", "!"])
  # Possible word elements, those that mark a word boundary unless they're followed by a word
  # element:
- POSSIBLE_WORD_ELEMENTS = /[\.'\-]/
+ POSSIBLE_WORD_ELEMENTS = Set.new([".", "'", "-"])
  # We don't track all possible punctuation, just some. (In particular, we don't track those that
  # come in pairs, like parentheses and brackets, etc.)
- MEANINGFUL_PUNCTUATION = /[\.,\-;\!\?]/
+ # TODO add support for ellipses, interrobang, etc.
+ MEANINGFUL_PUNCTUATION = Set.new([".", ",", "-", ";", "!", "?"])
  # Everything else represents a word boundary.

  def word_finished?
@@ -50,11 +60,11 @@ module Tokeneyes
  # Which punctuation ended the word?
  def punctuation
  return nil unless word_finished?
- punctuation_candidate if punctuation_candidate.match(MEANINGFUL_PUNCTUATION)
+ punctuation_candidate if MEANINGFUL_PUNCTUATION.include?(punctuation_candidate)
  end

  def sentence_ended?
- !!(punctuation && punctuation.match(SENTENCE_BOUNDARY))
+ !!(punctuation && SENTENCE_BOUNDARY.include?(punctuation))
  end

  protected
@@ -69,18 +79,18 @@ module Tokeneyes
  end

  def current_char_is_word_element?
- current_char.match(WORD_ELEMENTS)
+ WORD_ELEMENTS.include?(current_char)
  end

  def previous_character_was_possible_boundary?
  # it's not a possible word boundary if the word hasn't yet started
- previous_char.match(POSSIBLE_WORD_ELEMENTS) && word_so_far.length > 0
+ POSSIBLE_WORD_ELEMENTS.include?(previous_char) && word_so_far.length > 0
  end

  def current_char_is_possible_boundary?
  # If the previous character was also a boundary, this one can't be as well -- we've ended the
  # word.
- current_char.match(POSSIBLE_WORD_ELEMENTS) && !previous_character_was_possible_boundary?
+ POSSIBLE_WORD_ELEMENTS.include?(current_char) && !previous_character_was_possible_boundary?
  end

  def punctuation_candidate
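A behavioral note on the hunks above: the old regex constants matched anywhere inside a string, while Set#include? only answers exact membership for single-character strings, which is what current_char, previous_char, and punctuation_candidate supply. Illustrative only, reusing values from the diff:

    require "set"

    SENTENCE_BOUNDARY_REGEX = /[\.;\?\!]/                     # pre-0.1.1 style
    SENTENCE_BOUNDARY_SET   = Set.new([".", ";", "?", "!"])   # 0.1.1 style

    "!".match(SENTENCE_BOUNDARY_REGEX)    # => MatchData
    SENTENCE_BOUNDARY_SET.include?("!")   # => true

    "a!".match(SENTENCE_BOUNDARY_REGEX)   # => MatchData (matches inside a longer string)
    SENTENCE_BOUNDARY_SET.include?("a!")  # => false (only exact single characters are members)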
data/tokeneyes.gemspec CHANGED
@@ -19,8 +19,7 @@ Gem::Specification.new do |spec|
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

- spec.add_development_dependency "bundler", "~> 1.10"
- spec.add_development_dependency "rake", "~> 10.0"
+ spec.add_development_dependency "rake", "~> 12.0"
  spec.add_development_dependency "rspec", "~> 3.3"
  spec.add_development_dependency "faker"
  end
metadata CHANGED
@@ -1,43 +1,29 @@
  --- !ruby/object:Gem::Specification
  name: tokeneyes
  version: !ruby/object:Gem::Version
- version: 0.1.0
+ version: 0.1.1
  platform: ruby
  authors:
  - Alex Koppel
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2015-09-28 00:00:00.000000000 Z
+ date: 2020-01-22 00:00:00.000000000 Z
  dependencies:
- - !ruby/object:Gem::Dependency
- name: bundler
- requirement: !ruby/object:Gem::Requirement
- requirements:
- - - "~>"
- - !ruby/object:Gem::Version
- version: '1.10'
- type: :development
- prerelease: false
- version_requirements: !ruby/object:Gem::Requirement
- requirements:
- - - "~>"
- - !ruby/object:Gem::Version
- version: '1.10'
  - !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.0'
+ version: '12.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '10.0'
+ version: '12.0'
  - !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
@@ -75,6 +61,8 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".gitignore"
+ - ".ruby-gemset"
+ - ".ruby-version"
  - ".travis.yml"
  - CODE_OF_CONDUCT.md
  - Gemfile
@@ -110,8 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.4.5.1
+ rubygems_version: 3.1.0.pre1
  signing_key:
  specification_version: 4
  summary: A simple string tokenizer designed to capture punctuation and sentence flow