tokeneyes 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: f648d93394449ac71d1d776d559e4b394edd08af
4
- data.tar.gz: 8ee6a9db2a1bf74b9bf317e225379a463b436012
2
+ SHA256:
3
+ metadata.gz: d3799154ab70a79b433e18bed482a94c8ba2267901ca40faf5471c8e6bb826e0
4
+ data.tar.gz: d0cdb23eff011131eb79e8618a23e82547b61000e51c3bc3a4ef12aa1eaf6b5d
5
5
  SHA512:
6
- metadata.gz: a94e9a12e5c9c301b791588593e858fc4ddb389fbe98b4e5dc80489819b0447ba2de289019a426c6075aea9c803f243269bcfcea3a440b18ff209f99c30a6f08
7
- data.tar.gz: cecaa693773ea68d7211e82ff0c1a59fdd1fc875ab64bc98709d4acc1b6946ee01e5348d7bd7931da9853d0b6f303be6fe00b1bdc424c20174d359e905b93199
6
+ metadata.gz: 24c6dc4315eaa00a4a8da5759e3227fc67a0ed476e2d47f2ed468acb9b24afa70fb3a871785870b48d5e3fffbd9c01624ef837d14ff7040d040bd4444f70627e
7
+ data.tar.gz: d3b48c2ed9b9b5d2f16f31ff90b37b282540813ceed4aa8a1db550627f9b478d02abb2e64494c3b2677f293d5026e808fa6966fd7c2b7eb9ed9fe1a37ff6e8fa
data/.gitignore CHANGED
@@ -7,5 +7,3 @@
7
7
  /pkg/
8
8
  /spec/reports/
9
9
  /tmp/
10
- .ruby-version
11
- .ruby-gemset
@@ -0,0 +1 @@
1
+ tokeneyes
@@ -0,0 +1 @@
1
+ 2.7
@@ -1,5 +1,6 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.2.3
4
- - jruby-9000
5
- before_install: gem install bundler -v 1.10.6
3
+ - 2.5
4
+ - 2.6
5
+ - 2.7
6
+ before_install: gem install bundler -v 2.1.4
data/Gemfile CHANGED
@@ -3,6 +3,6 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in tokeneyes.gemspec
4
4
  gemspec
5
5
 
6
- group :test do
7
- gem "codeclimate-test-reporter", require: nil
6
+ group :development do
7
+ gem "pry"
8
8
  end
data/README.md CHANGED
@@ -1,3 +1,7 @@
1
+ [![Code Climate](https://codeclimate.com/github/arsduo/tokeneyes/badges/gpa.svg)](https://codeclimate.com/github/arsduo/tokeneyes)
2
+ [![Test Coverage](https://codeclimate.com/github/arsduo/tokeneyes/badges/coverage.svg)](https://codeclimate.com/github/arsduo/tokeneyes/coverage)
3
+ [![Build Status](https://travis-ci.org/arsduo/tokeneyes.svg)](https://travis-ci.org/arsduo/tokeneyes)
4
+
1
5
  # Tokeneyes
2
6
 
3
7
  A string tokenizer designed to capture words with associated punctuation and sentence flow
@@ -1,3 +1,8 @@
1
+ v0.1.1
2
+ ======
3
+
4
+ * Use sets instead of regular expressions to check character types, cutting processing time by >50%
5
+
1
6
  v0.1.0
2
7
  ======
3
8
 
@@ -1,3 +1,3 @@
1
1
  module Tokeneyes
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -12,18 +12,28 @@ module Tokeneyes
12
12
  end
13
13
 
14
14
  # Definite word elements, those that can repeat as much as they want and always be words:
15
- # alphanumeric characters (including European symbols, all the Unicode blocks). If anyone has expertise on non-European
15
+ # alphanumeric characters (including some European symbols). If anyone has expertise on non-European
16
16
  # languages, I would love to add support for other character groups.
17
- # We include @ and # to support Twitter mentions, hashtags, and email addresses.
18
- WORD_ELEMENTS = /[\w\d\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\@\#]/
17
+ WORD_ELEMENTS = Set.new(
18
+ # Letters
19
+ ("A".."Z").to_a + ("a".."z").to_a +
20
+ # Numbers
21
+ ("0".."9").to_a +
22
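+ # Non-ASCII characters. NOTE: the first range below runs through \uD7FF, so it covers far more than European characters and already contains the two narrower ranges that follow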
23
+ ("\u00C0".."\uD7FF").to_a + ("\u00D8".."\u00F6").to_a + ("\u00F8".."\u00FC").to_a +
24
+ # Hashtag, @mention, and email support -- this will need to be made more intelligent later
25
+ ["@", "#"]
26
+ )
27
+
19
28
  # Defines a word boundary that also ends a unit of text.
20
- SENTENCE_BOUNDARY = /[\.;\?\!]/
29
+ SENTENCE_BOUNDARY = Set.new([".", ";", "?", "!"])
21
30
  # Possible word elements, those that mark a word boundary unless they're followed by a word
22
31
  # element:
23
- POSSIBLE_WORD_ELEMENTS = /[\.'\-]/
32
+ POSSIBLE_WORD_ELEMENTS = Set.new([".", "'", "-"])
24
33
  # We don't track all possible punctuation, just some. (In particular, we don't track those that
25
34
  # come in pairs, like parentheses and brackets, etc.)
26
- MEANINGFUL_PUNCTUATION = /[\.,\-;\!\?]/
35
+ # TODO add support for ellipses, interrobang, etc.
36
+ MEANINGFUL_PUNCTUATION = Set.new([".", ",", "-", ";", "!", "?"])
27
37
  # Everything else represents a word boundary.
28
38
 
29
39
  def word_finished?
@@ -50,11 +60,11 @@ module Tokeneyes
50
60
  # Which punctuation ended the word?
51
61
  def punctuation
52
62
  return nil unless word_finished?
53
- punctuation_candidate if punctuation_candidate.match(MEANINGFUL_PUNCTUATION)
63
+ punctuation_candidate if MEANINGFUL_PUNCTUATION.include?(punctuation_candidate)
54
64
  end
55
65
 
56
66
  def sentence_ended?
57
- !!(punctuation && punctuation.match(SENTENCE_BOUNDARY))
67
+ !!(punctuation && SENTENCE_BOUNDARY.include?(punctuation))
58
68
  end
59
69
 
60
70
  protected
@@ -69,18 +79,18 @@ module Tokeneyes
69
79
  end
70
80
 
71
81
  def current_char_is_word_element?
72
- current_char.match(WORD_ELEMENTS)
82
+ WORD_ELEMENTS.include?(current_char)
73
83
  end
74
84
 
75
85
  def previous_character_was_possible_boundary?
76
86
  # it's not a possible word boundary if the word hasn't yet started
77
- previous_char.match(POSSIBLE_WORD_ELEMENTS) && word_so_far.length > 0
87
+ POSSIBLE_WORD_ELEMENTS.include?(previous_char) && word_so_far.length > 0
78
88
  end
79
89
 
80
90
  def current_char_is_possible_boundary?
81
91
  # If the previous character was also a boundary, this one can't be as well -- we've ended the
82
92
  # word.
83
- current_char.match(POSSIBLE_WORD_ELEMENTS) && !previous_character_was_possible_boundary?
93
+ POSSIBLE_WORD_ELEMENTS.include?(current_char) && !previous_character_was_possible_boundary?
84
94
  end
85
95
 
86
96
  def punctuation_candidate
@@ -19,8 +19,7 @@ Gem::Specification.new do |spec|
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
20
  spec.require_paths = ["lib"]
21
21
 
22
- spec.add_development_dependency "bundler", "~> 1.10"
23
- spec.add_development_dependency "rake", "~> 10.0"
22
+ spec.add_development_dependency "rake", "~> 12.0"
24
23
  spec.add_development_dependency "rspec", "~> 3.3"
25
24
  spec.add_development_dependency "faker"
26
25
  end
metadata CHANGED
@@ -1,43 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokeneyes
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Koppel
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-28 00:00:00.000000000 Z
11
+ date: 2020-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: bundler
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '1.10'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '1.10'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: rake
29
15
  requirement: !ruby/object:Gem::Requirement
30
16
  requirements:
31
17
  - - "~>"
32
18
  - !ruby/object:Gem::Version
33
- version: '10.0'
19
+ version: '12.0'
34
20
  type: :development
35
21
  prerelease: false
36
22
  version_requirements: !ruby/object:Gem::Requirement
37
23
  requirements:
38
24
  - - "~>"
39
25
  - !ruby/object:Gem::Version
40
- version: '10.0'
26
+ version: '12.0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: rspec
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -75,6 +61,8 @@ extensions: []
75
61
  extra_rdoc_files: []
76
62
  files:
77
63
  - ".gitignore"
64
+ - ".ruby-gemset"
65
+ - ".ruby-version"
78
66
  - ".travis.yml"
79
67
  - CODE_OF_CONDUCT.md
80
68
  - Gemfile
@@ -110,8 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
98
  - !ruby/object:Gem::Version
111
99
  version: '0'
112
100
  requirements: []
113
- rubyforge_project:
114
- rubygems_version: 2.4.5.1
101
+ rubygems_version: 3.1.0.pre1
115
102
  signing_key:
116
103
  specification_version: 4
117
104
  summary: A simple string tokenizer designed to capture punctuation and sentence flow