pragmatic_tokenizer 3.1.0 → 3.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0feac0e3ed363c652e85fda4dd631b6f221da44af13ef9da24e9f0be5a0cdd38
4
- data.tar.gz: adc17ceadcf5749c1c9d1abce01f28658511a9bd251e25dda2ccf333f32cd355
3
+ metadata.gz: 15de4932ae8a9e1d96e42552acb4b8247737fc0b343dab265b5e50ca6e5d9d78
4
+ data.tar.gz: 378e5853911490a38b0fbec242c7e1a95b9e8d4f3499ac59633ea7494d90a127
5
5
  SHA512:
6
- metadata.gz: 99bbf93e63cf667703c2c386e51592873f9c70fa20df823d15ef69cf3714fef98c48d2428d451ce9c1433c3b1d23c5a317a2ecf1fc5ed6f9be04a66c8d268773
7
- data.tar.gz: ddb68cee3ea6cf0b4d2bbe581c99a37d1ecb34de5a2c0703073cc53b27d6520dbf6d9bdd811f0bfcb244120c91290040f269eefeb002c3440d8c9fa55a2d9671
6
+ metadata.gz: 9ae171101502bc657e8a3c61a0fdf7f22d8aae99957ddebacc0048327882f4f3eb1435bc67b566c0550a36cc0eb818cd9a6f96c393554d2a06f1212cb86f1bd8
7
+ data.tar.gz: 7158572d7496732af4004c5684373df22786c0bdb13564c7a0865a20394d2d750191b84d42e8025c68e11eb1143c60893f92ab7bb251bbe1d9fa4918482eedb7
@@ -17,7 +17,7 @@ module PragmaticTokenizer
17
17
  private
18
18
 
19
19
  def remove_non_breaking_space!
20
- gsub!(Regex::NO_BREAK_SPACE, ''.freeze)
20
+ gsub!(Regex::NO_BREAK_SPACE, ' '.freeze)
21
21
  end
22
22
 
23
23
  def shift_various_characters!
@@ -27,6 +27,7 @@ module PragmaticTokenizer
27
27
  ASTERISK = /(?:\*+)/
28
28
  UNDERSCORE = /(?:_+)/
29
29
  HYPHEN_OR_UNDERSCORE = /(?:[-_])/
30
+ LONG_WORD_SPLIT = /(?:[-_\/—–])/
30
31
  PERIOD_AND_PRIOR = /(?:(.+\.))/
31
32
  PERIOD_ONLY = /(?:(\.))/
32
33
  CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
@@ -253,7 +253,7 @@ module PragmaticTokenizer
253
253
  return token if token.length <= @long_word_split
254
254
  return token if token =~ Regex::ONLY_HASHTAG_MENTION
255
255
  return token if token =~ Regex::DOMAIN_OR_EMAIL
256
- token.split(Regex::HYPHEN_OR_UNDERSCORE)
256
+ token.split(Regex::LONG_WORD_SPLIT)
257
257
  end
258
258
 
259
259
  def chosen_case(text)
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "3.1.0".freeze
2
+ VERSION = "3.2.0".freeze
3
3
  end
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
88
88
  expect(pt.tokenize(text)).to eq(["#ab-cd"])
89
89
  end
90
90
 
91
+ it 'tokenizes a string #015' do
92
+ text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
93
+ pt = PragmaticTokenizer::Tokenizer.new
94
+ expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
95
+ end
96
+
91
97
  it 'handles numbers with symbols 2' do
92
98
  text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
93
99
  pt = PragmaticTokenizer::Tokenizer.new
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
543
549
  )
544
550
  expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
545
551
  end
552
+ it 'tokenizes something with a slash' do
553
+ text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
554
+ pt = PragmaticTokenizer::Tokenizer.new(
555
+ long_word_split: 1
556
+ )
557
+ expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
558
+ end
546
559
  end
547
560
 
548
561
  context 'option (clean)' do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0
4
+ version: 3.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-10 00:00:00.000000000 Z
11
+ date: 2020-11-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -153,7 +153,7 @@ files:
153
153
  homepage: https://github.com/diasks2/pragmatic_tokenizer
154
154
  licenses: []
155
155
  metadata: {}
156
- post_install_message:
156
+ post_install_message:
157
157
  rdoc_options: []
158
158
  require_paths:
159
159
  - lib
@@ -168,9 +168,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
168
168
  - !ruby/object:Gem::Version
169
169
  version: '0'
170
170
  requirements: []
171
- rubyforge_project:
171
+ rubyforge_project:
172
172
  rubygems_version: 2.7.6
173
- signing_key:
173
+ signing_key:
174
174
  specification_version: 4
175
175
  summary: A multilingual tokenizer
176
176
  test_files: