pragmatic_tokenizer 3.1.0 → 3.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0feac0e3ed363c652e85fda4dd631b6f221da44af13ef9da24e9f0be5a0cdd38
-  data.tar.gz: adc17ceadcf5749c1c9d1abce01f28658511a9bd251e25dda2ccf333f32cd355
+  metadata.gz: 15de4932ae8a9e1d96e42552acb4b8247737fc0b343dab265b5e50ca6e5d9d78
+  data.tar.gz: 378e5853911490a38b0fbec242c7e1a95b9e8d4f3499ac59633ea7494d90a127
 SHA512:
-  metadata.gz: 99bbf93e63cf667703c2c386e51592873f9c70fa20df823d15ef69cf3714fef98c48d2428d451ce9c1433c3b1d23c5a317a2ecf1fc5ed6f9be04a66c8d268773
-  data.tar.gz: ddb68cee3ea6cf0b4d2bbe581c99a37d1ecb34de5a2c0703073cc53b27d6520dbf6d9bdd811f0bfcb244120c91290040f269eefeb002c3440d8c9fa55a2d9671
+  metadata.gz: 9ae171101502bc657e8a3c61a0fdf7f22d8aae99957ddebacc0048327882f4f3eb1435bc67b566c0550a36cc0eb818cd9a6f96c393554d2a06f1212cb86f1bd8
+  data.tar.gz: 7158572d7496732af4004c5684373df22786c0bdb13564c7a0865a20394d2d750191b84d42e8025c68e11eb1143c60893f92ab7bb251bbe1d9fa4918482eedb7
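The SHA256/SHA512 digests above cover the two archives packed inside the published .gem file. As a quick sanity check (a sketch, not an official verification flow; the local file path is illustrative), the digest of an extracted data.tar.gz can be compared against the value published here:

require 'digest'

# Compare a locally extracted archive against the published 3.2.0 digest
# (the 'data.tar.gz' path is illustrative, not part of the gem's API).
local = Digest::SHA256.hexdigest(File.read('data.tar.gz', mode: 'rb'))
local == '378e5853911490a38b0fbec242c7e1a95b9e8d4f3499ac59633ea7494d90a127'
# => true when the archive matches the published release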
@@ -17,7 +17,7 @@ module PragmaticTokenizer
     private

       def remove_non_breaking_space!
-        gsub!(Regex::NO_BREAK_SPACE, ''.freeze)
+        gsub!(Regex::NO_BREAK_SPACE, ' '.freeze)
       end

       def shift_various_characters!
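The pre-processor change above is a one-character fix with a visible effect: a NO-BREAK SPACE (U+00A0) is now replaced by a regular space instead of being deleted outright, so the words on either side of it stay separate. A minimal sketch of the difference, using plain String#gsub rather than the gem's internal method:

text = "hello\u00A0world"

text.gsub(/\u00A0/, '')   # 3.1.0 behaviour: "helloworld" (the two words fuse)
text.gsub(/\u00A0/, ' ')  # 3.2.0 behaviour: "hello world" (the boundary survives)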
@@ -27,6 +27,7 @@ module PragmaticTokenizer
     ASTERISK = /(?:\*+)/
     UNDERSCORE = /(?:_+)/
     HYPHEN_OR_UNDERSCORE = /(?:[-_])/
+    LONG_WORD_SPLIT = /(?:[-_\/—–])/
     PERIOD_AND_PRIOR = /(?:(.+\.))/
     PERIOD_ONLY = /(?:(\.))/
     CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
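The new LONG_WORD_SPLIT constant widens the old hyphen/underscore character class with the forward slash and the em and en dashes. A quick comparison of the two patterns on a token that only the new one can break apart (constants copied from the diff; the sample token is illustrative):

HYPHEN_OR_UNDERSCORE = /(?:[-_])/
LONG_WORD_SPLIT      = /(?:[-_\/—–])/

token = "EO/AA/M—F"
token.split(HYPHEN_OR_UNDERSCORE) # => ["EO/AA/M—F"] (no hyphen or underscore to match)
token.split(LONG_WORD_SPLIT)      # => ["EO", "AA", "M", "F"]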
@@ -253,7 +253,7 @@ module PragmaticTokenizer
       return token if token.length <= @long_word_split
       return token if token =~ Regex::ONLY_HASHTAG_MENTION
       return token if token =~ Regex::DOMAIN_OR_EMAIL
-      token.split(Regex::HYPHEN_OR_UNDERSCORE)
+      token.split(Regex::LONG_WORD_SPLIT)
     end

     def chosen_case(text)
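As the guard clauses above show, the long-word splitting step only applies to tokens longer than the long_word_split threshold, and it leaves hashtags, mentions, domains, and email addresses intact. Through the public API this looks roughly like the following (grounded in the specs further down; the input string is shortened for illustration):

require 'pragmatic_tokenizer'

# Split any token longer than one character on hyphens, underscores,
# slashes, and em/en dashes (the 3.2.0 separator set).
pt = PragmaticTokenizer::Tokenizer.new(long_word_split: 1)
pt.tokenize("EO/AA/M/F")
# 3.2.0 => ["eo", "aa", "m", "f"]
# In 3.1.0 the slash was not a split point here, so the token stayed whole.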
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "3.1.0".freeze
+  VERSION = "3.2.0".freeze
 end
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
       expect(pt.tokenize(text)).to eq(["#ab-cd"])
     end

+    it 'tokenizes a string #015' do
+      text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
+    end
+
     it 'handles numbers with symbols 2' do
       text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
       pt = PragmaticTokenizer::Tokenizer.new
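The new spec above also pins down the default behaviour: without the long_word_split option, hyphenated compounds such as "co-founded" stay single (downcased) tokens. Condensed from that spec:

require 'pragmatic_tokenizer'

# Default options: hyphens inside words are preserved, output is downcased.
PragmaticTokenizer::Tokenizer.new.tokenize("he co-founded Palantir")
# => ["he", "co-founded", "palantir"]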
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
       )
       expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
     end
+    it 'tokenizes something with a slash' do
+      text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
+      pt = PragmaticTokenizer::Tokenizer.new(
+        long_word_split: 1
+      )
+      expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
+    end
   end

   context 'option (clean)' do
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.1.0
+  version: 3.2.0
 platform: ruby
 authors:
 - Kevin S. Dias
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-10 00:00:00.000000000 Z
+date: 2020-11-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -153,7 +153,7 @@ files:
 homepage: https://github.com/diasks2/pragmatic_tokenizer
 licenses: []
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -168,9 +168,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubyforge_project:
+rubyforge_project:
 rubygems_version: 2.7.6
-signing_key:
+signing_key:
 specification_version: 4
 summary: A multilingual tokenizer
 test_files: