pragmatic_tokenizer 3.1.0 → 3.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0feac0e3ed363c652e85fda4dd631b6f221da44af13ef9da24e9f0be5a0cdd38
4
- data.tar.gz: adc17ceadcf5749c1c9d1abce01f28658511a9bd251e25dda2ccf333f32cd355
3
+ metadata.gz: e65a28fbe1b06c5aaed25f8fc5bce8b46b2a8dbcaeddee07139ae733fe9ed51a
4
+ data.tar.gz: 767847d1d88d74c645763fcc569060e4e3c5beeb8e41364f437562eaeb796f08
5
5
  SHA512:
6
- metadata.gz: 99bbf93e63cf667703c2c386e51592873f9c70fa20df823d15ef69cf3714fef98c48d2428d451ce9c1433c3b1d23c5a317a2ecf1fc5ed6f9be04a66c8d268773
7
- data.tar.gz: ddb68cee3ea6cf0b4d2bbe581c99a37d1ecb34de5a2c0703073cc53b27d6520dbf6d9bdd811f0bfcb244120c91290040f269eefeb002c3440d8c9fa55a2d9671
6
+ metadata.gz: 1a88cf9354785dbd50890a088f0251eac8ea817c915b46afdf9bc82a7d47a1f60522761b23144861ffb5ebb4bf7a516a4e00a60a4e0d3af621f7618bb5c95ad7
7
+ data.tar.gz: 8afec02448e6552d5d84b9d296f69185d864c1617eb5643304a5e74132aa1b6d0262d3e2ca1f88bc682af2bb39320618ae7ae8a320b0617ac1b4883d6801909f
@@ -17,7 +17,7 @@ module PragmaticTokenizer
17
17
  private
18
18
 
19
19
  def remove_non_breaking_space!
20
- gsub!(Regex::NO_BREAK_SPACE, ''.freeze)
20
+ gsub!(Regex::NO_BREAK_SPACE, ' '.freeze)
21
21
  end
22
22
 
23
23
  def shift_various_characters!
@@ -27,6 +27,7 @@ module PragmaticTokenizer
27
27
  ASTERISK = /(?:\*+)/
28
28
  UNDERSCORE = /(?:_+)/
29
29
  HYPHEN_OR_UNDERSCORE = /(?:[-_])/
30
+ LONG_WORD_SPLIT = /(?:[-_\/—–])/
30
31
  PERIOD_AND_PRIOR = /(?:(.+\.))/
31
32
  PERIOD_ONLY = /(?:(\.))/
32
33
  CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
@@ -253,7 +253,7 @@ module PragmaticTokenizer
253
253
  return token if token.length <= @long_word_split
254
254
  return token if token =~ Regex::ONLY_HASHTAG_MENTION
255
255
  return token if token =~ Regex::DOMAIN_OR_EMAIL
256
- token.split(Regex::HYPHEN_OR_UNDERSCORE)
256
+ token.split(Regex::LONG_WORD_SPLIT)
257
257
  end
258
258
 
259
259
  def chosen_case(text)
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "3.1.0".freeze
2
+ VERSION = "3.2.1".freeze
3
3
  end
data/lib/unicode.rb ADDED
@@ -0,0 +1,9 @@
1
+ module Unicode
2
+ def self.downcase(text)
3
+ text.downcase
4
+ end
5
+
6
+ def self.capitalize(text)
7
+ text.capitalize
8
+ end
9
+ end
@@ -17,7 +17,6 @@ Gem::Specification.new do |spec|
17
17
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
18
  spec.require_paths = ["lib"]
19
19
 
20
- spec.add_runtime_dependency "unicode"
21
20
  spec.add_development_dependency "bundler", "~> 1.9"
22
21
  spec.add_development_dependency "rake", ">= 12.3.3"
23
22
  spec.add_development_dependency "rspec"
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
88
88
  expect(pt.tokenize(text)).to eq(["#ab-cd"])
89
89
  end
90
90
 
91
+ it 'tokenizes a string #015' do
92
+ text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
93
+ pt = PragmaticTokenizer::Tokenizer.new
94
+ expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
95
+ end
96
+
91
97
  it 'handles numbers with symbols 2' do
92
98
  text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
93
99
  pt = PragmaticTokenizer::Tokenizer.new
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
543
549
  )
544
550
  expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
545
551
  end
552
+ it 'tokenizes something with a slash' do
553
+ text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
554
+ pt = PragmaticTokenizer::Tokenizer.new(
555
+ long_word_split: 1
556
+ )
557
+ expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
558
+ end
546
559
  end
547
560
 
548
561
  context 'option (clean)' do
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0
4
+ version: 3.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-10 00:00:00.000000000 Z
11
+ date: 2024-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: unicode
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: bundler
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -142,6 +128,7 @@ files:
142
128
  - lib/pragmatic_tokenizer/regex.rb
143
129
  - lib/pragmatic_tokenizer/tokenizer.rb
144
130
  - lib/pragmatic_tokenizer/version.rb
131
+ - lib/unicode.rb
145
132
  - pragmatic_tokenizer.gemspec
146
133
  - spec/languages/bulgarian_spec.rb
147
134
  - spec/languages/deutsch_spec.rb
@@ -153,7 +140,7 @@ files:
153
140
  homepage: https://github.com/diasks2/pragmatic_tokenizer
154
141
  licenses: []
155
142
  metadata: {}
156
- post_install_message:
143
+ post_install_message:
157
144
  rdoc_options: []
158
145
  require_paths:
159
146
  - lib
@@ -168,9 +155,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
168
155
  - !ruby/object:Gem::Version
169
156
  version: '0'
170
157
  requirements: []
171
- rubyforge_project:
172
- rubygems_version: 2.7.6
173
- signing_key:
158
+ rubygems_version: 3.3.26
159
+ signing_key:
174
160
  specification_version: 4
175
161
  summary: A multilingual tokenizer
176
162
  test_files: